{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.585055643879173, "eval_steps": 500, "global_step": 5400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001589825119236884, "grad_norm": 3.795235013851999, "learning_rate": 0.0, "loss": 0.6801, "step": 1 }, { "epoch": 0.003179650238473768, "grad_norm": 6.028367183562747, "learning_rate": 5.0000000000000004e-08, "loss": 0.6741, "step": 2 }, { "epoch": 0.0047694753577106515, "grad_norm": 3.6732651811382016, "learning_rate": 1.0000000000000001e-07, "loss": 0.5347, "step": 3 }, { "epoch": 0.006359300476947536, "grad_norm": 2.473833847570131, "learning_rate": 1.5000000000000002e-07, "loss": 0.578, "step": 4 }, { "epoch": 0.00794912559618442, "grad_norm": 6.859032436531032, "learning_rate": 2.0000000000000002e-07, "loss": 0.698, "step": 5 }, { "epoch": 0.009538950715421303, "grad_norm": 2.548152524845228, "learning_rate": 2.5e-07, "loss": 0.5583, "step": 6 }, { "epoch": 0.011128775834658187, "grad_norm": 3.7082336084796736, "learning_rate": 3.0000000000000004e-07, "loss": 0.8574, "step": 7 }, { "epoch": 0.012718600953895072, "grad_norm": 5.6502460829957615, "learning_rate": 3.5000000000000004e-07, "loss": 1.3824, "step": 8 }, { "epoch": 0.014308426073131956, "grad_norm": 5.838557402656807, "learning_rate": 4.0000000000000003e-07, "loss": 0.5955, "step": 9 }, { "epoch": 0.01589825119236884, "grad_norm": 4.405485478830656, "learning_rate": 4.5e-07, "loss": 0.6537, "step": 10 }, { "epoch": 0.017488076311605722, "grad_norm": 4.352230388503188, "learning_rate": 5e-07, "loss": 0.7808, "step": 11 }, { "epoch": 0.019077901430842606, "grad_norm": 4.566091731189592, "learning_rate": 5.5e-07, "loss": 0.5023, "step": 12 }, { "epoch": 0.02066772655007949, "grad_norm": 4.693754198892459, "learning_rate": 6.000000000000001e-07, "loss": 0.6848, "step": 13 }, { "epoch": 0.022257551669316374, "grad_norm": 2.183453710717913, "learning_rate": 6.5e-07, "loss": 0.5395, "step": 14 }, { "epoch": 0.02384737678855326, "grad_norm": 3.478226082868467, "learning_rate": 7.000000000000001e-07, "loss": 0.569, "step": 15 }, { "epoch": 0.025437201907790145, "grad_norm": 3.9488669302935406, "learning_rate": 7.5e-07, "loss": 0.744, "step": 16 }, { "epoch": 0.02702702702702703, "grad_norm": 3.4256019814432914, "learning_rate": 8.000000000000001e-07, "loss": 0.4897, "step": 17 }, { "epoch": 0.028616852146263912, "grad_norm": 3.4582194594942677, "learning_rate": 8.5e-07, "loss": 0.7124, "step": 18 }, { "epoch": 0.030206677265500796, "grad_norm": 3.2847288868842215, "learning_rate": 9e-07, "loss": 0.5599, "step": 19 }, { "epoch": 0.03179650238473768, "grad_norm": 4.5365052729153375, "learning_rate": 9.500000000000001e-07, "loss": 0.6273, "step": 20 }, { "epoch": 0.033386327503974564, "grad_norm": 2.3454914931107202, "learning_rate": 1e-06, "loss": 0.654, "step": 21 }, { "epoch": 0.034976152623211444, "grad_norm": 13.017990148845971, "learning_rate": 1.0500000000000001e-06, "loss": 0.7806, "step": 22 }, { "epoch": 0.03656597774244833, "grad_norm": 3.0735310695295537, "learning_rate": 1.1e-06, "loss": 0.4908, "step": 23 }, { "epoch": 0.03815580286168521, "grad_norm": 4.4854562533514795, "learning_rate": 1.15e-06, "loss": 0.6324, "step": 24 }, { "epoch": 0.0397456279809221, "grad_norm": 3.724914090813323, "learning_rate": 1.2000000000000002e-06, "loss": 0.6541, "step": 25 }, { "epoch": 0.04133545310015898, "grad_norm": 2.864902725530838, "learning_rate": 1.2499999999999999e-06, "loss": 0.5107, "step": 26 }, { "epoch": 0.04292527821939587, "grad_norm": 3.661800421624066, "learning_rate": 1.3e-06, "loss": 0.7261, "step": 27 }, { "epoch": 0.04451510333863275, "grad_norm": 2.488482275129523, "learning_rate": 1.35e-06, "loss": 0.5254, "step": 28 }, { "epoch": 0.046104928457869634, "grad_norm": 3.91195440532004, "learning_rate": 1.4000000000000001e-06, "loss": 0.5203, "step": 29 }, { "epoch": 0.04769475357710652, "grad_norm": 3.33721026092274, "learning_rate": 1.45e-06, "loss": 0.5819, "step": 30 }, { "epoch": 0.0492845786963434, "grad_norm": 3.0107615084105688, "learning_rate": 1.5e-06, "loss": 0.5539, "step": 31 }, { "epoch": 0.05087440381558029, "grad_norm": 3.057329129212838, "learning_rate": 1.55e-06, "loss": 0.6725, "step": 32 }, { "epoch": 0.05246422893481717, "grad_norm": 3.138243445029786, "learning_rate": 1.6000000000000001e-06, "loss": 0.5199, "step": 33 }, { "epoch": 0.05405405405405406, "grad_norm": 3.1558323687632837, "learning_rate": 1.65e-06, "loss": 0.4547, "step": 34 }, { "epoch": 0.05564387917329094, "grad_norm": 3.6418286297368483, "learning_rate": 1.7e-06, "loss": 0.663, "step": 35 }, { "epoch": 0.057233704292527825, "grad_norm": 1.972435978752039, "learning_rate": 1.75e-06, "loss": 0.3549, "step": 36 }, { "epoch": 0.058823529411764705, "grad_norm": 10.456881966513196, "learning_rate": 1.8e-06, "loss": 0.6311, "step": 37 }, { "epoch": 0.06041335453100159, "grad_norm": 2.5858978442042484, "learning_rate": 1.85e-06, "loss": 0.4097, "step": 38 }, { "epoch": 0.06200317965023847, "grad_norm": 3.340287874361837, "learning_rate": 1.9000000000000002e-06, "loss": 0.636, "step": 39 }, { "epoch": 0.06359300476947535, "grad_norm": 3.88574573447017, "learning_rate": 1.95e-06, "loss": 0.5211, "step": 40 }, { "epoch": 0.06518282988871224, "grad_norm": 3.4345469234953385, "learning_rate": 2e-06, "loss": 0.4854, "step": 41 }, { "epoch": 0.06677265500794913, "grad_norm": 8.753581492987225, "learning_rate": 2.05e-06, "loss": 0.3323, "step": 42 }, { "epoch": 0.06836248012718601, "grad_norm": 1.8743545519752527, "learning_rate": 2.1000000000000002e-06, "loss": 0.3416, "step": 43 }, { "epoch": 0.06995230524642289, "grad_norm": 2.7848000736439076, "learning_rate": 2.15e-06, "loss": 0.4476, "step": 44 }, { "epoch": 0.07154213036565978, "grad_norm": 4.1767808962119615, "learning_rate": 2.2e-06, "loss": 0.6096, "step": 45 }, { "epoch": 0.07313195548489666, "grad_norm": 2.732515763981667, "learning_rate": 2.25e-06, "loss": 0.6896, "step": 46 }, { "epoch": 0.07472178060413355, "grad_norm": 2.4892134977113045, "learning_rate": 2.3e-06, "loss": 0.692, "step": 47 }, { "epoch": 0.07631160572337042, "grad_norm": 3.5722737710584833, "learning_rate": 2.3500000000000004e-06, "loss": 1.2525, "step": 48 }, { "epoch": 0.07790143084260731, "grad_norm": 2.58716321667536, "learning_rate": 2.4000000000000003e-06, "loss": 0.6116, "step": 49 }, { "epoch": 0.0794912559618442, "grad_norm": 2.0753360795189346, "learning_rate": 2.45e-06, "loss": 0.3557, "step": 50 }, { "epoch": 0.08108108108108109, "grad_norm": 2.4516850889788424, "learning_rate": 2.4999999999999998e-06, "loss": 0.5785, "step": 51 }, { "epoch": 0.08267090620031796, "grad_norm": 3.489855211431725, "learning_rate": 2.55e-06, "loss": 0.5671, "step": 52 }, { "epoch": 0.08426073131955485, "grad_norm": 2.576899528411801, "learning_rate": 2.6e-06, "loss": 0.3073, "step": 53 }, { "epoch": 0.08585055643879173, "grad_norm": 2.4708493861674907, "learning_rate": 2.65e-06, "loss": 0.3626, "step": 54 }, { "epoch": 0.08744038155802862, "grad_norm": 4.632028043287789, "learning_rate": 2.7e-06, "loss": 0.5277, "step": 55 }, { "epoch": 0.0890302066772655, "grad_norm": 3.7424543322608157, "learning_rate": 2.75e-06, "loss": 0.5012, "step": 56 }, { "epoch": 0.09062003179650238, "grad_norm": 2.0483528283718373, "learning_rate": 2.8000000000000003e-06, "loss": 0.394, "step": 57 }, { "epoch": 0.09220985691573927, "grad_norm": 1.9792144750603868, "learning_rate": 2.8500000000000002e-06, "loss": 0.3581, "step": 58 }, { "epoch": 0.09379968203497616, "grad_norm": 3.8982763422672018, "learning_rate": 2.9e-06, "loss": 0.4495, "step": 59 }, { "epoch": 0.09538950715421304, "grad_norm": 2.450485499301386, "learning_rate": 2.9499999999999997e-06, "loss": 0.3947, "step": 60 }, { "epoch": 0.09697933227344992, "grad_norm": 2.9115830161776977, "learning_rate": 3e-06, "loss": 0.5786, "step": 61 }, { "epoch": 0.0985691573926868, "grad_norm": 1.9944897401992767, "learning_rate": 3.05e-06, "loss": 0.3494, "step": 62 }, { "epoch": 0.10015898251192369, "grad_norm": 1.8572833024485675, "learning_rate": 3.1e-06, "loss": 0.4715, "step": 63 }, { "epoch": 0.10174880763116058, "grad_norm": 1.6569837912278225, "learning_rate": 3.15e-06, "loss": 0.3315, "step": 64 }, { "epoch": 0.10333863275039745, "grad_norm": 2.1197045009140187, "learning_rate": 3.2000000000000003e-06, "loss": 0.4436, "step": 65 }, { "epoch": 0.10492845786963434, "grad_norm": 1.7210866836040277, "learning_rate": 3.2500000000000002e-06, "loss": 0.3754, "step": 66 }, { "epoch": 0.10651828298887123, "grad_norm": 2.7349500613318862, "learning_rate": 3.3e-06, "loss": 0.4898, "step": 67 }, { "epoch": 0.10810810810810811, "grad_norm": 1.5380087808606269, "learning_rate": 3.35e-06, "loss": 0.3176, "step": 68 }, { "epoch": 0.10969793322734499, "grad_norm": 2.348187789302059, "learning_rate": 3.4e-06, "loss": 0.4243, "step": 69 }, { "epoch": 0.11128775834658187, "grad_norm": 2.3685160258091185, "learning_rate": 3.4500000000000004e-06, "loss": 0.4655, "step": 70 }, { "epoch": 0.11287758346581876, "grad_norm": 1.762841631604455, "learning_rate": 3.5e-06, "loss": 0.3365, "step": 71 }, { "epoch": 0.11446740858505565, "grad_norm": 1.9734055325732067, "learning_rate": 3.55e-06, "loss": 0.2708, "step": 72 }, { "epoch": 0.11605723370429252, "grad_norm": 2.3424700315917164, "learning_rate": 3.6e-06, "loss": 0.3157, "step": 73 }, { "epoch": 0.11764705882352941, "grad_norm": 4.29268894749181, "learning_rate": 3.65e-06, "loss": 0.4066, "step": 74 }, { "epoch": 0.1192368839427663, "grad_norm": 2.100104227775165, "learning_rate": 3.7e-06, "loss": 0.4176, "step": 75 }, { "epoch": 0.12082670906200318, "grad_norm": 2.772703843174882, "learning_rate": 3.75e-06, "loss": 0.4132, "step": 76 }, { "epoch": 0.12241653418124006, "grad_norm": 1.9051357535961146, "learning_rate": 3.8000000000000005e-06, "loss": 0.3785, "step": 77 }, { "epoch": 0.12400635930047695, "grad_norm": 4.015127912491486, "learning_rate": 3.8499999999999996e-06, "loss": 0.4117, "step": 78 }, { "epoch": 0.12559618441971382, "grad_norm": 1.533700791686892, "learning_rate": 3.9e-06, "loss": 0.2292, "step": 79 }, { "epoch": 0.1271860095389507, "grad_norm": 2.0713583752923928, "learning_rate": 3.9499999999999995e-06, "loss": 0.2906, "step": 80 }, { "epoch": 0.1287758346581876, "grad_norm": 4.150836854647218, "learning_rate": 4e-06, "loss": 0.5251, "step": 81 }, { "epoch": 0.13036565977742448, "grad_norm": 2.0329564716601447, "learning_rate": 4.05e-06, "loss": 0.3036, "step": 82 }, { "epoch": 0.13195548489666137, "grad_norm": 1.8454617656131709, "learning_rate": 4.1e-06, "loss": 0.4063, "step": 83 }, { "epoch": 0.13354531001589826, "grad_norm": 3.498503236315121, "learning_rate": 4.15e-06, "loss": 0.4475, "step": 84 }, { "epoch": 0.13513513513513514, "grad_norm": 2.3428010147959886, "learning_rate": 4.2000000000000004e-06, "loss": 0.2985, "step": 85 }, { "epoch": 0.13672496025437203, "grad_norm": 3.246869837216726, "learning_rate": 4.25e-06, "loss": 0.3859, "step": 86 }, { "epoch": 0.1383147853736089, "grad_norm": 6.907869958180637, "learning_rate": 4.3e-06, "loss": 0.5418, "step": 87 }, { "epoch": 0.13990461049284578, "grad_norm": 2.417911796670967, "learning_rate": 4.35e-06, "loss": 0.2827, "step": 88 }, { "epoch": 0.14149443561208266, "grad_norm": 3.614348948612056, "learning_rate": 4.4e-06, "loss": 0.3201, "step": 89 }, { "epoch": 0.14308426073131955, "grad_norm": 1.7733989273861601, "learning_rate": 4.450000000000001e-06, "loss": 0.2974, "step": 90 }, { "epoch": 0.14467408585055644, "grad_norm": 2.1336073944163494, "learning_rate": 4.5e-06, "loss": 0.2637, "step": 91 }, { "epoch": 0.14626391096979333, "grad_norm": 3.487206083713662, "learning_rate": 4.5500000000000005e-06, "loss": 0.3126, "step": 92 }, { "epoch": 0.1478537360890302, "grad_norm": 2.7020670510152143, "learning_rate": 4.6e-06, "loss": 0.3527, "step": 93 }, { "epoch": 0.1494435612082671, "grad_norm": 3.6301491114326776, "learning_rate": 4.65e-06, "loss": 0.3443, "step": 94 }, { "epoch": 0.151033386327504, "grad_norm": 9.1371951560765, "learning_rate": 4.700000000000001e-06, "loss": 0.4076, "step": 95 }, { "epoch": 0.15262321144674085, "grad_norm": 3.7007232647417037, "learning_rate": 4.75e-06, "loss": 1.3226, "step": 96 }, { "epoch": 0.15421303656597773, "grad_norm": 1.4436397128358596, "learning_rate": 4.800000000000001e-06, "loss": 0.2498, "step": 97 }, { "epoch": 0.15580286168521462, "grad_norm": 2.121937515278619, "learning_rate": 4.849999999999999e-06, "loss": 0.3089, "step": 98 }, { "epoch": 0.1573926868044515, "grad_norm": 21.430379825129187, "learning_rate": 4.9e-06, "loss": 57.7393, "step": 99 }, { "epoch": 0.1589825119236884, "grad_norm": 2.1455680295612796, "learning_rate": 4.95e-06, "loss": 0.3399, "step": 100 }, { "epoch": 0.16057233704292528, "grad_norm": 2.141545468935083, "learning_rate": 4.9999999999999996e-06, "loss": 0.335, "step": 101 }, { "epoch": 0.16216216216216217, "grad_norm": 1.6598816898454043, "learning_rate": 5.05e-06, "loss": 0.2577, "step": 102 }, { "epoch": 0.16375198728139906, "grad_norm": 2.224124896314827, "learning_rate": 5.1e-06, "loss": 0.3286, "step": 103 }, { "epoch": 0.16534181240063592, "grad_norm": 1.5340621081005676, "learning_rate": 5.15e-06, "loss": 0.2321, "step": 104 }, { "epoch": 0.1669316375198728, "grad_norm": 1.3964454966985334, "learning_rate": 5.2e-06, "loss": 0.2918, "step": 105 }, { "epoch": 0.1685214626391097, "grad_norm": 2.3488960783500676, "learning_rate": 5.25e-06, "loss": 0.3884, "step": 106 }, { "epoch": 0.17011128775834658, "grad_norm": 1.4748270908180765, "learning_rate": 5.3e-06, "loss": 0.303, "step": 107 }, { "epoch": 0.17170111287758347, "grad_norm": 4.359653887439957, "learning_rate": 5.3500000000000004e-06, "loss": 0.4091, "step": 108 }, { "epoch": 0.17329093799682035, "grad_norm": 1.8363152211718876, "learning_rate": 5.4e-06, "loss": 0.3519, "step": 109 }, { "epoch": 0.17488076311605724, "grad_norm": 2.21210172687297, "learning_rate": 5.45e-06, "loss": 0.3841, "step": 110 }, { "epoch": 0.17647058823529413, "grad_norm": 1.3700568484943283, "learning_rate": 5.5e-06, "loss": 0.2965, "step": 111 }, { "epoch": 0.178060413354531, "grad_norm": 1.8351097591888834, "learning_rate": 5.55e-06, "loss": 0.3233, "step": 112 }, { "epoch": 0.17965023847376788, "grad_norm": 1.873418352257156, "learning_rate": 5.600000000000001e-06, "loss": 0.3046, "step": 113 }, { "epoch": 0.18124006359300476, "grad_norm": 1.5683857367827994, "learning_rate": 5.65e-06, "loss": 0.3191, "step": 114 }, { "epoch": 0.18282988871224165, "grad_norm": 1.9229291386905674, "learning_rate": 5.7000000000000005e-06, "loss": 0.3689, "step": 115 }, { "epoch": 0.18441971383147854, "grad_norm": 2.287181355620036, "learning_rate": 5.750000000000001e-06, "loss": 0.488, "step": 116 }, { "epoch": 0.18600953895071543, "grad_norm": 2.56292831164503, "learning_rate": 5.8e-06, "loss": 0.2834, "step": 117 }, { "epoch": 0.1875993640699523, "grad_norm": 1.8255763312328974, "learning_rate": 5.850000000000001e-06, "loss": 0.3151, "step": 118 }, { "epoch": 0.1891891891891892, "grad_norm": 1.69002084197253, "learning_rate": 5.899999999999999e-06, "loss": 0.3033, "step": 119 }, { "epoch": 0.1907790143084261, "grad_norm": 2.092237912197873, "learning_rate": 5.95e-06, "loss": 0.2884, "step": 120 }, { "epoch": 0.19236883942766295, "grad_norm": 19.46802098072797, "learning_rate": 6e-06, "loss": 42.7243, "step": 121 }, { "epoch": 0.19395866454689983, "grad_norm": 34.59585080980778, "learning_rate": 6.05e-06, "loss": 21.425, "step": 122 }, { "epoch": 0.19554848966613672, "grad_norm": 4.888550520806797, "learning_rate": 6.1e-06, "loss": 0.5551, "step": 123 }, { "epoch": 0.1971383147853736, "grad_norm": 2.7459983682638973, "learning_rate": 6.1499999999999996e-06, "loss": 0.3278, "step": 124 }, { "epoch": 0.1987281399046105, "grad_norm": 3.4195016424214812, "learning_rate": 6.2e-06, "loss": 0.3194, "step": 125 }, { "epoch": 0.20031796502384738, "grad_norm": 2.5311546520857315, "learning_rate": 6.25e-06, "loss": 0.3153, "step": 126 }, { "epoch": 0.20190779014308427, "grad_norm": 2.405135633031996, "learning_rate": 6.3e-06, "loss": 0.3361, "step": 127 }, { "epoch": 0.20349761526232116, "grad_norm": 3.450737385082818, "learning_rate": 6.35e-06, "loss": 0.2145, "step": 128 }, { "epoch": 0.20508744038155802, "grad_norm": 4.421353614121937, "learning_rate": 6.4000000000000006e-06, "loss": 0.2826, "step": 129 }, { "epoch": 0.2066772655007949, "grad_norm": 2.9407818450750174, "learning_rate": 6.45e-06, "loss": 0.3814, "step": 130 }, { "epoch": 0.2082670906200318, "grad_norm": 1.928727797916912, "learning_rate": 6.5000000000000004e-06, "loss": 0.2184, "step": 131 }, { "epoch": 0.20985691573926868, "grad_norm": 1.643137005529309, "learning_rate": 6.55e-06, "loss": 0.3137, "step": 132 }, { "epoch": 0.21144674085850557, "grad_norm": 5.036813239131316, "learning_rate": 6.6e-06, "loss": 0.4503, "step": 133 }, { "epoch": 0.21303656597774245, "grad_norm": 2.99279631609939, "learning_rate": 6.650000000000001e-06, "loss": 0.4495, "step": 134 }, { "epoch": 0.21462639109697934, "grad_norm": 1.9404581617893732, "learning_rate": 6.7e-06, "loss": 0.3792, "step": 135 }, { "epoch": 0.21621621621621623, "grad_norm": 1.5484886426898032, "learning_rate": 6.750000000000001e-06, "loss": 0.3009, "step": 136 }, { "epoch": 0.2178060413354531, "grad_norm": 9.607796410170126, "learning_rate": 6.8e-06, "loss": 22.7958, "step": 137 }, { "epoch": 0.21939586645468998, "grad_norm": 1.5829929844883943, "learning_rate": 6.8500000000000005e-06, "loss": 0.3367, "step": 138 }, { "epoch": 0.22098569157392686, "grad_norm": 2.2091365254610102, "learning_rate": 6.900000000000001e-06, "loss": 0.3167, "step": 139 }, { "epoch": 0.22257551669316375, "grad_norm": 6.1395924485133735, "learning_rate": 6.95e-06, "loss": 0.55, "step": 140 }, { "epoch": 0.22416534181240064, "grad_norm": 1.5672514551703651, "learning_rate": 7e-06, "loss": 0.3542, "step": 141 }, { "epoch": 0.22575516693163752, "grad_norm": 1.690540673835988, "learning_rate": 7.049999999999999e-06, "loss": 0.2882, "step": 142 }, { "epoch": 0.2273449920508744, "grad_norm": 2.5064738996434257, "learning_rate": 7.1e-06, "loss": 1.1543, "step": 143 }, { "epoch": 0.2289348171701113, "grad_norm": 2.082095601128218, "learning_rate": 7.15e-06, "loss": 0.3266, "step": 144 }, { "epoch": 0.23052464228934816, "grad_norm": 1.321504688457293, "learning_rate": 7.2e-06, "loss": 0.2466, "step": 145 }, { "epoch": 0.23211446740858505, "grad_norm": 1.9986308918387374, "learning_rate": 7.25e-06, "loss": 0.3142, "step": 146 }, { "epoch": 0.23370429252782193, "grad_norm": 1.557083659456137, "learning_rate": 7.3e-06, "loss": 0.3395, "step": 147 }, { "epoch": 0.23529411764705882, "grad_norm": 1.3208461362234136, "learning_rate": 7.35e-06, "loss": 0.2838, "step": 148 }, { "epoch": 0.2368839427662957, "grad_norm": 5.695105395530786, "learning_rate": 7.4e-06, "loss": 0.7558, "step": 149 }, { "epoch": 0.2384737678855326, "grad_norm": 2.254505991514579, "learning_rate": 7.45e-06, "loss": 0.2436, "step": 150 }, { "epoch": 0.24006359300476948, "grad_norm": 2.421778755954633, "learning_rate": 7.5e-06, "loss": 0.2663, "step": 151 }, { "epoch": 0.24165341812400637, "grad_norm": 1.3210898288169435, "learning_rate": 7.55e-06, "loss": 0.2112, "step": 152 }, { "epoch": 0.24324324324324326, "grad_norm": 1.492282992694592, "learning_rate": 7.600000000000001e-06, "loss": 0.275, "step": 153 }, { "epoch": 0.24483306836248012, "grad_norm": 2.963536638326518, "learning_rate": 7.65e-06, "loss": 0.278, "step": 154 }, { "epoch": 0.246422893481717, "grad_norm": 1.5176140678622898, "learning_rate": 7.699999999999999e-06, "loss": 0.2454, "step": 155 }, { "epoch": 0.2480127186009539, "grad_norm": 1.3768462513595667, "learning_rate": 7.75e-06, "loss": 0.2503, "step": 156 }, { "epoch": 0.24960254372019078, "grad_norm": 1.7215217704863925, "learning_rate": 7.8e-06, "loss": 0.3222, "step": 157 }, { "epoch": 0.25119236883942764, "grad_norm": 1.8496999629756794, "learning_rate": 7.85e-06, "loss": 0.352, "step": 158 }, { "epoch": 0.2527821939586645, "grad_norm": 1.6785722194284984, "learning_rate": 7.899999999999999e-06, "loss": 0.3066, "step": 159 }, { "epoch": 0.2543720190779014, "grad_norm": 2.035739688925869, "learning_rate": 7.95e-06, "loss": 0.3625, "step": 160 }, { "epoch": 0.2559618441971383, "grad_norm": 3.354163500253444, "learning_rate": 8e-06, "loss": 0.4428, "step": 161 }, { "epoch": 0.2575516693163752, "grad_norm": 1.6169523176618321, "learning_rate": 8.05e-06, "loss": 0.2023, "step": 162 }, { "epoch": 0.2591414944356121, "grad_norm": 1.2185647428224278, "learning_rate": 8.1e-06, "loss": 0.2255, "step": 163 }, { "epoch": 0.26073131955484896, "grad_norm": 1.4568855672643708, "learning_rate": 8.15e-06, "loss": 0.2997, "step": 164 }, { "epoch": 0.26232114467408585, "grad_norm": 2.394779371193609, "learning_rate": 8.2e-06, "loss": 0.3152, "step": 165 }, { "epoch": 0.26391096979332274, "grad_norm": 1.1579327561012442, "learning_rate": 8.25e-06, "loss": 0.2538, "step": 166 }, { "epoch": 0.2655007949125596, "grad_norm": 2.7570689862592466, "learning_rate": 8.3e-06, "loss": 0.263, "step": 167 }, { "epoch": 0.2670906200317965, "grad_norm": 1.5076129219881738, "learning_rate": 8.35e-06, "loss": 0.2457, "step": 168 }, { "epoch": 0.2686804451510334, "grad_norm": 9.94150493674604, "learning_rate": 8.400000000000001e-06, "loss": 28.1962, "step": 169 }, { "epoch": 0.2702702702702703, "grad_norm": 2.8058614569139664, "learning_rate": 8.45e-06, "loss": 0.368, "step": 170 }, { "epoch": 0.2718600953895072, "grad_norm": 2.309486088012813, "learning_rate": 8.5e-06, "loss": 0.3274, "step": 171 }, { "epoch": 0.27344992050874406, "grad_norm": 1.2696733059164054, "learning_rate": 8.55e-06, "loss": 0.2359, "step": 172 }, { "epoch": 0.27503974562798095, "grad_norm": 3.5596992784562165, "learning_rate": 8.6e-06, "loss": 0.324, "step": 173 }, { "epoch": 0.2766295707472178, "grad_norm": 2.6517365152645884, "learning_rate": 8.65e-06, "loss": 0.3599, "step": 174 }, { "epoch": 0.27821939586645467, "grad_norm": 1.398768431561282, "learning_rate": 8.7e-06, "loss": 0.2918, "step": 175 }, { "epoch": 0.27980922098569155, "grad_norm": 1.340141571239029, "learning_rate": 8.750000000000001e-06, "loss": 0.2428, "step": 176 }, { "epoch": 0.28139904610492844, "grad_norm": 2.2922241474115523, "learning_rate": 8.8e-06, "loss": 0.3045, "step": 177 }, { "epoch": 0.28298887122416533, "grad_norm": 3.7452420641326976, "learning_rate": 8.85e-06, "loss": 0.3966, "step": 178 }, { "epoch": 0.2845786963434022, "grad_norm": 4.291839023163834, "learning_rate": 8.900000000000001e-06, "loss": 0.4295, "step": 179 }, { "epoch": 0.2861685214626391, "grad_norm": 2.0585872904954705, "learning_rate": 8.95e-06, "loss": 0.2536, "step": 180 }, { "epoch": 0.287758346581876, "grad_norm": 2.216551894401291, "learning_rate": 9e-06, "loss": 0.2114, "step": 181 }, { "epoch": 0.2893481717011129, "grad_norm": 1.6394566545921296, "learning_rate": 9.050000000000001e-06, "loss": 0.2591, "step": 182 }, { "epoch": 0.29093799682034976, "grad_norm": 0.8015361465635777, "learning_rate": 9.100000000000001e-06, "loss": 0.2162, "step": 183 }, { "epoch": 0.29252782193958665, "grad_norm": 1.589354708478782, "learning_rate": 9.15e-06, "loss": 0.2602, "step": 184 }, { "epoch": 0.29411764705882354, "grad_norm": 1.0050721548621755, "learning_rate": 9.2e-06, "loss": 0.2641, "step": 185 }, { "epoch": 0.2957074721780604, "grad_norm": 1.2645804731964436, "learning_rate": 9.250000000000001e-06, "loss": 0.3015, "step": 186 }, { "epoch": 0.2972972972972973, "grad_norm": 1.0007065259328647, "learning_rate": 9.3e-06, "loss": 0.441, "step": 187 }, { "epoch": 0.2988871224165342, "grad_norm": 2.5886877542770286, "learning_rate": 9.35e-06, "loss": 0.259, "step": 188 }, { "epoch": 0.3004769475357711, "grad_norm": 1.1340307925651814, "learning_rate": 9.400000000000001e-06, "loss": 0.2049, "step": 189 }, { "epoch": 0.302066772655008, "grad_norm": 3.299680279210414, "learning_rate": 9.450000000000001e-06, "loss": 0.4893, "step": 190 }, { "epoch": 0.3036565977742448, "grad_norm": 1.8654852143992833, "learning_rate": 9.5e-06, "loss": 0.3392, "step": 191 }, { "epoch": 0.3052464228934817, "grad_norm": 2.1534063524174036, "learning_rate": 9.550000000000002e-06, "loss": 0.4219, "step": 192 }, { "epoch": 0.3068362480127186, "grad_norm": 1.206918562135429, "learning_rate": 9.600000000000001e-06, "loss": 0.2479, "step": 193 }, { "epoch": 0.30842607313195547, "grad_norm": 1.2185136445901936, "learning_rate": 9.649999999999999e-06, "loss": 0.2553, "step": 194 }, { "epoch": 0.31001589825119236, "grad_norm": 2.8504487907491605, "learning_rate": 9.699999999999999e-06, "loss": 0.4003, "step": 195 }, { "epoch": 0.31160572337042924, "grad_norm": 1.2910035087675946, "learning_rate": 9.75e-06, "loss": 0.2609, "step": 196 }, { "epoch": 0.31319554848966613, "grad_norm": 1.5035623115464205, "learning_rate": 9.8e-06, "loss": 0.384, "step": 197 }, { "epoch": 0.314785373608903, "grad_norm": 1.1597723452634396, "learning_rate": 9.849999999999999e-06, "loss": 0.2552, "step": 198 }, { "epoch": 0.3163751987281399, "grad_norm": 1.5846388381977266, "learning_rate": 9.9e-06, "loss": 0.3061, "step": 199 }, { "epoch": 0.3179650238473768, "grad_norm": 4.08839026091095, "learning_rate": 9.95e-06, "loss": 0.2834, "step": 200 }, { "epoch": 0.3195548489666137, "grad_norm": 12.28117840075908, "learning_rate": 9.999999999999999e-06, "loss": 38.6443, "step": 201 }, { "epoch": 0.32114467408585057, "grad_norm": 12.093634795709107, "learning_rate": 1.005e-05, "loss": 41.111, "step": 202 }, { "epoch": 0.32273449920508746, "grad_norm": 1.6921477329907502, "learning_rate": 1.01e-05, "loss": 0.2379, "step": 203 }, { "epoch": 0.32432432432432434, "grad_norm": 1.3309852079860687, "learning_rate": 1.015e-05, "loss": 0.2413, "step": 204 }, { "epoch": 0.32591414944356123, "grad_norm": 1.7479271514906554, "learning_rate": 1.02e-05, "loss": 0.279, "step": 205 }, { "epoch": 0.3275039745627981, "grad_norm": 2.674379392952364, "learning_rate": 1.025e-05, "loss": 0.3523, "step": 206 }, { "epoch": 0.32909379968203495, "grad_norm": 1.3580620916865656, "learning_rate": 1.03e-05, "loss": 0.2742, "step": 207 }, { "epoch": 0.33068362480127184, "grad_norm": 2.641514512119881, "learning_rate": 1.035e-05, "loss": 0.2324, "step": 208 }, { "epoch": 0.3322734499205087, "grad_norm": 1.6188361619701048, "learning_rate": 1.04e-05, "loss": 0.2613, "step": 209 }, { "epoch": 0.3338632750397456, "grad_norm": 1.076352196378147, "learning_rate": 1.045e-05, "loss": 0.215, "step": 210 }, { "epoch": 0.3354531001589825, "grad_norm": 0.8052076039512558, "learning_rate": 1.05e-05, "loss": 0.1973, "step": 211 }, { "epoch": 0.3370429252782194, "grad_norm": 5.067719529518202, "learning_rate": 1.055e-05, "loss": 0.8736, "step": 212 }, { "epoch": 0.3386327503974563, "grad_norm": 1.590474697799204, "learning_rate": 1.06e-05, "loss": 0.2879, "step": 213 }, { "epoch": 0.34022257551669316, "grad_norm": 0.9272548961019641, "learning_rate": 1.065e-05, "loss": 0.3491, "step": 214 }, { "epoch": 0.34181240063593005, "grad_norm": 1.5710282915503249, "learning_rate": 1.0700000000000001e-05, "loss": 0.3012, "step": 215 }, { "epoch": 0.34340222575516693, "grad_norm": 2.071716564644239, "learning_rate": 1.075e-05, "loss": 0.291, "step": 216 }, { "epoch": 0.3449920508744038, "grad_norm": 3.584964691228448, "learning_rate": 1.08e-05, "loss": 0.482, "step": 217 }, { "epoch": 0.3465818759936407, "grad_norm": 1.7350891684332745, "learning_rate": 1.0850000000000001e-05, "loss": 0.2093, "step": 218 }, { "epoch": 0.3481717011128776, "grad_norm": 3.2872999006781134, "learning_rate": 1.09e-05, "loss": 0.2299, "step": 219 }, { "epoch": 0.3497615262321145, "grad_norm": 0.8435806277760939, "learning_rate": 1.095e-05, "loss": 0.1958, "step": 220 }, { "epoch": 0.35135135135135137, "grad_norm": 1.5890354065982326, "learning_rate": 1.1e-05, "loss": 0.3099, "step": 221 }, { "epoch": 0.35294117647058826, "grad_norm": 1.4852859519718327, "learning_rate": 1.1050000000000001e-05, "loss": 0.2526, "step": 222 }, { "epoch": 0.35453100158982515, "grad_norm": 2.0393514138937987, "learning_rate": 1.11e-05, "loss": 0.3452, "step": 223 }, { "epoch": 0.356120826709062, "grad_norm": 1.557939102190742, "learning_rate": 1.115e-05, "loss": 0.257, "step": 224 }, { "epoch": 0.35771065182829886, "grad_norm": 3.6990638368839748, "learning_rate": 1.1200000000000001e-05, "loss": 0.3914, "step": 225 }, { "epoch": 0.35930047694753575, "grad_norm": 1.9782098847151628, "learning_rate": 1.125e-05, "loss": 0.3182, "step": 226 }, { "epoch": 0.36089030206677264, "grad_norm": 1.0700909683464617, "learning_rate": 1.13e-05, "loss": 0.2279, "step": 227 }, { "epoch": 0.3624801271860095, "grad_norm": 3.339744311250999, "learning_rate": 1.1350000000000001e-05, "loss": 0.3349, "step": 228 }, { "epoch": 0.3640699523052464, "grad_norm": 1.439250645657682, "learning_rate": 1.1400000000000001e-05, "loss": 0.2959, "step": 229 }, { "epoch": 0.3656597774244833, "grad_norm": 1.200827090589856, "learning_rate": 1.145e-05, "loss": 0.2294, "step": 230 }, { "epoch": 0.3672496025437202, "grad_norm": 1.1228652930090741, "learning_rate": 1.1500000000000002e-05, "loss": 0.2227, "step": 231 }, { "epoch": 0.3688394276629571, "grad_norm": 1.009665522432123, "learning_rate": 1.1550000000000001e-05, "loss": 0.2475, "step": 232 }, { "epoch": 0.37042925278219396, "grad_norm": 2.45055375131345, "learning_rate": 1.16e-05, "loss": 0.3104, "step": 233 }, { "epoch": 0.37201907790143085, "grad_norm": 1.334482583147438, "learning_rate": 1.165e-05, "loss": 0.2407, "step": 234 }, { "epoch": 0.37360890302066774, "grad_norm": 1.1373994317607716, "learning_rate": 1.1700000000000001e-05, "loss": 0.2097, "step": 235 }, { "epoch": 0.3751987281399046, "grad_norm": 2.2888577314528393, "learning_rate": 1.1750000000000001e-05, "loss": 0.2828, "step": 236 }, { "epoch": 0.3767885532591415, "grad_norm": 1.772901914915241, "learning_rate": 1.1799999999999999e-05, "loss": 0.4346, "step": 237 }, { "epoch": 0.3783783783783784, "grad_norm": 2.7663283691164637, "learning_rate": 1.185e-05, "loss": 0.2589, "step": 238 }, { "epoch": 0.3799682034976153, "grad_norm": 3.473546176045371, "learning_rate": 1.19e-05, "loss": 0.3963, "step": 239 }, { "epoch": 0.3815580286168522, "grad_norm": 2.1803729758718626, "learning_rate": 1.1949999999999999e-05, "loss": 0.2586, "step": 240 }, { "epoch": 0.383147853736089, "grad_norm": 1.2768054161456535, "learning_rate": 1.2e-05, "loss": 0.2981, "step": 241 }, { "epoch": 0.3847376788553259, "grad_norm": 1.2511560170472653, "learning_rate": 1.205e-05, "loss": 0.2804, "step": 242 }, { "epoch": 0.3863275039745628, "grad_norm": 1.292697354269098, "learning_rate": 1.21e-05, "loss": 0.1988, "step": 243 }, { "epoch": 0.38791732909379967, "grad_norm": 2.6874218269000596, "learning_rate": 1.215e-05, "loss": 0.3822, "step": 244 }, { "epoch": 0.38950715421303655, "grad_norm": 1.237471635990396, "learning_rate": 1.22e-05, "loss": 0.2254, "step": 245 }, { "epoch": 0.39109697933227344, "grad_norm": 1.0959191038863583, "learning_rate": 1.225e-05, "loss": 0.2208, "step": 246 }, { "epoch": 0.39268680445151033, "grad_norm": 1.1912097074904957, "learning_rate": 1.2299999999999999e-05, "loss": 0.2608, "step": 247 }, { "epoch": 0.3942766295707472, "grad_norm": 0.8813251812994075, "learning_rate": 1.235e-05, "loss": 0.2455, "step": 248 }, { "epoch": 0.3958664546899841, "grad_norm": 1.5931295945709578, "learning_rate": 1.24e-05, "loss": 0.236, "step": 249 }, { "epoch": 0.397456279809221, "grad_norm": 3.4614521260692035, "learning_rate": 1.245e-05, "loss": 0.3172, "step": 250 }, { "epoch": 0.3990461049284579, "grad_norm": 1.3051934108896754, "learning_rate": 1.25e-05, "loss": 0.2421, "step": 251 }, { "epoch": 0.40063593004769477, "grad_norm": 1.0621933834025052, "learning_rate": 1.255e-05, "loss": 0.2352, "step": 252 }, { "epoch": 0.40222575516693165, "grad_norm": 1.1621702738797568, "learning_rate": 1.26e-05, "loss": 0.2364, "step": 253 }, { "epoch": 0.40381558028616854, "grad_norm": 1.1667314450255237, "learning_rate": 1.2650000000000001e-05, "loss": 0.201, "step": 254 }, { "epoch": 0.40540540540540543, "grad_norm": 1.664746076597334, "learning_rate": 1.27e-05, "loss": 0.2139, "step": 255 }, { "epoch": 0.4069952305246423, "grad_norm": 1.6102209733895332, "learning_rate": 1.275e-05, "loss": 0.2064, "step": 256 }, { "epoch": 0.40858505564387915, "grad_norm": 1.1731897391111512, "learning_rate": 1.2800000000000001e-05, "loss": 0.232, "step": 257 }, { "epoch": 0.41017488076311603, "grad_norm": 1.1462826577116, "learning_rate": 1.285e-05, "loss": 0.2035, "step": 258 }, { "epoch": 0.4117647058823529, "grad_norm": 1.6985771415219733, "learning_rate": 1.29e-05, "loss": 0.283, "step": 259 }, { "epoch": 0.4133545310015898, "grad_norm": 1.2855505146722028, "learning_rate": 1.295e-05, "loss": 0.2478, "step": 260 }, { "epoch": 0.4149443561208267, "grad_norm": 1.9269782260284674, "learning_rate": 1.3000000000000001e-05, "loss": 0.2638, "step": 261 }, { "epoch": 0.4165341812400636, "grad_norm": 1.8250106165671103, "learning_rate": 1.305e-05, "loss": 0.3383, "step": 262 }, { "epoch": 0.41812400635930047, "grad_norm": 1.4613961756046159, "learning_rate": 1.31e-05, "loss": 0.2502, "step": 263 }, { "epoch": 0.41971383147853736, "grad_norm": 0.7493017588435321, "learning_rate": 1.3150000000000001e-05, "loss": 0.1383, "step": 264 }, { "epoch": 0.42130365659777425, "grad_norm": 1.8114713425161009, "learning_rate": 1.32e-05, "loss": 0.2281, "step": 265 }, { "epoch": 0.42289348171701113, "grad_norm": 1.33626984581196, "learning_rate": 1.325e-05, "loss": 0.2491, "step": 266 }, { "epoch": 0.424483306836248, "grad_norm": 18.412830529366733, "learning_rate": 1.3300000000000001e-05, "loss": 40.2664, "step": 267 }, { "epoch": 0.4260731319554849, "grad_norm": 1.635465849878839, "learning_rate": 1.3350000000000001e-05, "loss": 0.3017, "step": 268 }, { "epoch": 0.4276629570747218, "grad_norm": 1.2856599380909932, "learning_rate": 1.34e-05, "loss": 0.2055, "step": 269 }, { "epoch": 0.4292527821939587, "grad_norm": 1.206983913274647, "learning_rate": 1.345e-05, "loss": 0.2019, "step": 270 }, { "epoch": 0.43084260731319557, "grad_norm": 0.8173874713790353, "learning_rate": 1.3500000000000001e-05, "loss": 0.166, "step": 271 }, { "epoch": 0.43243243243243246, "grad_norm": 2.5308695241674055, "learning_rate": 1.355e-05, "loss": 0.2845, "step": 272 }, { "epoch": 0.43402225755166934, "grad_norm": 6.919223253912198, "learning_rate": 1.36e-05, "loss": 40.099, "step": 273 }, { "epoch": 0.4356120826709062, "grad_norm": 1.2644081127298414, "learning_rate": 1.3650000000000001e-05, "loss": 0.2289, "step": 274 }, { "epoch": 0.43720190779014306, "grad_norm": 1.69828801543595, "learning_rate": 1.3700000000000001e-05, "loss": 0.2224, "step": 275 }, { "epoch": 0.43879173290937995, "grad_norm": 1.4628722050279872, "learning_rate": 1.375e-05, "loss": 0.239, "step": 276 }, { "epoch": 0.44038155802861684, "grad_norm": 1.3452217657175023, "learning_rate": 1.3800000000000002e-05, "loss": 0.2152, "step": 277 }, { "epoch": 0.4419713831478537, "grad_norm": 1.4384259177353511, "learning_rate": 1.3850000000000001e-05, "loss": 0.2345, "step": 278 }, { "epoch": 0.4435612082670906, "grad_norm": 1.8228572507259209, "learning_rate": 1.39e-05, "loss": 0.3404, "step": 279 }, { "epoch": 0.4451510333863275, "grad_norm": 2.8411570719014745, "learning_rate": 1.395e-05, "loss": 0.2896, "step": 280 }, { "epoch": 0.4467408585055644, "grad_norm": 2.2520469464811033, "learning_rate": 1.4e-05, "loss": 0.3273, "step": 281 }, { "epoch": 0.4483306836248013, "grad_norm": 2.3028172021890003, "learning_rate": 1.405e-05, "loss": 0.2578, "step": 282 }, { "epoch": 0.44992050874403816, "grad_norm": 1.5174515611200659, "learning_rate": 1.4099999999999999e-05, "loss": 0.2476, "step": 283 }, { "epoch": 0.45151033386327505, "grad_norm": 12.27447768285051, "learning_rate": 1.415e-05, "loss": 40.0011, "step": 284 }, { "epoch": 0.45310015898251194, "grad_norm": 1.2507424712141602, "learning_rate": 1.42e-05, "loss": 0.2684, "step": 285 }, { "epoch": 0.4546899841017488, "grad_norm": 1.2818280436191316, "learning_rate": 1.4249999999999999e-05, "loss": 0.9527, "step": 286 }, { "epoch": 0.4562798092209857, "grad_norm": 1.6381485523684878, "learning_rate": 1.43e-05, "loss": 0.2345, "step": 287 }, { "epoch": 0.4578696343402226, "grad_norm": 3.9510954847417374, "learning_rate": 1.435e-05, "loss": 0.2735, "step": 288 }, { "epoch": 0.4594594594594595, "grad_norm": 1.788610426204811, "learning_rate": 1.44e-05, "loss": 0.2634, "step": 289 }, { "epoch": 0.4610492845786963, "grad_norm": 1.3846110201600816, "learning_rate": 1.445e-05, "loss": 0.2811, "step": 290 }, { "epoch": 0.4626391096979332, "grad_norm": 1.4826623126769016, "learning_rate": 1.45e-05, "loss": 0.2982, "step": 291 }, { "epoch": 0.4642289348171701, "grad_norm": 2.843407437636561, "learning_rate": 1.455e-05, "loss": 0.3564, "step": 292 }, { "epoch": 0.465818759936407, "grad_norm": 1.1862451460629464, "learning_rate": 1.46e-05, "loss": 0.2242, "step": 293 }, { "epoch": 0.46740858505564387, "grad_norm": 2.4456443514585056, "learning_rate": 1.465e-05, "loss": 0.3022, "step": 294 }, { "epoch": 0.46899841017488075, "grad_norm": 1.8245130065155648, "learning_rate": 1.47e-05, "loss": 0.2805, "step": 295 }, { "epoch": 0.47058823529411764, "grad_norm": 1.985608797685802, "learning_rate": 1.475e-05, "loss": 0.278, "step": 296 }, { "epoch": 0.47217806041335453, "grad_norm": 10.478212933183018, "learning_rate": 1.48e-05, "loss": 37.0519, "step": 297 }, { "epoch": 0.4737678855325914, "grad_norm": 1.176374921671176, "learning_rate": 1.485e-05, "loss": 0.2445, "step": 298 }, { "epoch": 0.4753577106518283, "grad_norm": 1.606927869759438, "learning_rate": 1.49e-05, "loss": 0.3542, "step": 299 }, { "epoch": 0.4769475357710652, "grad_norm": 1.289245434892135, "learning_rate": 1.4950000000000001e-05, "loss": 0.2708, "step": 300 }, { "epoch": 0.4785373608903021, "grad_norm": 0.9702469573159896, "learning_rate": 1.5e-05, "loss": 0.1889, "step": 301 }, { "epoch": 0.48012718600953896, "grad_norm": 2.041437140050196, "learning_rate": 1.5050000000000002e-05, "loss": 0.2941, "step": 302 }, { "epoch": 0.48171701112877585, "grad_norm": 1.55368993673939, "learning_rate": 1.51e-05, "loss": 0.348, "step": 303 }, { "epoch": 0.48330683624801274, "grad_norm": 1.635769831633422, "learning_rate": 1.515e-05, "loss": 0.3151, "step": 304 }, { "epoch": 0.4848966613672496, "grad_norm": 4.174578657280089, "learning_rate": 1.5200000000000002e-05, "loss": 0.3855, "step": 305 }, { "epoch": 0.4864864864864865, "grad_norm": 1.5434445887723258, "learning_rate": 1.525e-05, "loss": 0.3291, "step": 306 }, { "epoch": 0.48807631160572335, "grad_norm": 1.0480145496409916, "learning_rate": 1.53e-05, "loss": 0.2576, "step": 307 }, { "epoch": 0.48966613672496023, "grad_norm": 1.0149642923234097, "learning_rate": 1.535e-05, "loss": 0.2072, "step": 308 }, { "epoch": 0.4912559618441971, "grad_norm": 1.0188810509015556, "learning_rate": 1.5399999999999998e-05, "loss": 0.2282, "step": 309 }, { "epoch": 0.492845786963434, "grad_norm": 0.9601687889794551, "learning_rate": 1.545e-05, "loss": 0.2184, "step": 310 }, { "epoch": 0.4944356120826709, "grad_norm": 1.2040954041203409, "learning_rate": 1.55e-05, "loss": 0.2678, "step": 311 }, { "epoch": 0.4960254372019078, "grad_norm": 2.6554655421596114, "learning_rate": 1.555e-05, "loss": 0.3025, "step": 312 }, { "epoch": 0.49761526232114467, "grad_norm": 1.8483336847424112, "learning_rate": 1.56e-05, "loss": 0.3405, "step": 313 }, { "epoch": 0.49920508744038156, "grad_norm": 0.8564175783463368, "learning_rate": 1.5649999999999998e-05, "loss": 0.2299, "step": 314 }, { "epoch": 0.5007949125596184, "grad_norm": 2.227540542263857, "learning_rate": 1.57e-05, "loss": 0.2332, "step": 315 }, { "epoch": 0.5023847376788553, "grad_norm": 1.3924747956345411, "learning_rate": 1.575e-05, "loss": 0.2275, "step": 316 }, { "epoch": 0.5039745627980922, "grad_norm": 1.4055398814767508, "learning_rate": 1.5799999999999998e-05, "loss": 0.2293, "step": 317 }, { "epoch": 0.505564387917329, "grad_norm": 0.9260122709335098, "learning_rate": 1.585e-05, "loss": 0.243, "step": 318 }, { "epoch": 0.5071542130365659, "grad_norm": 1.7893005705909428, "learning_rate": 1.59e-05, "loss": 0.4086, "step": 319 }, { "epoch": 0.5087440381558028, "grad_norm": 4.614275825676911, "learning_rate": 1.5949999999999998e-05, "loss": 0.3321, "step": 320 }, { "epoch": 0.5103338632750397, "grad_norm": 1.0813017953068882, "learning_rate": 1.6e-05, "loss": 0.2685, "step": 321 }, { "epoch": 0.5119236883942766, "grad_norm": 2.7191285958069114, "learning_rate": 1.605e-05, "loss": 0.3528, "step": 322 }, { "epoch": 0.5135135135135135, "grad_norm": 1.313854062762596, "learning_rate": 1.61e-05, "loss": 0.2346, "step": 323 }, { "epoch": 0.5151033386327504, "grad_norm": 2.355938867409812, "learning_rate": 1.615e-05, "loss": 0.2756, "step": 324 }, { "epoch": 0.5166931637519873, "grad_norm": 1.6983055392926085, "learning_rate": 1.62e-05, "loss": 0.9052, "step": 325 }, { "epoch": 0.5182829888712241, "grad_norm": 2.770798759344969, "learning_rate": 1.625e-05, "loss": 0.3467, "step": 326 }, { "epoch": 0.519872813990461, "grad_norm": 1.7815817430137466, "learning_rate": 1.63e-05, "loss": 0.2842, "step": 327 }, { "epoch": 0.5214626391096979, "grad_norm": 1.3687909737800006, "learning_rate": 1.635e-05, "loss": 0.2808, "step": 328 }, { "epoch": 0.5230524642289348, "grad_norm": 1.998063474228559, "learning_rate": 1.64e-05, "loss": 0.7284, "step": 329 }, { "epoch": 0.5246422893481717, "grad_norm": 3.3701653810155316, "learning_rate": 1.645e-05, "loss": 0.4206, "step": 330 }, { "epoch": 0.5262321144674086, "grad_norm": 1.6453175559425075, "learning_rate": 1.65e-05, "loss": 0.3053, "step": 331 }, { "epoch": 0.5278219395866455, "grad_norm": 1.399331730481635, "learning_rate": 1.655e-05, "loss": 0.272, "step": 332 }, { "epoch": 0.5294117647058824, "grad_norm": 1.3972015370518003, "learning_rate": 1.66e-05, "loss": 0.2385, "step": 333 }, { "epoch": 0.5310015898251192, "grad_norm": 0.9854314948197552, "learning_rate": 1.665e-05, "loss": 0.2144, "step": 334 }, { "epoch": 0.5325914149443561, "grad_norm": 1.7976241069817438, "learning_rate": 1.67e-05, "loss": 0.2104, "step": 335 }, { "epoch": 0.534181240063593, "grad_norm": 1.6843837020774817, "learning_rate": 1.675e-05, "loss": 0.3518, "step": 336 }, { "epoch": 0.5357710651828299, "grad_norm": 1.4188177658561945, "learning_rate": 1.6800000000000002e-05, "loss": 0.2281, "step": 337 }, { "epoch": 0.5373608903020668, "grad_norm": 8.172265802117579, "learning_rate": 1.685e-05, "loss": 20.671, "step": 338 }, { "epoch": 0.5389507154213037, "grad_norm": 2.2787759559989467, "learning_rate": 1.69e-05, "loss": 0.8958, "step": 339 }, { "epoch": 0.5405405405405406, "grad_norm": 0.9690356902895194, "learning_rate": 1.695e-05, "loss": 0.2454, "step": 340 }, { "epoch": 0.5421303656597775, "grad_norm": 1.6265479273463743, "learning_rate": 1.7e-05, "loss": 0.2856, "step": 341 }, { "epoch": 0.5437201907790143, "grad_norm": 0.8409714376322419, "learning_rate": 1.705e-05, "loss": 0.187, "step": 342 }, { "epoch": 0.5453100158982512, "grad_norm": 2.6584611131450795, "learning_rate": 1.71e-05, "loss": 0.2437, "step": 343 }, { "epoch": 0.5468998410174881, "grad_norm": 1.3497061068913465, "learning_rate": 1.715e-05, "loss": 0.2392, "step": 344 }, { "epoch": 0.548489666136725, "grad_norm": 2.111399252812515, "learning_rate": 1.72e-05, "loss": 0.2222, "step": 345 }, { "epoch": 0.5500794912559619, "grad_norm": 1.0677951045315937, "learning_rate": 1.725e-05, "loss": 0.2527, "step": 346 }, { "epoch": 0.5516693163751988, "grad_norm": 1.665499951069647, "learning_rate": 1.73e-05, "loss": 0.2599, "step": 347 }, { "epoch": 0.5532591414944356, "grad_norm": 0.5919484742703548, "learning_rate": 1.735e-05, "loss": 0.1868, "step": 348 }, { "epoch": 0.5548489666136724, "grad_norm": 1.1436781965715417, "learning_rate": 1.74e-05, "loss": 0.2061, "step": 349 }, { "epoch": 0.5564387917329093, "grad_norm": 1.1570378907839431, "learning_rate": 1.745e-05, "loss": 0.2838, "step": 350 }, { "epoch": 0.5580286168521462, "grad_norm": 1.916505258736185, "learning_rate": 1.7500000000000002e-05, "loss": 0.2067, "step": 351 }, { "epoch": 0.5596184419713831, "grad_norm": 1.8068534886869236, "learning_rate": 1.755e-05, "loss": 0.2316, "step": 352 }, { "epoch": 0.56120826709062, "grad_norm": 0.9095040989529973, "learning_rate": 1.76e-05, "loss": 0.2487, "step": 353 }, { "epoch": 0.5627980922098569, "grad_norm": 1.0516198347736088, "learning_rate": 1.7650000000000002e-05, "loss": 0.2239, "step": 354 }, { "epoch": 0.5643879173290938, "grad_norm": 1.1849062890281172, "learning_rate": 1.77e-05, "loss": 0.2688, "step": 355 }, { "epoch": 0.5659777424483307, "grad_norm": 1.4908541457604094, "learning_rate": 1.775e-05, "loss": 0.2216, "step": 356 }, { "epoch": 0.5675675675675675, "grad_norm": 1.5677962505337695, "learning_rate": 1.7800000000000002e-05, "loss": 0.2254, "step": 357 }, { "epoch": 0.5691573926868044, "grad_norm": 1.5461002045936105, "learning_rate": 1.785e-05, "loss": 0.265, "step": 358 }, { "epoch": 0.5707472178060413, "grad_norm": 1.3077581300287706, "learning_rate": 1.79e-05, "loss": 0.2088, "step": 359 }, { "epoch": 0.5723370429252782, "grad_norm": 0.6775878895493114, "learning_rate": 1.7950000000000003e-05, "loss": 0.2175, "step": 360 }, { "epoch": 0.5739268680445151, "grad_norm": 11.67457637130232, "learning_rate": 1.8e-05, "loss": 36.2782, "step": 361 }, { "epoch": 0.575516693163752, "grad_norm": 0.895049467191542, "learning_rate": 1.805e-05, "loss": 0.252, "step": 362 }, { "epoch": 0.5771065182829889, "grad_norm": 10.408834793780843, "learning_rate": 1.8100000000000003e-05, "loss": 35.4284, "step": 363 }, { "epoch": 0.5786963434022258, "grad_norm": 0.9706059689029187, "learning_rate": 1.815e-05, "loss": 0.2175, "step": 364 }, { "epoch": 0.5802861685214626, "grad_norm": 1.8131627061748241, "learning_rate": 1.8200000000000002e-05, "loss": 0.2869, "step": 365 }, { "epoch": 0.5818759936406995, "grad_norm": 1.5442152349703204, "learning_rate": 1.825e-05, "loss": 0.2494, "step": 366 }, { "epoch": 0.5834658187599364, "grad_norm": 0.6149179291206693, "learning_rate": 1.83e-05, "loss": 0.1741, "step": 367 }, { "epoch": 0.5850556438791733, "grad_norm": 1.6557239810530067, "learning_rate": 1.8350000000000002e-05, "loss": 0.3211, "step": 368 }, { "epoch": 0.5866454689984102, "grad_norm": 21.93905647991169, "learning_rate": 1.84e-05, "loss": 35.3367, "step": 369 }, { "epoch": 0.5882352941176471, "grad_norm": 1.3943284447943827, "learning_rate": 1.845e-05, "loss": 0.2035, "step": 370 }, { "epoch": 0.589825119236884, "grad_norm": 4.420744802527324, "learning_rate": 1.8500000000000002e-05, "loss": 0.3177, "step": 371 }, { "epoch": 0.5914149443561209, "grad_norm": 0.9558316868817092, "learning_rate": 1.855e-05, "loss": 0.1871, "step": 372 }, { "epoch": 0.5930047694753577, "grad_norm": 2.6049479406132643, "learning_rate": 1.86e-05, "loss": 0.2902, "step": 373 }, { "epoch": 0.5945945945945946, "grad_norm": 1.2232004510214258, "learning_rate": 1.8650000000000003e-05, "loss": 0.2659, "step": 374 }, { "epoch": 0.5961844197138315, "grad_norm": 0.972050177325397, "learning_rate": 1.87e-05, "loss": 0.1993, "step": 375 }, { "epoch": 0.5977742448330684, "grad_norm": 1.1990823518469167, "learning_rate": 1.8750000000000002e-05, "loss": 0.2043, "step": 376 }, { "epoch": 0.5993640699523053, "grad_norm": 0.7252285151857282, "learning_rate": 1.8800000000000003e-05, "loss": 0.1711, "step": 377 }, { "epoch": 0.6009538950715422, "grad_norm": 1.2684277566435336, "learning_rate": 1.885e-05, "loss": 0.2079, "step": 378 }, { "epoch": 0.6025437201907791, "grad_norm": 1.5879408560782209, "learning_rate": 1.8900000000000002e-05, "loss": 0.1985, "step": 379 }, { "epoch": 0.604133545310016, "grad_norm": 1.5042691942498285, "learning_rate": 1.8950000000000003e-05, "loss": 0.3018, "step": 380 }, { "epoch": 0.6057233704292527, "grad_norm": 1.375702974387334, "learning_rate": 1.9e-05, "loss": 0.2054, "step": 381 }, { "epoch": 0.6073131955484896, "grad_norm": 1.0811347000362872, "learning_rate": 1.9050000000000002e-05, "loss": 0.2735, "step": 382 }, { "epoch": 0.6089030206677265, "grad_norm": 1.164210544600029, "learning_rate": 1.9100000000000003e-05, "loss": 0.2215, "step": 383 }, { "epoch": 0.6104928457869634, "grad_norm": 2.109651566935429, "learning_rate": 1.915e-05, "loss": 0.3174, "step": 384 }, { "epoch": 0.6120826709062003, "grad_norm": 1.0846599190397557, "learning_rate": 1.9200000000000003e-05, "loss": 0.2567, "step": 385 }, { "epoch": 0.6136724960254372, "grad_norm": 1.7543664162270383, "learning_rate": 1.9250000000000004e-05, "loss": 0.2594, "step": 386 }, { "epoch": 0.615262321144674, "grad_norm": 18.086638383427005, "learning_rate": 1.9299999999999998e-05, "loss": 46.7105, "step": 387 }, { "epoch": 0.6168521462639109, "grad_norm": 1.4493850373945567, "learning_rate": 1.935e-05, "loss": 0.2563, "step": 388 }, { "epoch": 0.6184419713831478, "grad_norm": 1.8805379661987693, "learning_rate": 1.9399999999999997e-05, "loss": 0.3318, "step": 389 }, { "epoch": 0.6200317965023847, "grad_norm": 0.8612682303204704, "learning_rate": 1.945e-05, "loss": 0.2152, "step": 390 }, { "epoch": 0.6216216216216216, "grad_norm": 1.5212939916615305, "learning_rate": 1.95e-05, "loss": 0.2475, "step": 391 }, { "epoch": 0.6232114467408585, "grad_norm": 0.9110151872445261, "learning_rate": 1.9549999999999997e-05, "loss": 0.207, "step": 392 }, { "epoch": 0.6248012718600954, "grad_norm": 0.9835167402219618, "learning_rate": 1.96e-05, "loss": 0.222, "step": 393 }, { "epoch": 0.6263910969793323, "grad_norm": 2.766062005906614, "learning_rate": 1.965e-05, "loss": 0.2089, "step": 394 }, { "epoch": 0.6279809220985691, "grad_norm": 0.9759666575323273, "learning_rate": 1.9699999999999998e-05, "loss": 0.2042, "step": 395 }, { "epoch": 0.629570747217806, "grad_norm": 1.5022306101238487, "learning_rate": 1.975e-05, "loss": 0.1914, "step": 396 }, { "epoch": 0.6311605723370429, "grad_norm": 0.8866005579380816, "learning_rate": 1.98e-05, "loss": 0.2047, "step": 397 }, { "epoch": 0.6327503974562798, "grad_norm": 1.188039233167719, "learning_rate": 1.9849999999999998e-05, "loss": 0.2646, "step": 398 }, { "epoch": 0.6343402225755167, "grad_norm": 0.9531727140787785, "learning_rate": 1.99e-05, "loss": 0.2756, "step": 399 }, { "epoch": 0.6359300476947536, "grad_norm": 1.1162721782283782, "learning_rate": 1.995e-05, "loss": 0.2978, "step": 400 }, { "epoch": 0.6375198728139905, "grad_norm": 1.1507352615538533, "learning_rate": 1.9999999999999998e-05, "loss": 0.24, "step": 401 }, { "epoch": 0.6391096979332274, "grad_norm": 1.2799444436146774, "learning_rate": 2.005e-05, "loss": 0.2715, "step": 402 }, { "epoch": 0.6406995230524642, "grad_norm": 1.5372070881470923, "learning_rate": 2.01e-05, "loss": 0.3011, "step": 403 }, { "epoch": 0.6422893481717011, "grad_norm": 0.7208075477730832, "learning_rate": 2.015e-05, "loss": 0.182, "step": 404 }, { "epoch": 0.643879173290938, "grad_norm": 1.8569291428266264, "learning_rate": 2.02e-05, "loss": 0.2872, "step": 405 }, { "epoch": 0.6454689984101749, "grad_norm": 1.274493500320388, "learning_rate": 2.025e-05, "loss": 0.2158, "step": 406 }, { "epoch": 0.6470588235294118, "grad_norm": 1.3539976910909095, "learning_rate": 2.03e-05, "loss": 0.2033, "step": 407 }, { "epoch": 0.6486486486486487, "grad_norm": 0.8269817603609642, "learning_rate": 2.035e-05, "loss": 0.2141, "step": 408 }, { "epoch": 0.6502384737678856, "grad_norm": 1.031075241900295, "learning_rate": 2.04e-05, "loss": 0.2482, "step": 409 }, { "epoch": 0.6518282988871225, "grad_norm": 0.7367858246783379, "learning_rate": 2.045e-05, "loss": 0.1929, "step": 410 }, { "epoch": 0.6534181240063593, "grad_norm": 0.6887521282117136, "learning_rate": 2.05e-05, "loss": 0.1427, "step": 411 }, { "epoch": 0.6550079491255962, "grad_norm": 0.8767579835923949, "learning_rate": 2.055e-05, "loss": 0.1997, "step": 412 }, { "epoch": 0.6565977742448331, "grad_norm": 1.0091956243235478, "learning_rate": 2.06e-05, "loss": 0.2418, "step": 413 }, { "epoch": 0.6581875993640699, "grad_norm": 0.7677707870409065, "learning_rate": 2.065e-05, "loss": 0.2606, "step": 414 }, { "epoch": 0.6597774244833068, "grad_norm": 1.5062432333715783, "learning_rate": 2.07e-05, "loss": 0.2374, "step": 415 }, { "epoch": 0.6613672496025437, "grad_norm": 2.073150985730754, "learning_rate": 2.075e-05, "loss": 0.3889, "step": 416 }, { "epoch": 0.6629570747217806, "grad_norm": 1.234404517563459, "learning_rate": 2.08e-05, "loss": 0.3124, "step": 417 }, { "epoch": 0.6645468998410174, "grad_norm": 1.083511606848498, "learning_rate": 2.085e-05, "loss": 0.2402, "step": 418 }, { "epoch": 0.6661367249602543, "grad_norm": 1.3217278958533336, "learning_rate": 2.09e-05, "loss": 0.2674, "step": 419 }, { "epoch": 0.6677265500794912, "grad_norm": 0.9075566625540784, "learning_rate": 2.095e-05, "loss": 0.2569, "step": 420 }, { "epoch": 0.6693163751987281, "grad_norm": 1.5176093200891982, "learning_rate": 2.1e-05, "loss": 0.2696, "step": 421 }, { "epoch": 0.670906200317965, "grad_norm": 1.311761609888292, "learning_rate": 2.105e-05, "loss": 0.2494, "step": 422 }, { "epoch": 0.6724960254372019, "grad_norm": 0.9197246958889534, "learning_rate": 2.11e-05, "loss": 0.1942, "step": 423 }, { "epoch": 0.6740858505564388, "grad_norm": 0.836455602197686, "learning_rate": 2.115e-05, "loss": 0.1871, "step": 424 }, { "epoch": 0.6756756756756757, "grad_norm": 1.92224761315951, "learning_rate": 2.12e-05, "loss": 0.3251, "step": 425 }, { "epoch": 0.6772655007949125, "grad_norm": 0.8093418655426429, "learning_rate": 2.125e-05, "loss": 0.1974, "step": 426 }, { "epoch": 0.6788553259141494, "grad_norm": 1.3113720693662, "learning_rate": 2.13e-05, "loss": 0.2304, "step": 427 }, { "epoch": 0.6804451510333863, "grad_norm": 1.8710707649842238, "learning_rate": 2.135e-05, "loss": 0.3286, "step": 428 }, { "epoch": 0.6820349761526232, "grad_norm": 1.3758565195440136, "learning_rate": 2.1400000000000002e-05, "loss": 0.1867, "step": 429 }, { "epoch": 0.6836248012718601, "grad_norm": 2.4907257806326553, "learning_rate": 2.145e-05, "loss": 0.2354, "step": 430 }, { "epoch": 0.685214626391097, "grad_norm": 1.2803557904074059, "learning_rate": 2.15e-05, "loss": 0.2018, "step": 431 }, { "epoch": 0.6868044515103339, "grad_norm": 2.2491952182977513, "learning_rate": 2.1550000000000002e-05, "loss": 0.2248, "step": 432 }, { "epoch": 0.6883942766295708, "grad_norm": 2.4878816277607294, "learning_rate": 2.16e-05, "loss": 0.255, "step": 433 }, { "epoch": 0.6899841017488076, "grad_norm": 2.3734190295674593, "learning_rate": 2.165e-05, "loss": 0.2783, "step": 434 }, { "epoch": 0.6915739268680445, "grad_norm": 2.6037241345595747, "learning_rate": 2.1700000000000002e-05, "loss": 0.3913, "step": 435 }, { "epoch": 0.6931637519872814, "grad_norm": 1.0438598039487368, "learning_rate": 2.175e-05, "loss": 0.1921, "step": 436 }, { "epoch": 0.6947535771065183, "grad_norm": 2.4971906438784846, "learning_rate": 2.18e-05, "loss": 0.2851, "step": 437 }, { "epoch": 0.6963434022257552, "grad_norm": 15.17818027109146, "learning_rate": 2.1850000000000003e-05, "loss": 34.6932, "step": 438 }, { "epoch": 0.6979332273449921, "grad_norm": 2.0774003977892566, "learning_rate": 2.19e-05, "loss": 0.2929, "step": 439 }, { "epoch": 0.699523052464229, "grad_norm": 1.5629109099576126, "learning_rate": 2.195e-05, "loss": 0.3082, "step": 440 }, { "epoch": 0.7011128775834659, "grad_norm": 1.7664022102585626, "learning_rate": 2.2e-05, "loss": 0.2367, "step": 441 }, { "epoch": 0.7027027027027027, "grad_norm": 1.727077385431117, "learning_rate": 2.205e-05, "loss": 0.2285, "step": 442 }, { "epoch": 0.7042925278219396, "grad_norm": 3.6624333276020735, "learning_rate": 2.2100000000000002e-05, "loss": 0.2804, "step": 443 }, { "epoch": 0.7058823529411765, "grad_norm": 1.793650077807731, "learning_rate": 2.215e-05, "loss": 0.2906, "step": 444 }, { "epoch": 0.7074721780604134, "grad_norm": 1.226351259706569, "learning_rate": 2.22e-05, "loss": 0.2709, "step": 445 }, { "epoch": 0.7090620031796503, "grad_norm": 1.020389687996493, "learning_rate": 2.2250000000000002e-05, "loss": 0.2119, "step": 446 }, { "epoch": 0.7106518282988871, "grad_norm": 1.4107150443955376, "learning_rate": 2.23e-05, "loss": 0.2613, "step": 447 }, { "epoch": 0.712241653418124, "grad_norm": 1.919350223023952, "learning_rate": 2.235e-05, "loss": 0.267, "step": 448 }, { "epoch": 0.7138314785373608, "grad_norm": 1.5806369448612663, "learning_rate": 2.2400000000000002e-05, "loss": 0.2502, "step": 449 }, { "epoch": 0.7154213036565977, "grad_norm": 1.7989949820303464, "learning_rate": 2.245e-05, "loss": 0.2203, "step": 450 }, { "epoch": 0.7170111287758346, "grad_norm": 1.700133875939382, "learning_rate": 2.25e-05, "loss": 0.1961, "step": 451 }, { "epoch": 0.7186009538950715, "grad_norm": 1.560158790551767, "learning_rate": 2.2550000000000003e-05, "loss": 0.2562, "step": 452 }, { "epoch": 0.7201907790143084, "grad_norm": 1.49759384159501, "learning_rate": 2.26e-05, "loss": 0.1815, "step": 453 }, { "epoch": 0.7217806041335453, "grad_norm": 1.5272038838472888, "learning_rate": 2.265e-05, "loss": 0.3082, "step": 454 }, { "epoch": 0.7233704292527822, "grad_norm": 1.4654228423980578, "learning_rate": 2.2700000000000003e-05, "loss": 0.2525, "step": 455 }, { "epoch": 0.724960254372019, "grad_norm": 2.6419267054939395, "learning_rate": 2.275e-05, "loss": 0.2989, "step": 456 }, { "epoch": 0.7265500794912559, "grad_norm": 1.8752035664498794, "learning_rate": 2.2800000000000002e-05, "loss": 0.2538, "step": 457 }, { "epoch": 0.7281399046104928, "grad_norm": 2.264584820258348, "learning_rate": 2.2850000000000003e-05, "loss": 0.805, "step": 458 }, { "epoch": 0.7297297297297297, "grad_norm": 1.5925093357045932, "learning_rate": 2.29e-05, "loss": 0.2073, "step": 459 }, { "epoch": 0.7313195548489666, "grad_norm": 0.9148269917229194, "learning_rate": 2.2950000000000002e-05, "loss": 0.2443, "step": 460 }, { "epoch": 0.7329093799682035, "grad_norm": 1.6769542796185932, "learning_rate": 2.3000000000000003e-05, "loss": 0.3197, "step": 461 }, { "epoch": 0.7344992050874404, "grad_norm": 1.0108113358566797, "learning_rate": 2.305e-05, "loss": 0.1797, "step": 462 }, { "epoch": 0.7360890302066773, "grad_norm": 0.83390898111003, "learning_rate": 2.3100000000000002e-05, "loss": 0.1997, "step": 463 }, { "epoch": 0.7376788553259142, "grad_norm": 1.9984904283104243, "learning_rate": 2.315e-05, "loss": 0.2419, "step": 464 }, { "epoch": 0.739268680445151, "grad_norm": 1.0798815271392466, "learning_rate": 2.32e-05, "loss": 0.2532, "step": 465 }, { "epoch": 0.7408585055643879, "grad_norm": 0.8270367502099909, "learning_rate": 2.3250000000000003e-05, "loss": 0.1752, "step": 466 }, { "epoch": 0.7424483306836248, "grad_norm": 0.9491758147452222, "learning_rate": 2.33e-05, "loss": 0.2039, "step": 467 }, { "epoch": 0.7440381558028617, "grad_norm": 4.52927124181405, "learning_rate": 2.3350000000000002e-05, "loss": 0.3417, "step": 468 }, { "epoch": 0.7456279809220986, "grad_norm": 0.9449924613372288, "learning_rate": 2.3400000000000003e-05, "loss": 0.2111, "step": 469 }, { "epoch": 0.7472178060413355, "grad_norm": 1.25915235032024, "learning_rate": 2.345e-05, "loss": 0.3406, "step": 470 }, { "epoch": 0.7488076311605724, "grad_norm": 1.9106270981608868, "learning_rate": 2.3500000000000002e-05, "loss": 0.3402, "step": 471 }, { "epoch": 0.7503974562798092, "grad_norm": 2.0228229077786257, "learning_rate": 2.3550000000000003e-05, "loss": 0.2905, "step": 472 }, { "epoch": 0.7519872813990461, "grad_norm": 1.5095326494047443, "learning_rate": 2.3599999999999998e-05, "loss": 0.2425, "step": 473 }, { "epoch": 0.753577106518283, "grad_norm": 1.320784148146851, "learning_rate": 2.365e-05, "loss": 0.234, "step": 474 }, { "epoch": 0.7551669316375199, "grad_norm": 1.2855968910201843, "learning_rate": 2.37e-05, "loss": 0.2842, "step": 475 }, { "epoch": 0.7567567567567568, "grad_norm": 1.4494956264699426, "learning_rate": 2.3749999999999998e-05, "loss": 0.2664, "step": 476 }, { "epoch": 0.7583465818759937, "grad_norm": 0.8222991382675661, "learning_rate": 2.38e-05, "loss": 0.1855, "step": 477 }, { "epoch": 0.7599364069952306, "grad_norm": 0.9844096899563934, "learning_rate": 2.385e-05, "loss": 0.2505, "step": 478 }, { "epoch": 0.7615262321144675, "grad_norm": 1.329242141767537, "learning_rate": 2.3899999999999998e-05, "loss": 0.3044, "step": 479 }, { "epoch": 0.7631160572337043, "grad_norm": 1.9242624036774687, "learning_rate": 2.395e-05, "loss": 0.7914, "step": 480 }, { "epoch": 0.7647058823529411, "grad_norm": 1.6340339463147588, "learning_rate": 2.4e-05, "loss": 0.2829, "step": 481 }, { "epoch": 0.766295707472178, "grad_norm": 1.142024448703623, "learning_rate": 2.405e-05, "loss": 0.2375, "step": 482 }, { "epoch": 0.7678855325914149, "grad_norm": 0.9759993785459193, "learning_rate": 2.41e-05, "loss": 0.1668, "step": 483 }, { "epoch": 0.7694753577106518, "grad_norm": 3.1130990271113355, "learning_rate": 2.415e-05, "loss": 0.2815, "step": 484 }, { "epoch": 0.7710651828298887, "grad_norm": 3.6609922525153094, "learning_rate": 2.42e-05, "loss": 0.2728, "step": 485 }, { "epoch": 0.7726550079491256, "grad_norm": 0.9154339253360586, "learning_rate": 2.425e-05, "loss": 0.2652, "step": 486 }, { "epoch": 0.7742448330683624, "grad_norm": 0.8585997435988617, "learning_rate": 2.43e-05, "loss": 0.2496, "step": 487 }, { "epoch": 0.7758346581875993, "grad_norm": 1.3528869649005333, "learning_rate": 2.435e-05, "loss": 0.2498, "step": 488 }, { "epoch": 0.7774244833068362, "grad_norm": 3.9479601974577845, "learning_rate": 2.44e-05, "loss": 0.2524, "step": 489 }, { "epoch": 0.7790143084260731, "grad_norm": 1.6569264345352395, "learning_rate": 2.4449999999999998e-05, "loss": 0.3827, "step": 490 }, { "epoch": 0.78060413354531, "grad_norm": 1.0531724378223204, "learning_rate": 2.45e-05, "loss": 0.2136, "step": 491 }, { "epoch": 0.7821939586645469, "grad_norm": 18.663037057927276, "learning_rate": 2.455e-05, "loss": 34.2023, "step": 492 }, { "epoch": 0.7837837837837838, "grad_norm": 1.920925270347047, "learning_rate": 2.4599999999999998e-05, "loss": 0.2954, "step": 493 }, { "epoch": 0.7853736089030207, "grad_norm": 1.8949845627030673, "learning_rate": 2.465e-05, "loss": 0.2053, "step": 494 }, { "epoch": 0.7869634340222575, "grad_norm": 2.195073450963552, "learning_rate": 2.47e-05, "loss": 0.2328, "step": 495 }, { "epoch": 0.7885532591414944, "grad_norm": 1.6320673046451244, "learning_rate": 2.475e-05, "loss": 0.232, "step": 496 }, { "epoch": 0.7901430842607313, "grad_norm": 0.9253733786099647, "learning_rate": 2.48e-05, "loss": 0.2053, "step": 497 }, { "epoch": 0.7917329093799682, "grad_norm": 1.2368538293556715, "learning_rate": 2.485e-05, "loss": 0.2329, "step": 498 }, { "epoch": 0.7933227344992051, "grad_norm": 1.1677231112599487, "learning_rate": 2.49e-05, "loss": 0.2244, "step": 499 }, { "epoch": 0.794912559618442, "grad_norm": 1.7771671200799681, "learning_rate": 2.495e-05, "loss": 0.246, "step": 500 }, { "epoch": 0.7965023847376789, "grad_norm": 1.5357997030076218, "learning_rate": 2.5e-05, "loss": 0.253, "step": 501 }, { "epoch": 0.7980922098569158, "grad_norm": 2.0534501382468378, "learning_rate": 2.505e-05, "loss": 0.198, "step": 502 }, { "epoch": 0.7996820349761526, "grad_norm": 1.2319068579530479, "learning_rate": 2.51e-05, "loss": 0.2458, "step": 503 }, { "epoch": 0.8012718600953895, "grad_norm": 0.8732640370197168, "learning_rate": 2.515e-05, "loss": 0.2118, "step": 504 }, { "epoch": 0.8028616852146264, "grad_norm": 4.41176534113802, "learning_rate": 2.52e-05, "loss": 0.4236, "step": 505 }, { "epoch": 0.8044515103338633, "grad_norm": 0.7180629972653341, "learning_rate": 2.525e-05, "loss": 0.1931, "step": 506 }, { "epoch": 0.8060413354531002, "grad_norm": 0.6347767273532949, "learning_rate": 2.5300000000000002e-05, "loss": 0.1859, "step": 507 }, { "epoch": 0.8076311605723371, "grad_norm": 0.9791426745533244, "learning_rate": 2.535e-05, "loss": 0.2799, "step": 508 }, { "epoch": 0.809220985691574, "grad_norm": 31.58151475070209, "learning_rate": 2.54e-05, "loss": 33.3643, "step": 509 }, { "epoch": 0.8108108108108109, "grad_norm": 0.935794606298465, "learning_rate": 2.5450000000000002e-05, "loss": 0.1858, "step": 510 }, { "epoch": 0.8124006359300477, "grad_norm": 4.784009392534647, "learning_rate": 2.55e-05, "loss": 0.3739, "step": 511 }, { "epoch": 0.8139904610492846, "grad_norm": 6.019860716766144, "learning_rate": 2.555e-05, "loss": 0.3004, "step": 512 }, { "epoch": 0.8155802861685215, "grad_norm": 1.1532182944468512, "learning_rate": 2.5600000000000002e-05, "loss": 0.2125, "step": 513 }, { "epoch": 0.8171701112877583, "grad_norm": 0.9353130479866597, "learning_rate": 2.565e-05, "loss": 0.2075, "step": 514 }, { "epoch": 0.8187599364069952, "grad_norm": 2.2324430700755276, "learning_rate": 2.57e-05, "loss": 0.2162, "step": 515 }, { "epoch": 0.8203497615262321, "grad_norm": 0.9308155590992341, "learning_rate": 2.575e-05, "loss": 0.2124, "step": 516 }, { "epoch": 0.821939586645469, "grad_norm": 0.9740386688178756, "learning_rate": 2.58e-05, "loss": 0.2846, "step": 517 }, { "epoch": 0.8235294117647058, "grad_norm": 1.1504004035932034, "learning_rate": 2.585e-05, "loss": 0.2811, "step": 518 }, { "epoch": 0.8251192368839427, "grad_norm": 1.8083635934870836, "learning_rate": 2.59e-05, "loss": 0.2674, "step": 519 }, { "epoch": 0.8267090620031796, "grad_norm": 2.955932180890619, "learning_rate": 2.595e-05, "loss": 0.33, "step": 520 }, { "epoch": 0.8282988871224165, "grad_norm": 2.5779379390702437, "learning_rate": 2.6000000000000002e-05, "loss": 0.3311, "step": 521 }, { "epoch": 0.8298887122416534, "grad_norm": 1.1818559752883053, "learning_rate": 2.605e-05, "loss": 0.3906, "step": 522 }, { "epoch": 0.8314785373608903, "grad_norm": 1.00316593008836, "learning_rate": 2.61e-05, "loss": 0.241, "step": 523 }, { "epoch": 0.8330683624801272, "grad_norm": 1.8935949064822435, "learning_rate": 2.6150000000000002e-05, "loss": 0.2752, "step": 524 }, { "epoch": 0.834658187599364, "grad_norm": 5.06112572064763, "learning_rate": 2.62e-05, "loss": 0.4075, "step": 525 }, { "epoch": 0.8362480127186009, "grad_norm": 1.131170852932, "learning_rate": 2.625e-05, "loss": 0.2779, "step": 526 }, { "epoch": 0.8378378378378378, "grad_norm": 1.5262419720788403, "learning_rate": 2.6300000000000002e-05, "loss": 0.2798, "step": 527 }, { "epoch": 0.8394276629570747, "grad_norm": 1.2109422840577981, "learning_rate": 2.635e-05, "loss": 0.2831, "step": 528 }, { "epoch": 0.8410174880763116, "grad_norm": 1.180551181034208, "learning_rate": 2.64e-05, "loss": 0.2676, "step": 529 }, { "epoch": 0.8426073131955485, "grad_norm": 1.5709962624260545, "learning_rate": 2.6450000000000003e-05, "loss": 0.2174, "step": 530 }, { "epoch": 0.8441971383147854, "grad_norm": 1.0202447362995952, "learning_rate": 2.65e-05, "loss": 0.1891, "step": 531 }, { "epoch": 0.8457869634340223, "grad_norm": 2.2267126167782503, "learning_rate": 2.655e-05, "loss": 0.2788, "step": 532 }, { "epoch": 0.8473767885532592, "grad_norm": 2.968170962753858, "learning_rate": 2.6600000000000003e-05, "loss": 0.364, "step": 533 }, { "epoch": 0.848966613672496, "grad_norm": 1.6322900689086306, "learning_rate": 2.665e-05, "loss": 0.2787, "step": 534 }, { "epoch": 0.8505564387917329, "grad_norm": 1.7253332488025737, "learning_rate": 2.6700000000000002e-05, "loss": 0.322, "step": 535 }, { "epoch": 0.8521462639109698, "grad_norm": 1.7820066248592579, "learning_rate": 2.6750000000000003e-05, "loss": 0.3073, "step": 536 }, { "epoch": 0.8537360890302067, "grad_norm": 1.5822247601093904, "learning_rate": 2.68e-05, "loss": 0.2782, "step": 537 }, { "epoch": 0.8553259141494436, "grad_norm": 0.8342169926862201, "learning_rate": 2.6850000000000002e-05, "loss": 0.2006, "step": 538 }, { "epoch": 0.8569157392686805, "grad_norm": 0.734823269579416, "learning_rate": 2.69e-05, "loss": 0.1563, "step": 539 }, { "epoch": 0.8585055643879174, "grad_norm": 0.9276752031141264, "learning_rate": 2.695e-05, "loss": 0.2182, "step": 540 }, { "epoch": 0.8600953895071543, "grad_norm": 0.8092208365303625, "learning_rate": 2.7000000000000002e-05, "loss": 0.2417, "step": 541 }, { "epoch": 0.8616852146263911, "grad_norm": 3.66079875460622, "learning_rate": 2.705e-05, "loss": 0.4777, "step": 542 }, { "epoch": 0.863275039745628, "grad_norm": 2.0012817252751636, "learning_rate": 2.71e-05, "loss": 0.2564, "step": 543 }, { "epoch": 0.8648648648648649, "grad_norm": 0.9741312295304617, "learning_rate": 2.7150000000000003e-05, "loss": 0.2345, "step": 544 }, { "epoch": 0.8664546899841018, "grad_norm": 2.811529000676048, "learning_rate": 2.72e-05, "loss": 0.3443, "step": 545 }, { "epoch": 0.8680445151033387, "grad_norm": 1.2524045283902445, "learning_rate": 2.725e-05, "loss": 0.2882, "step": 546 }, { "epoch": 0.8696343402225755, "grad_norm": 0.8214317656721801, "learning_rate": 2.7300000000000003e-05, "loss": 0.2476, "step": 547 }, { "epoch": 0.8712241653418124, "grad_norm": 3.934379177825496, "learning_rate": 2.735e-05, "loss": 0.2921, "step": 548 }, { "epoch": 0.8728139904610492, "grad_norm": 3.341373621978701, "learning_rate": 2.7400000000000002e-05, "loss": 1.1221, "step": 549 }, { "epoch": 0.8744038155802861, "grad_norm": 1.715300532463595, "learning_rate": 2.7450000000000003e-05, "loss": 0.2264, "step": 550 }, { "epoch": 0.875993640699523, "grad_norm": 1.7980672728192069, "learning_rate": 2.75e-05, "loss": 0.2656, "step": 551 }, { "epoch": 0.8775834658187599, "grad_norm": 2.222185061776179, "learning_rate": 2.7550000000000002e-05, "loss": 0.2648, "step": 552 }, { "epoch": 0.8791732909379968, "grad_norm": 1.286869998742626, "learning_rate": 2.7600000000000003e-05, "loss": 0.2524, "step": 553 }, { "epoch": 0.8807631160572337, "grad_norm": 1.7807532005365678, "learning_rate": 2.765e-05, "loss": 0.2686, "step": 554 }, { "epoch": 0.8823529411764706, "grad_norm": 2.2800136053495836, "learning_rate": 2.7700000000000002e-05, "loss": 0.775, "step": 555 }, { "epoch": 0.8839427662957074, "grad_norm": 1.676459694140018, "learning_rate": 2.7750000000000004e-05, "loss": 0.261, "step": 556 }, { "epoch": 0.8855325914149443, "grad_norm": 3.154493706640757, "learning_rate": 2.78e-05, "loss": 0.3026, "step": 557 }, { "epoch": 0.8871224165341812, "grad_norm": 1.5682506010155899, "learning_rate": 2.7850000000000003e-05, "loss": 0.2674, "step": 558 }, { "epoch": 0.8887122416534181, "grad_norm": 1.4730844725055516, "learning_rate": 2.79e-05, "loss": 0.2351, "step": 559 }, { "epoch": 0.890302066772655, "grad_norm": 1.2630208967180552, "learning_rate": 2.795e-05, "loss": 0.2459, "step": 560 }, { "epoch": 0.8918918918918919, "grad_norm": 1.9830459201554855, "learning_rate": 2.8e-05, "loss": 0.2424, "step": 561 }, { "epoch": 0.8934817170111288, "grad_norm": 1.0880761476341176, "learning_rate": 2.805e-05, "loss": 0.2227, "step": 562 }, { "epoch": 0.8950715421303657, "grad_norm": 0.9161443357295311, "learning_rate": 2.81e-05, "loss": 0.1574, "step": 563 }, { "epoch": 0.8966613672496025, "grad_norm": 1.661947821577921, "learning_rate": 2.815e-05, "loss": 0.2506, "step": 564 }, { "epoch": 0.8982511923688394, "grad_norm": 1.1763465802154114, "learning_rate": 2.8199999999999998e-05, "loss": 0.2567, "step": 565 }, { "epoch": 0.8998410174880763, "grad_norm": 0.8644306861576502, "learning_rate": 2.825e-05, "loss": 0.1474, "step": 566 }, { "epoch": 0.9014308426073132, "grad_norm": 0.9554644903618156, "learning_rate": 2.83e-05, "loss": 0.2543, "step": 567 }, { "epoch": 0.9030206677265501, "grad_norm": 1.2130384406634964, "learning_rate": 2.8349999999999998e-05, "loss": 0.2093, "step": 568 }, { "epoch": 0.904610492845787, "grad_norm": 1.1611308000289235, "learning_rate": 2.84e-05, "loss": 0.2628, "step": 569 }, { "epoch": 0.9062003179650239, "grad_norm": 1.0419132902835948, "learning_rate": 2.845e-05, "loss": 0.1982, "step": 570 }, { "epoch": 0.9077901430842608, "grad_norm": 1.2281229501219237, "learning_rate": 2.8499999999999998e-05, "loss": 0.2227, "step": 571 }, { "epoch": 0.9093799682034976, "grad_norm": 0.8083882338957968, "learning_rate": 2.855e-05, "loss": 0.1992, "step": 572 }, { "epoch": 0.9109697933227345, "grad_norm": 0.9555461701338538, "learning_rate": 2.86e-05, "loss": 0.2388, "step": 573 }, { "epoch": 0.9125596184419714, "grad_norm": 1.606807656501827, "learning_rate": 2.865e-05, "loss": 0.2244, "step": 574 }, { "epoch": 0.9141494435612083, "grad_norm": 0.6549068893545189, "learning_rate": 2.87e-05, "loss": 0.2069, "step": 575 }, { "epoch": 0.9157392686804452, "grad_norm": 2.43899204545746, "learning_rate": 2.875e-05, "loss": 0.1898, "step": 576 }, { "epoch": 0.9173290937996821, "grad_norm": 0.9315903411957124, "learning_rate": 2.88e-05, "loss": 0.2881, "step": 577 }, { "epoch": 0.918918918918919, "grad_norm": 1.0715576730901948, "learning_rate": 2.885e-05, "loss": 0.2212, "step": 578 }, { "epoch": 0.9205087440381559, "grad_norm": 1.0667589805163178, "learning_rate": 2.89e-05, "loss": 0.2369, "step": 579 }, { "epoch": 0.9220985691573926, "grad_norm": 13.20690071658688, "learning_rate": 2.895e-05, "loss": 33.2293, "step": 580 }, { "epoch": 0.9236883942766295, "grad_norm": 0.9963529355605913, "learning_rate": 2.9e-05, "loss": 0.2242, "step": 581 }, { "epoch": 0.9252782193958664, "grad_norm": 0.8927584997248048, "learning_rate": 2.905e-05, "loss": 0.233, "step": 582 }, { "epoch": 0.9268680445151033, "grad_norm": 1.2739715347103169, "learning_rate": 2.91e-05, "loss": 0.2052, "step": 583 }, { "epoch": 0.9284578696343402, "grad_norm": 0.8169976480159199, "learning_rate": 2.915e-05, "loss": 0.2306, "step": 584 }, { "epoch": 0.9300476947535771, "grad_norm": 1.158265859211442, "learning_rate": 2.92e-05, "loss": 0.2241, "step": 585 }, { "epoch": 0.931637519872814, "grad_norm": 1.155667591594751, "learning_rate": 2.925e-05, "loss": 0.2756, "step": 586 }, { "epoch": 0.9332273449920508, "grad_norm": 1.2060672193737954, "learning_rate": 2.93e-05, "loss": 0.2508, "step": 587 }, { "epoch": 0.9348171701112877, "grad_norm": 1.1307378745583698, "learning_rate": 2.9350000000000002e-05, "loss": 0.2242, "step": 588 }, { "epoch": 0.9364069952305246, "grad_norm": 1.138250046228856, "learning_rate": 2.94e-05, "loss": 0.2727, "step": 589 }, { "epoch": 0.9379968203497615, "grad_norm": 1.1337602907713822, "learning_rate": 2.945e-05, "loss": 0.2292, "step": 590 }, { "epoch": 0.9395866454689984, "grad_norm": 0.9103409950201136, "learning_rate": 2.95e-05, "loss": 0.2128, "step": 591 }, { "epoch": 0.9411764705882353, "grad_norm": 1.0614237421911457, "learning_rate": 2.955e-05, "loss": 0.2175, "step": 592 }, { "epoch": 0.9427662957074722, "grad_norm": 1.2753047562449893, "learning_rate": 2.96e-05, "loss": 0.2246, "step": 593 }, { "epoch": 0.9443561208267091, "grad_norm": 0.7703276924313528, "learning_rate": 2.965e-05, "loss": 0.2425, "step": 594 }, { "epoch": 0.9459459459459459, "grad_norm": 1.191085729051244, "learning_rate": 2.97e-05, "loss": 0.2098, "step": 595 }, { "epoch": 0.9475357710651828, "grad_norm": 1.4335055767357534, "learning_rate": 2.975e-05, "loss": 0.2657, "step": 596 }, { "epoch": 0.9491255961844197, "grad_norm": 2.5902824163470273, "learning_rate": 2.98e-05, "loss": 0.3129, "step": 597 }, { "epoch": 0.9507154213036566, "grad_norm": 0.7592792164328412, "learning_rate": 2.985e-05, "loss": 0.1866, "step": 598 }, { "epoch": 0.9523052464228935, "grad_norm": 1.2323263878273853, "learning_rate": 2.9900000000000002e-05, "loss": 0.219, "step": 599 }, { "epoch": 0.9538950715421304, "grad_norm": 15.906398995325986, "learning_rate": 2.995e-05, "loss": 34.064, "step": 600 }, { "epoch": 0.9554848966613673, "grad_norm": 1.7152370438901547, "learning_rate": 3e-05, "loss": 0.2537, "step": 601 }, { "epoch": 0.9570747217806042, "grad_norm": 0.8289995835163445, "learning_rate": 3.000000169231895e-05, "loss": 0.1988, "step": 602 }, { "epoch": 0.958664546899841, "grad_norm": 1.0540008720898175, "learning_rate": 3.0000006769275235e-05, "loss": 0.2197, "step": 603 }, { "epoch": 0.9602543720190779, "grad_norm": 0.7089689894602045, "learning_rate": 3.000001523086713e-05, "loss": 0.2212, "step": 604 }, { "epoch": 0.9618441971383148, "grad_norm": 0.9315549187227836, "learning_rate": 3.0000027077091763e-05, "loss": 0.2128, "step": 605 }, { "epoch": 0.9634340222575517, "grad_norm": 1.0733256441562586, "learning_rate": 3.0000042307945136e-05, "loss": 0.2399, "step": 606 }, { "epoch": 0.9650238473767886, "grad_norm": 0.65838321699355, "learning_rate": 3.0000060923422093e-05, "loss": 0.2058, "step": 607 }, { "epoch": 0.9666136724960255, "grad_norm": 1.176262274339728, "learning_rate": 3.0000082923516334e-05, "loss": 0.2547, "step": 608 }, { "epoch": 0.9682034976152624, "grad_norm": 1.5636274083086186, "learning_rate": 3.0000108308220412e-05, "loss": 0.597, "step": 609 }, { "epoch": 0.9697933227344993, "grad_norm": 1.1129054524387443, "learning_rate": 3.000013707752573e-05, "loss": 0.2609, "step": 610 }, { "epoch": 0.9713831478537361, "grad_norm": 0.8584432539009351, "learning_rate": 3.0000169231422557e-05, "loss": 0.2409, "step": 611 }, { "epoch": 0.972972972972973, "grad_norm": 1.3067867415208732, "learning_rate": 3.000020476990001e-05, "loss": 0.2282, "step": 612 }, { "epoch": 0.9745627980922098, "grad_norm": 0.8080146441665003, "learning_rate": 3.000024369294605e-05, "loss": 0.2043, "step": 613 }, { "epoch": 0.9761526232114467, "grad_norm": 0.8995723386100994, "learning_rate": 3.000028600054751e-05, "loss": 0.2024, "step": 614 }, { "epoch": 0.9777424483306836, "grad_norm": 1.049692608457188, "learning_rate": 3.000033169269009e-05, "loss": 0.2015, "step": 615 }, { "epoch": 0.9793322734499205, "grad_norm": 0.8227808191012699, "learning_rate": 3.0000380769358285e-05, "loss": 0.207, "step": 616 }, { "epoch": 0.9809220985691574, "grad_norm": 0.7559858101796544, "learning_rate": 3.0000433230535512e-05, "loss": 0.1947, "step": 617 }, { "epoch": 0.9825119236883942, "grad_norm": 1.3469294454976612, "learning_rate": 3.0000489076204015e-05, "loss": 0.3511, "step": 618 }, { "epoch": 0.9841017488076311, "grad_norm": 6.575492156700965, "learning_rate": 3.0000548306344874e-05, "loss": 9.4442, "step": 619 }, { "epoch": 0.985691573926868, "grad_norm": 1.6443233762862122, "learning_rate": 3.0000610920938065e-05, "loss": 0.2584, "step": 620 }, { "epoch": 0.9872813990461049, "grad_norm": 1.9143339205402958, "learning_rate": 3.000067691996238e-05, "loss": 0.2602, "step": 621 }, { "epoch": 0.9888712241653418, "grad_norm": 1.1548804450365433, "learning_rate": 3.0000746303395484e-05, "loss": 0.2257, "step": 622 }, { "epoch": 0.9904610492845787, "grad_norm": 0.8345467468980763, "learning_rate": 3.0000819071213893e-05, "loss": 0.2484, "step": 623 }, { "epoch": 0.9920508744038156, "grad_norm": 1.4228214355831577, "learning_rate": 3.0000895223392975e-05, "loss": 0.2067, "step": 624 }, { "epoch": 0.9936406995230525, "grad_norm": 1.0355629065943222, "learning_rate": 3.0000974759906957e-05, "loss": 0.1806, "step": 625 }, { "epoch": 0.9952305246422893, "grad_norm": 3.156797680044191, "learning_rate": 3.000105768072892e-05, "loss": 0.3152, "step": 626 }, { "epoch": 0.9968203497615262, "grad_norm": 1.0999265133106408, "learning_rate": 3.0001143985830813e-05, "loss": 0.2226, "step": 627 }, { "epoch": 0.9984101748807631, "grad_norm": 1.5296788184001642, "learning_rate": 3.0001233675183396e-05, "loss": 0.2072, "step": 628 }, { "epoch": 1.0, "grad_norm": 0.7712974215796701, "learning_rate": 3.0001326748756327e-05, "loss": 0.2053, "step": 629 }, { "epoch": 1.0015898251192368, "grad_norm": 2.995340358807991, "learning_rate": 3.0001423206518105e-05, "loss": 0.2754, "step": 630 }, { "epoch": 1.0031796502384738, "grad_norm": 3.0538351724870103, "learning_rate": 3.0001523048436092e-05, "loss": 0.3338, "step": 631 }, { "epoch": 1.0047694753577106, "grad_norm": 0.7369463119843839, "learning_rate": 3.000162627447647e-05, "loss": 0.2328, "step": 632 }, { "epoch": 1.0063593004769475, "grad_norm": 1.3135358500770693, "learning_rate": 3.000173288460432e-05, "loss": 0.2649, "step": 633 }, { "epoch": 1.0079491255961843, "grad_norm": 0.806784178198853, "learning_rate": 3.0001842878783563e-05, "loss": 0.1796, "step": 634 }, { "epoch": 1.0095389507154213, "grad_norm": 1.0015602426133656, "learning_rate": 3.0001956256976943e-05, "loss": 0.3043, "step": 635 }, { "epoch": 1.011128775834658, "grad_norm": 0.918176471529433, "learning_rate": 3.0002073019146117e-05, "loss": 0.157, "step": 636 }, { "epoch": 1.012718600953895, "grad_norm": 0.7689770476952236, "learning_rate": 3.000219316525154e-05, "loss": 0.2029, "step": 637 }, { "epoch": 1.0143084260731319, "grad_norm": 2.220612026181284, "learning_rate": 3.000231669525257e-05, "loss": 0.3223, "step": 638 }, { "epoch": 1.0158982511923689, "grad_norm": 1.3289670267634566, "learning_rate": 3.0002443609107383e-05, "loss": 0.2269, "step": 639 }, { "epoch": 1.0174880763116056, "grad_norm": 2.279024567850086, "learning_rate": 3.000257390677301e-05, "loss": 0.2806, "step": 640 }, { "epoch": 1.0190779014308426, "grad_norm": 0.9680651735383388, "learning_rate": 3.000270758820539e-05, "loss": 0.198, "step": 641 }, { "epoch": 1.0206677265500794, "grad_norm": 1.2366966363728096, "learning_rate": 3.000284465335923e-05, "loss": 0.2, "step": 642 }, { "epoch": 1.0222575516693164, "grad_norm": 1.366837553320901, "learning_rate": 3.000298510218817e-05, "loss": 0.2102, "step": 643 }, { "epoch": 1.0238473767885532, "grad_norm": 1.2072891991648633, "learning_rate": 3.0003128934644662e-05, "loss": 0.2176, "step": 644 }, { "epoch": 1.0254372019077902, "grad_norm": 0.9109499051253581, "learning_rate": 3.0003276150680025e-05, "loss": 0.2015, "step": 645 }, { "epoch": 1.027027027027027, "grad_norm": 0.7797646968043357, "learning_rate": 3.000342675024443e-05, "loss": 0.2053, "step": 646 }, { "epoch": 1.028616852146264, "grad_norm": 1.0289395807668062, "learning_rate": 3.0003580733286906e-05, "loss": 0.2458, "step": 647 }, { "epoch": 1.0302066772655007, "grad_norm": 2.9413031174873745, "learning_rate": 3.0003738099755337e-05, "loss": 0.3945, "step": 648 }, { "epoch": 1.0317965023847377, "grad_norm": 0.7462123213900558, "learning_rate": 3.0003898849596456e-05, "loss": 0.2368, "step": 649 }, { "epoch": 1.0333863275039745, "grad_norm": 1.0423295019165562, "learning_rate": 3.0004062982755864e-05, "loss": 0.2084, "step": 650 }, { "epoch": 1.0349761526232115, "grad_norm": 1.160870145189582, "learning_rate": 3.0004230499178e-05, "loss": 0.2278, "step": 651 }, { "epoch": 1.0365659777424483, "grad_norm": 0.6262899016298544, "learning_rate": 3.000440139880616e-05, "loss": 0.1994, "step": 652 }, { "epoch": 1.0381558028616853, "grad_norm": 1.3464194242216732, "learning_rate": 3.0004575681582512e-05, "loss": 0.2085, "step": 653 }, { "epoch": 1.039745627980922, "grad_norm": 0.8157226116311506, "learning_rate": 3.0004753347448062e-05, "loss": 0.1824, "step": 654 }, { "epoch": 1.041335453100159, "grad_norm": 0.9204293152722103, "learning_rate": 3.0004934396342685e-05, "loss": 0.2348, "step": 655 }, { "epoch": 1.0429252782193958, "grad_norm": 4.90660794098986, "learning_rate": 3.0005118828205097e-05, "loss": 0.2671, "step": 656 }, { "epoch": 1.0445151033386328, "grad_norm": 1.857005791238994, "learning_rate": 3.000530664297286e-05, "loss": 0.2608, "step": 657 }, { "epoch": 1.0461049284578696, "grad_norm": 1.1515033621134392, "learning_rate": 3.0005497840582433e-05, "loss": 0.1962, "step": 658 }, { "epoch": 1.0476947535771066, "grad_norm": 0.867912178692954, "learning_rate": 3.0005692420969074e-05, "loss": 0.23, "step": 659 }, { "epoch": 1.0492845786963434, "grad_norm": 1.4237670599326264, "learning_rate": 3.000589038406695e-05, "loss": 0.2189, "step": 660 }, { "epoch": 1.0508744038155804, "grad_norm": 1.13260620764936, "learning_rate": 3.000609172980904e-05, "loss": 0.2004, "step": 661 }, { "epoch": 1.0524642289348172, "grad_norm": 0.9774487001714098, "learning_rate": 3.0006296458127206e-05, "loss": 0.2001, "step": 662 }, { "epoch": 1.054054054054054, "grad_norm": 0.7915760572935234, "learning_rate": 3.000650456895215e-05, "loss": 0.1935, "step": 663 }, { "epoch": 1.055643879173291, "grad_norm": 1.2561264365563078, "learning_rate": 3.0006716062213444e-05, "loss": 0.2297, "step": 664 }, { "epoch": 1.0572337042925277, "grad_norm": 1.2584264794222662, "learning_rate": 3.000693093783948e-05, "loss": 0.239, "step": 665 }, { "epoch": 1.0588235294117647, "grad_norm": 1.0537006108869125, "learning_rate": 3.0007149195757554e-05, "loss": 0.2286, "step": 666 }, { "epoch": 1.0604133545310015, "grad_norm": 1.084314164760084, "learning_rate": 3.0007370835893784e-05, "loss": 0.1732, "step": 667 }, { "epoch": 1.0620031796502385, "grad_norm": 1.35823376435203, "learning_rate": 3.0007595858173162e-05, "loss": 0.5996, "step": 668 }, { "epoch": 1.0635930047694753, "grad_norm": 0.782134400966495, "learning_rate": 3.0007824262519512e-05, "loss": 0.2388, "step": 669 }, { "epoch": 1.0651828298887123, "grad_norm": 18.143302024807802, "learning_rate": 3.0008056048855544e-05, "loss": 32.3368, "step": 670 }, { "epoch": 1.066772655007949, "grad_norm": 0.824814578222989, "learning_rate": 3.0008291217102792e-05, "loss": 0.2112, "step": 671 }, { "epoch": 1.068362480127186, "grad_norm": 1.0692941873069601, "learning_rate": 3.0008529767181656e-05, "loss": 0.2171, "step": 672 }, { "epoch": 1.0699523052464228, "grad_norm": 0.8063601598888985, "learning_rate": 3.0008771699011413e-05, "loss": 0.1871, "step": 673 }, { "epoch": 1.0715421303656598, "grad_norm": 0.8745071764686374, "learning_rate": 3.0009017012510175e-05, "loss": 0.2061, "step": 674 }, { "epoch": 1.0731319554848966, "grad_norm": 1.8857782407716015, "learning_rate": 3.0009265707594907e-05, "loss": 0.6164, "step": 675 }, { "epoch": 1.0747217806041336, "grad_norm": 4.614882279340868, "learning_rate": 3.0009517784181416e-05, "loss": 0.3255, "step": 676 }, { "epoch": 1.0763116057233704, "grad_norm": 0.900318768421948, "learning_rate": 3.000977324218442e-05, "loss": 0.1904, "step": 677 }, { "epoch": 1.0779014308426074, "grad_norm": 2.9341041546455964, "learning_rate": 3.0010032081517434e-05, "loss": 0.2526, "step": 678 }, { "epoch": 1.0794912559618441, "grad_norm": 1.1049186237197868, "learning_rate": 3.0010294302092857e-05, "loss": 0.2565, "step": 679 }, { "epoch": 1.0810810810810811, "grad_norm": 0.804126214905185, "learning_rate": 3.0010559903821927e-05, "loss": 0.2088, "step": 680 }, { "epoch": 1.082670906200318, "grad_norm": 0.97641024718292, "learning_rate": 3.0010828886614757e-05, "loss": 0.2016, "step": 681 }, { "epoch": 1.084260731319555, "grad_norm": 1.1602108569842764, "learning_rate": 3.00111012503803e-05, "loss": 0.1719, "step": 682 }, { "epoch": 1.0858505564387917, "grad_norm": 0.897923127085055, "learning_rate": 3.0011376995026377e-05, "loss": 0.2062, "step": 683 }, { "epoch": 1.0874403815580287, "grad_norm": 0.7900954260590638, "learning_rate": 3.001165612045965e-05, "loss": 0.259, "step": 684 }, { "epoch": 1.0890302066772655, "grad_norm": 0.5723827780931718, "learning_rate": 3.001193862658566e-05, "loss": 0.1622, "step": 685 }, { "epoch": 1.0906200317965025, "grad_norm": 1.4438465347745348, "learning_rate": 3.001222451330877e-05, "loss": 0.2408, "step": 686 }, { "epoch": 1.0922098569157392, "grad_norm": 0.8352273066307121, "learning_rate": 3.0012513780532238e-05, "loss": 0.1961, "step": 687 }, { "epoch": 1.0937996820349762, "grad_norm": 0.5336144504295944, "learning_rate": 3.0012806428158144e-05, "loss": 0.1486, "step": 688 }, { "epoch": 1.095389507154213, "grad_norm": 0.7521578643204917, "learning_rate": 3.0013102456087433e-05, "loss": 0.2201, "step": 689 }, { "epoch": 1.09697933227345, "grad_norm": 1.0580466861864777, "learning_rate": 3.001340186421992e-05, "loss": 0.2289, "step": 690 }, { "epoch": 1.0985691573926868, "grad_norm": 0.9595923144828853, "learning_rate": 3.0013704652454258e-05, "loss": 0.1913, "step": 691 }, { "epoch": 1.1001589825119238, "grad_norm": 20.235232629780796, "learning_rate": 3.0014010820687985e-05, "loss": 31.6578, "step": 692 }, { "epoch": 1.1017488076311606, "grad_norm": 0.7288383364839481, "learning_rate": 3.0014320368817447e-05, "loss": 0.1923, "step": 693 }, { "epoch": 1.1033386327503973, "grad_norm": 1.35329484407909, "learning_rate": 3.0014633296737884e-05, "loss": 0.2766, "step": 694 }, { "epoch": 1.1049284578696343, "grad_norm": 1.0517251280563669, "learning_rate": 3.0014949604343385e-05, "loss": 0.2479, "step": 695 }, { "epoch": 1.1065182829888713, "grad_norm": 1.6814285279188104, "learning_rate": 3.0015269291526883e-05, "loss": 0.2154, "step": 696 }, { "epoch": 1.1081081081081081, "grad_norm": 1.239004030067652, "learning_rate": 3.0015592358180193e-05, "loss": 0.2793, "step": 697 }, { "epoch": 1.109697933227345, "grad_norm": 1.3536998931504256, "learning_rate": 3.001591880419395e-05, "loss": 0.1767, "step": 698 }, { "epoch": 1.1112877583465819, "grad_norm": 8.398825091086719, "learning_rate": 3.0016248629457668e-05, "loss": 9.2342, "step": 699 }, { "epoch": 1.1128775834658187, "grad_norm": 1.2669175495681713, "learning_rate": 3.0016581833859716e-05, "loss": 0.2639, "step": 700 }, { "epoch": 1.1144674085850557, "grad_norm": 2.7954048731359356, "learning_rate": 3.0016918417287312e-05, "loss": 0.246, "step": 701 }, { "epoch": 1.1160572337042924, "grad_norm": 12.70000817440662, "learning_rate": 3.0017258379626553e-05, "loss": 0.629, "step": 702 }, { "epoch": 1.1176470588235294, "grad_norm": 0.7362099420269006, "learning_rate": 3.0017601720762342e-05, "loss": 0.1971, "step": 703 }, { "epoch": 1.1192368839427662, "grad_norm": 2.861913927679887, "learning_rate": 3.0017948440578506e-05, "loss": 0.223, "step": 704 }, { "epoch": 1.1208267090620032, "grad_norm": 1.0656884043231087, "learning_rate": 3.001829853895766e-05, "loss": 0.2829, "step": 705 }, { "epoch": 1.12241653418124, "grad_norm": 0.989617017808479, "learning_rate": 3.001865201578133e-05, "loss": 0.1867, "step": 706 }, { "epoch": 1.124006359300477, "grad_norm": 1.2681973934985606, "learning_rate": 3.0019008870929875e-05, "loss": 0.2506, "step": 707 }, { "epoch": 1.1255961844197138, "grad_norm": 0.9870081947065322, "learning_rate": 3.0019369104282496e-05, "loss": 0.2059, "step": 708 }, { "epoch": 1.1271860095389508, "grad_norm": 3.8821504495070758, "learning_rate": 3.0019732715717285e-05, "loss": 0.2871, "step": 709 }, { "epoch": 1.1287758346581875, "grad_norm": 0.8260856707850688, "learning_rate": 3.0020099705111165e-05, "loss": 0.1594, "step": 710 }, { "epoch": 1.1303656597774245, "grad_norm": 0.9296416050238138, "learning_rate": 3.002047007233993e-05, "loss": 0.2365, "step": 711 }, { "epoch": 1.1319554848966613, "grad_norm": 0.8576002924604323, "learning_rate": 3.002084381727821e-05, "loss": 0.2389, "step": 712 }, { "epoch": 1.1335453100158983, "grad_norm": 0.8722577263240856, "learning_rate": 3.002122093979952e-05, "loss": 0.2064, "step": 713 }, { "epoch": 1.135135135135135, "grad_norm": 1.147061668202372, "learning_rate": 3.0021601439776213e-05, "loss": 0.2059, "step": 714 }, { "epoch": 1.136724960254372, "grad_norm": 1.1252038707857024, "learning_rate": 3.0021985317079507e-05, "loss": 0.2322, "step": 715 }, { "epoch": 1.1383147853736089, "grad_norm": 1.0062040686356222, "learning_rate": 3.002237257157945e-05, "loss": 0.1978, "step": 716 }, { "epoch": 1.1399046104928459, "grad_norm": 0.6995971478803915, "learning_rate": 3.0022763203145015e-05, "loss": 0.2228, "step": 717 }, { "epoch": 1.1414944356120826, "grad_norm": 1.1613365235570836, "learning_rate": 3.0023157211643948e-05, "loss": 0.2192, "step": 718 }, { "epoch": 1.1430842607313196, "grad_norm": 0.9236464597255482, "learning_rate": 3.0023554596942908e-05, "loss": 0.2274, "step": 719 }, { "epoch": 1.1446740858505564, "grad_norm": 0.6910455473134065, "learning_rate": 3.002395535890739e-05, "loss": 0.1976, "step": 720 }, { "epoch": 1.1462639109697934, "grad_norm": 1.1031161592937788, "learning_rate": 3.002435949740176e-05, "loss": 0.207, "step": 721 }, { "epoch": 1.1478537360890302, "grad_norm": 0.8455647962810376, "learning_rate": 3.0024767012289212e-05, "loss": 0.225, "step": 722 }, { "epoch": 1.1494435612082672, "grad_norm": 1.2732967189592286, "learning_rate": 3.0025177903431845e-05, "loss": 0.1997, "step": 723 }, { "epoch": 1.151033386327504, "grad_norm": 0.8970554786880177, "learning_rate": 3.002559217069056e-05, "loss": 0.223, "step": 724 }, { "epoch": 1.1526232114467407, "grad_norm": 0.7799759873081547, "learning_rate": 3.0026009813925165e-05, "loss": 0.2655, "step": 725 }, { "epoch": 1.1542130365659777, "grad_norm": 3.3035632310650893, "learning_rate": 3.0026430832994277e-05, "loss": 0.3052, "step": 726 }, { "epoch": 1.1558028616852147, "grad_norm": 1.5418044828466606, "learning_rate": 3.0026855227755425e-05, "loss": 0.1817, "step": 727 }, { "epoch": 1.1573926868044515, "grad_norm": 1.2269863083227035, "learning_rate": 3.0027282998064946e-05, "loss": 0.2562, "step": 728 }, { "epoch": 1.1589825119236883, "grad_norm": 1.3352862515834467, "learning_rate": 3.0027714143778058e-05, "loss": 0.1742, "step": 729 }, { "epoch": 1.1605723370429253, "grad_norm": 0.8333132168884884, "learning_rate": 3.002814866474884e-05, "loss": 0.1678, "step": 730 }, { "epoch": 1.1621621621621623, "grad_norm": 1.0310335429679847, "learning_rate": 3.0028586560830226e-05, "loss": 0.2549, "step": 731 }, { "epoch": 1.163751987281399, "grad_norm": 0.7136871917430028, "learning_rate": 3.0029027831873996e-05, "loss": 0.1984, "step": 732 }, { "epoch": 1.1653418124006358, "grad_norm": 17.780055973434013, "learning_rate": 3.0029472477730798e-05, "loss": 31.8157, "step": 733 }, { "epoch": 1.1669316375198728, "grad_norm": 1.167455638976974, "learning_rate": 3.0029920498250133e-05, "loss": 0.2604, "step": 734 }, { "epoch": 1.1685214626391096, "grad_norm": 1.2404234827388172, "learning_rate": 3.0030371893280367e-05, "loss": 0.1906, "step": 735 }, { "epoch": 1.1701112877583466, "grad_norm": 0.837791903645683, "learning_rate": 3.0030826662668716e-05, "loss": 0.172, "step": 736 }, { "epoch": 1.1717011128775834, "grad_norm": 1.4070595102168377, "learning_rate": 3.0031284806261258e-05, "loss": 0.2123, "step": 737 }, { "epoch": 1.1732909379968204, "grad_norm": 1.3437154129961775, "learning_rate": 3.0031746323902934e-05, "loss": 0.2666, "step": 738 }, { "epoch": 1.1748807631160572, "grad_norm": 1.5739642251257215, "learning_rate": 3.0032211215437525e-05, "loss": 0.2191, "step": 739 }, { "epoch": 1.1764705882352942, "grad_norm": 0.7978420162239285, "learning_rate": 3.0032679480707695e-05, "loss": 0.2021, "step": 740 }, { "epoch": 1.178060413354531, "grad_norm": 0.7706922168917046, "learning_rate": 3.003315111955494e-05, "loss": 0.2036, "step": 741 }, { "epoch": 1.179650238473768, "grad_norm": 1.0272058075130088, "learning_rate": 3.0033626131819636e-05, "loss": 0.2149, "step": 742 }, { "epoch": 1.1812400635930047, "grad_norm": 0.7793723921638207, "learning_rate": 3.0034104517341004e-05, "loss": 0.2369, "step": 743 }, { "epoch": 1.1828298887122417, "grad_norm": 0.6834376944185209, "learning_rate": 3.0034586275957124e-05, "loss": 0.1875, "step": 744 }, { "epoch": 1.1844197138314785, "grad_norm": 0.8765064667836667, "learning_rate": 3.0035071407504953e-05, "loss": 0.1756, "step": 745 }, { "epoch": 1.1860095389507155, "grad_norm": 1.60345883316707, "learning_rate": 3.0035559911820284e-05, "loss": 0.2378, "step": 746 }, { "epoch": 1.1875993640699523, "grad_norm": 1.177364867278975, "learning_rate": 3.0036051788737776e-05, "loss": 0.1592, "step": 747 }, { "epoch": 1.1891891891891893, "grad_norm": 1.023919463091455, "learning_rate": 3.003654703809094e-05, "loss": 0.2229, "step": 748 }, { "epoch": 1.190779014308426, "grad_norm": 1.0439842426111843, "learning_rate": 3.0037045659712147e-05, "loss": 0.1564, "step": 749 }, { "epoch": 1.192368839427663, "grad_norm": 0.7633620925832583, "learning_rate": 3.003754765343266e-05, "loss": 0.2503, "step": 750 }, { "epoch": 1.1939586645468998, "grad_norm": 1.1326526071965548, "learning_rate": 3.003805301908255e-05, "loss": 0.2037, "step": 751 }, { "epoch": 1.1955484896661368, "grad_norm": 0.6942691289599621, "learning_rate": 3.003856175649076e-05, "loss": 0.1691, "step": 752 }, { "epoch": 1.1971383147853736, "grad_norm": 0.8745700553017423, "learning_rate": 3.003907386548513e-05, "loss": 0.2151, "step": 753 }, { "epoch": 1.1987281399046106, "grad_norm": 0.6141008459907198, "learning_rate": 3.0039589345892304e-05, "loss": 0.2172, "step": 754 }, { "epoch": 1.2003179650238474, "grad_norm": 0.7400717305573242, "learning_rate": 3.004010819753782e-05, "loss": 0.1969, "step": 755 }, { "epoch": 1.2019077901430844, "grad_norm": 1.456544355988812, "learning_rate": 3.004063042024607e-05, "loss": 0.2032, "step": 756 }, { "epoch": 1.2034976152623211, "grad_norm": 1.3549580207856993, "learning_rate": 3.0041156013840304e-05, "loss": 0.16, "step": 757 }, { "epoch": 1.2050874403815581, "grad_norm": 1.2320206799728637, "learning_rate": 3.004168497814261e-05, "loss": 0.2557, "step": 758 }, { "epoch": 1.206677265500795, "grad_norm": 0.6908114887182963, "learning_rate": 3.004221731297396e-05, "loss": 0.2008, "step": 759 }, { "epoch": 1.2082670906200317, "grad_norm": 1.0048467133330425, "learning_rate": 3.0042753018154174e-05, "loss": 0.3971, "step": 760 }, { "epoch": 1.2098569157392687, "grad_norm": 0.9939338230550647, "learning_rate": 3.004329209350196e-05, "loss": 0.1645, "step": 761 }, { "epoch": 1.2114467408585057, "grad_norm": 0.8042681411868798, "learning_rate": 3.0043834538834827e-05, "loss": 0.1887, "step": 762 }, { "epoch": 1.2130365659777425, "grad_norm": 1.0602905139369727, "learning_rate": 3.0044380353969195e-05, "loss": 0.1986, "step": 763 }, { "epoch": 1.2146263910969792, "grad_norm": 1.5885686015755542, "learning_rate": 3.0044929538720324e-05, "loss": 0.1769, "step": 764 }, { "epoch": 1.2162162162162162, "grad_norm": 8.572598868690184, "learning_rate": 3.004548209290234e-05, "loss": 16.3842, "step": 765 }, { "epoch": 1.217806041335453, "grad_norm": 1.1234669067700735, "learning_rate": 3.0046038016328214e-05, "loss": 0.2212, "step": 766 }, { "epoch": 1.21939586645469, "grad_norm": 1.2345499196619065, "learning_rate": 3.004659730880978e-05, "loss": 0.1988, "step": 767 }, { "epoch": 1.2209856915739268, "grad_norm": 1.3952594069000674, "learning_rate": 3.0047159970157762e-05, "loss": 0.206, "step": 768 }, { "epoch": 1.2225755166931638, "grad_norm": 1.4284866874960678, "learning_rate": 3.0047726000181693e-05, "loss": 0.2888, "step": 769 }, { "epoch": 1.2241653418124006, "grad_norm": 0.6553585379114284, "learning_rate": 3.0048295398689997e-05, "loss": 0.1554, "step": 770 }, { "epoch": 1.2257551669316376, "grad_norm": 0.966422509607705, "learning_rate": 3.0048868165489972e-05, "loss": 0.2207, "step": 771 }, { "epoch": 1.2273449920508743, "grad_norm": 1.3265238901837997, "learning_rate": 3.0049444300387737e-05, "loss": 0.2271, "step": 772 }, { "epoch": 1.2289348171701113, "grad_norm": 2.0733680500106906, "learning_rate": 3.00500238031883e-05, "loss": 0.3018, "step": 773 }, { "epoch": 1.230524642289348, "grad_norm": 1.7820103069693711, "learning_rate": 3.0050606673695528e-05, "loss": 0.2597, "step": 774 }, { "epoch": 1.232114467408585, "grad_norm": 1.6044999969137503, "learning_rate": 3.0051192911712115e-05, "loss": 0.246, "step": 775 }, { "epoch": 1.2337042925278219, "grad_norm": 0.7609386579343164, "learning_rate": 3.0051782517039675e-05, "loss": 0.1615, "step": 776 }, { "epoch": 1.2352941176470589, "grad_norm": 0.7873628425158388, "learning_rate": 3.005237548947861e-05, "loss": 0.201, "step": 777 }, { "epoch": 1.2368839427662957, "grad_norm": 1.417914849653636, "learning_rate": 3.005297182882826e-05, "loss": 0.2042, "step": 778 }, { "epoch": 1.2384737678855327, "grad_norm": 3.5276793435719482, "learning_rate": 3.005357153488675e-05, "loss": 0.3237, "step": 779 }, { "epoch": 1.2400635930047694, "grad_norm": 0.9157155060139784, "learning_rate": 3.005417460745113e-05, "loss": 0.2949, "step": 780 }, { "epoch": 1.2416534181240064, "grad_norm": 1.0359542930699284, "learning_rate": 3.005478104631727e-05, "loss": 0.1611, "step": 781 }, { "epoch": 1.2432432432432432, "grad_norm": 0.5770360344770976, "learning_rate": 3.0055390851279902e-05, "loss": 0.1837, "step": 782 }, { "epoch": 1.2448330683624802, "grad_norm": 1.3851905903665145, "learning_rate": 3.0056004022132648e-05, "loss": 0.3399, "step": 783 }, { "epoch": 1.246422893481717, "grad_norm": 1.0969313876374431, "learning_rate": 3.0056620558667954e-05, "loss": 0.2424, "step": 784 }, { "epoch": 1.248012718600954, "grad_norm": 1.2255312794821909, "learning_rate": 3.0057240460677158e-05, "loss": 0.1819, "step": 785 }, { "epoch": 1.2496025437201908, "grad_norm": 0.6217081656017792, "learning_rate": 3.0057863727950443e-05, "loss": 0.1814, "step": 786 }, { "epoch": 1.2511923688394275, "grad_norm": 2.259275293684557, "learning_rate": 3.0058490360276844e-05, "loss": 0.1968, "step": 787 }, { "epoch": 1.2527821939586645, "grad_norm": 33.305238996872305, "learning_rate": 3.005912035744429e-05, "loss": 30.4028, "step": 788 }, { "epoch": 1.2543720190779015, "grad_norm": 0.8373926477766842, "learning_rate": 3.005975371923953e-05, "loss": 0.2383, "step": 789 }, { "epoch": 1.2559618441971383, "grad_norm": 0.9624395872583089, "learning_rate": 3.0060390445448207e-05, "loss": 0.2119, "step": 790 }, { "epoch": 1.257551669316375, "grad_norm": 1.3852595564730366, "learning_rate": 3.0061030535854805e-05, "loss": 0.2476, "step": 791 }, { "epoch": 1.259141494435612, "grad_norm": 0.9154353270724166, "learning_rate": 3.006167399024267e-05, "loss": 0.1856, "step": 792 }, { "epoch": 1.260731319554849, "grad_norm": 1.342732112673621, "learning_rate": 3.0062320808394038e-05, "loss": 0.1877, "step": 793 }, { "epoch": 1.2623211446740858, "grad_norm": 1.5805588353923163, "learning_rate": 3.0062970990089966e-05, "loss": 0.1977, "step": 794 }, { "epoch": 1.2639109697933226, "grad_norm": 0.8193460966900378, "learning_rate": 3.0063624535110395e-05, "loss": 0.1636, "step": 795 }, { "epoch": 1.2655007949125596, "grad_norm": 1.7480720900396385, "learning_rate": 3.0064281443234124e-05, "loss": 0.2202, "step": 796 }, { "epoch": 1.2670906200317966, "grad_norm": 0.8360298390813324, "learning_rate": 3.006494171423882e-05, "loss": 0.1836, "step": 797 }, { "epoch": 1.2686804451510334, "grad_norm": 1.0019310752543276, "learning_rate": 3.006560534790099e-05, "loss": 0.197, "step": 798 }, { "epoch": 1.2702702702702702, "grad_norm": 1.3541536849473803, "learning_rate": 3.0066272343996042e-05, "loss": 0.2492, "step": 799 }, { "epoch": 1.2718600953895072, "grad_norm": 1.2661965553614343, "learning_rate": 3.006694270229819e-05, "loss": 0.2518, "step": 800 }, { "epoch": 1.2734499205087442, "grad_norm": 1.5746695145803955, "learning_rate": 3.0067616422580567e-05, "loss": 0.2524, "step": 801 }, { "epoch": 1.275039745627981, "grad_norm": 1.5142345958159908, "learning_rate": 3.0068293504615137e-05, "loss": 0.7057, "step": 802 }, { "epoch": 1.2766295707472177, "grad_norm": 1.4115845920403867, "learning_rate": 3.0068973948172732e-05, "loss": 0.2185, "step": 803 }, { "epoch": 1.2782193958664547, "grad_norm": 0.9711763828341654, "learning_rate": 3.0069657753023048e-05, "loss": 0.2486, "step": 804 }, { "epoch": 1.2798092209856915, "grad_norm": 1.0006831313664426, "learning_rate": 3.0070344918934633e-05, "loss": 0.1453, "step": 805 }, { "epoch": 1.2813990461049285, "grad_norm": 1.037333183685246, "learning_rate": 3.0071035445674916e-05, "loss": 0.1607, "step": 806 }, { "epoch": 1.2829888712241653, "grad_norm": 1.8406570681094314, "learning_rate": 3.007172933301017e-05, "loss": 0.1836, "step": 807 }, { "epoch": 1.2845786963434023, "grad_norm": 0.7318388590187899, "learning_rate": 3.0072426580705546e-05, "loss": 0.2004, "step": 808 }, { "epoch": 1.286168521462639, "grad_norm": 0.9238310107679456, "learning_rate": 3.0073127188525044e-05, "loss": 0.1967, "step": 809 }, { "epoch": 1.287758346581876, "grad_norm": 1.3424978332659343, "learning_rate": 3.0073831156231546e-05, "loss": 0.1955, "step": 810 }, { "epoch": 1.2893481717011128, "grad_norm": 1.369310769603383, "learning_rate": 3.007453848358678e-05, "loss": 0.2139, "step": 811 }, { "epoch": 1.2909379968203498, "grad_norm": 1.686854262564021, "learning_rate": 3.0075249170351336e-05, "loss": 0.2092, "step": 812 }, { "epoch": 1.2925278219395866, "grad_norm": 1.121282028074738, "learning_rate": 3.0075963216284673e-05, "loss": 0.1689, "step": 813 }, { "epoch": 1.2941176470588236, "grad_norm": 1.0692057808991164, "learning_rate": 3.0076680621145115e-05, "loss": 0.2817, "step": 814 }, { "epoch": 1.2957074721780604, "grad_norm": 0.8861886635461192, "learning_rate": 3.0077401384689846e-05, "loss": 0.1537, "step": 815 }, { "epoch": 1.2972972972972974, "grad_norm": 1.161476359594777, "learning_rate": 3.0078125506674913e-05, "loss": 0.1986, "step": 816 }, { "epoch": 1.2988871224165341, "grad_norm": 1.5436705101346593, "learning_rate": 3.007885298685522e-05, "loss": 0.2138, "step": 817 }, { "epoch": 1.3004769475357711, "grad_norm": 1.8051666878249613, "learning_rate": 3.0079583824984557e-05, "loss": 0.243, "step": 818 }, { "epoch": 1.302066772655008, "grad_norm": 0.9784423637175137, "learning_rate": 3.0080318020815553e-05, "loss": 0.159, "step": 819 }, { "epoch": 1.303656597774245, "grad_norm": 2.309855956485202, "learning_rate": 3.0081055574099707e-05, "loss": 0.3139, "step": 820 }, { "epoch": 1.3052464228934817, "grad_norm": 0.9276127761408681, "learning_rate": 3.008179648458739e-05, "loss": 0.2219, "step": 821 }, { "epoch": 1.3068362480127185, "grad_norm": 0.7304748699703378, "learning_rate": 3.0082540752027812e-05, "loss": 0.1646, "step": 822 }, { "epoch": 1.3084260731319555, "grad_norm": 14.64675823800732, "learning_rate": 3.00832883761691e-05, "loss": 15.586, "step": 823 }, { "epoch": 1.3100158982511925, "grad_norm": 1.2378236279257848, "learning_rate": 3.0084039356758177e-05, "loss": 0.2368, "step": 824 }, { "epoch": 1.3116057233704292, "grad_norm": 1.4936926507539516, "learning_rate": 3.008479369354088e-05, "loss": 0.2347, "step": 825 }, { "epoch": 1.313195548489666, "grad_norm": 1.0890834190802827, "learning_rate": 3.00855513862619e-05, "loss": 0.2153, "step": 826 }, { "epoch": 1.314785373608903, "grad_norm": 0.7375420724284333, "learning_rate": 3.0086312434664765e-05, "loss": 0.1897, "step": 827 }, { "epoch": 1.31637519872814, "grad_norm": 1.4687967108843725, "learning_rate": 3.00870768384919e-05, "loss": 0.2224, "step": 828 }, { "epoch": 1.3179650238473768, "grad_norm": 1.2028786692470428, "learning_rate": 3.0087844597484587e-05, "loss": 0.2819, "step": 829 }, { "epoch": 1.3195548489666136, "grad_norm": 23.29656376556712, "learning_rate": 3.0088615711382948e-05, "loss": 29.9045, "step": 830 }, { "epoch": 1.3211446740858506, "grad_norm": 2.366796724192845, "learning_rate": 3.008939017992602e-05, "loss": 0.2086, "step": 831 }, { "epoch": 1.3227344992050876, "grad_norm": 0.855058340401459, "learning_rate": 3.0090168002851636e-05, "loss": 0.1756, "step": 832 }, { "epoch": 1.3243243243243243, "grad_norm": 0.9771343101291011, "learning_rate": 3.0090949179896565e-05, "loss": 0.177, "step": 833 }, { "epoch": 1.3259141494435611, "grad_norm": 1.4331130342697376, "learning_rate": 3.0091733710796384e-05, "loss": 0.2416, "step": 834 }, { "epoch": 1.3275039745627981, "grad_norm": 1.0121241450697682, "learning_rate": 3.0092521595285568e-05, "loss": 0.1753, "step": 835 }, { "epoch": 1.329093799682035, "grad_norm": 1.3819689284377823, "learning_rate": 3.0093312833097437e-05, "loss": 0.2043, "step": 836 }, { "epoch": 1.330683624801272, "grad_norm": 0.7731294905433505, "learning_rate": 3.0094107423964208e-05, "loss": 0.2082, "step": 837 }, { "epoch": 1.3322734499205087, "grad_norm": 0.7444981052077096, "learning_rate": 3.0094905367616906e-05, "loss": 0.134, "step": 838 }, { "epoch": 1.3338632750397457, "grad_norm": 3.060526791884808, "learning_rate": 3.0095706663785498e-05, "loss": 0.2738, "step": 839 }, { "epoch": 1.3354531001589824, "grad_norm": 1.317461446818166, "learning_rate": 3.0096511312198732e-05, "loss": 0.2112, "step": 840 }, { "epoch": 1.3370429252782194, "grad_norm": 1.2935365377902823, "learning_rate": 3.0097319312584298e-05, "loss": 0.261, "step": 841 }, { "epoch": 1.3386327503974562, "grad_norm": 1.602858597091533, "learning_rate": 3.0098130664668703e-05, "loss": 0.2384, "step": 842 }, { "epoch": 1.3402225755166932, "grad_norm": 1.1569057949109862, "learning_rate": 3.0098945368177318e-05, "loss": 0.2119, "step": 843 }, { "epoch": 1.34181240063593, "grad_norm": 1.0456882112116346, "learning_rate": 3.0099763422834424e-05, "loss": 0.183, "step": 844 }, { "epoch": 1.343402225755167, "grad_norm": 0.9923362747451282, "learning_rate": 3.0100584828363125e-05, "loss": 0.1606, "step": 845 }, { "epoch": 1.3449920508744038, "grad_norm": 22.01693406696935, "learning_rate": 3.0101409584485403e-05, "loss": 28.9673, "step": 846 }, { "epoch": 1.3465818759936408, "grad_norm": 1.351951339377561, "learning_rate": 3.0102237690922108e-05, "loss": 0.2667, "step": 847 }, { "epoch": 1.3481717011128775, "grad_norm": 1.416879158804829, "learning_rate": 3.0103069147392967e-05, "loss": 0.442, "step": 848 }, { "epoch": 1.3497615262321145, "grad_norm": 1.8620752797877833, "learning_rate": 3.0103903953616543e-05, "loss": 0.3537, "step": 849 }, { "epoch": 1.3513513513513513, "grad_norm": 1.9014671785542079, "learning_rate": 3.0104742109310305e-05, "loss": 0.2472, "step": 850 }, { "epoch": 1.3529411764705883, "grad_norm": 1.1523999923145103, "learning_rate": 3.0105583614190558e-05, "loss": 0.2196, "step": 851 }, { "epoch": 1.354531001589825, "grad_norm": 1.2084480862630775, "learning_rate": 3.0106428467972476e-05, "loss": 0.2012, "step": 852 }, { "epoch": 1.3561208267090619, "grad_norm": 1.1004271284305638, "learning_rate": 3.0107276670370118e-05, "loss": 0.2708, "step": 853 }, { "epoch": 1.3577106518282989, "grad_norm": 1.4587022076778249, "learning_rate": 3.0108128221096396e-05, "loss": 0.2202, "step": 854 }, { "epoch": 1.3593004769475359, "grad_norm": 1.2345319072548069, "learning_rate": 3.010898311986308e-05, "loss": 0.2529, "step": 855 }, { "epoch": 1.3608903020667726, "grad_norm": 1.1949215048100184, "learning_rate": 3.0109841366380828e-05, "loss": 0.1775, "step": 856 }, { "epoch": 1.3624801271860094, "grad_norm": 2.1709009442675486, "learning_rate": 3.0110702960359164e-05, "loss": 0.2037, "step": 857 }, { "epoch": 1.3640699523052464, "grad_norm": 1.7798981566833383, "learning_rate": 3.0111567901506444e-05, "loss": 0.2198, "step": 858 }, { "epoch": 1.3656597774244834, "grad_norm": 2.5850752638868832, "learning_rate": 3.0112436189529936e-05, "loss": 0.2908, "step": 859 }, { "epoch": 1.3672496025437202, "grad_norm": 1.29148844483089, "learning_rate": 3.0113307824135764e-05, "loss": 0.2208, "step": 860 }, { "epoch": 1.368839427662957, "grad_norm": 1.11118813127308, "learning_rate": 3.011418280502889e-05, "loss": 0.1947, "step": 861 }, { "epoch": 1.370429252782194, "grad_norm": 2.9678883622061236, "learning_rate": 3.0115061131913166e-05, "loss": 0.2461, "step": 862 }, { "epoch": 1.372019077901431, "grad_norm": 1.667169318803546, "learning_rate": 3.0115942804491326e-05, "loss": 0.2413, "step": 863 }, { "epoch": 1.3736089030206677, "grad_norm": 2.6753934201807468, "learning_rate": 3.011682782246494e-05, "loss": 0.2481, "step": 864 }, { "epoch": 1.3751987281399045, "grad_norm": 1.755937072758792, "learning_rate": 3.011771618553447e-05, "loss": 0.3807, "step": 865 }, { "epoch": 1.3767885532591415, "grad_norm": 1.4280951549376069, "learning_rate": 3.0118607893399245e-05, "loss": 0.2097, "step": 866 }, { "epoch": 1.3783783783783785, "grad_norm": 4.0611054950607315, "learning_rate": 3.0119502945757437e-05, "loss": 0.2905, "step": 867 }, { "epoch": 1.3799682034976153, "grad_norm": 1.574806507287194, "learning_rate": 3.012040134230611e-05, "loss": 0.2587, "step": 868 }, { "epoch": 1.381558028616852, "grad_norm": 1.2915169012661318, "learning_rate": 3.012130308274119e-05, "loss": 0.2202, "step": 869 }, { "epoch": 1.383147853736089, "grad_norm": 2.4218785615165497, "learning_rate": 3.0122208166757473e-05, "loss": 0.6821, "step": 870 }, { "epoch": 1.3847376788553258, "grad_norm": 2.2373306717355512, "learning_rate": 3.0123116594048624e-05, "loss": 0.2793, "step": 871 }, { "epoch": 1.3863275039745628, "grad_norm": 1.6023318381656593, "learning_rate": 3.0124028364307165e-05, "loss": 0.1828, "step": 872 }, { "epoch": 1.3879173290937996, "grad_norm": 1.225687758839214, "learning_rate": 3.0124943477224493e-05, "loss": 0.1997, "step": 873 }, { "epoch": 1.3895071542130366, "grad_norm": 1.381824957715516, "learning_rate": 3.0125861932490883e-05, "loss": 0.2219, "step": 874 }, { "epoch": 1.3910969793322734, "grad_norm": 1.4890748513157812, "learning_rate": 3.0126783729795474e-05, "loss": 0.2373, "step": 875 }, { "epoch": 1.3926868044515104, "grad_norm": 1.8693579765653567, "learning_rate": 3.012770886882626e-05, "loss": 0.2776, "step": 876 }, { "epoch": 1.3942766295707472, "grad_norm": 1.418823598558248, "learning_rate": 3.0128637349270122e-05, "loss": 0.2283, "step": 877 }, { "epoch": 1.3958664546899842, "grad_norm": 2.32597871848742, "learning_rate": 3.0129569170812802e-05, "loss": 0.278, "step": 878 }, { "epoch": 1.397456279809221, "grad_norm": 1.3603750153749643, "learning_rate": 3.0130504333138905e-05, "loss": 0.2469, "step": 879 }, { "epoch": 1.399046104928458, "grad_norm": 1.4727902631137277, "learning_rate": 3.013144283593193e-05, "loss": 0.2477, "step": 880 }, { "epoch": 1.4006359300476947, "grad_norm": 1.346381053880536, "learning_rate": 3.0132384678874206e-05, "loss": 0.1832, "step": 881 }, { "epoch": 1.4022257551669317, "grad_norm": 1.3112243961597747, "learning_rate": 3.0133329861646977e-05, "loss": 0.2444, "step": 882 }, { "epoch": 1.4038155802861685, "grad_norm": 1.4578806038947298, "learning_rate": 3.0134278383930308e-05, "loss": 0.2265, "step": 883 }, { "epoch": 1.4054054054054055, "grad_norm": 1.809708055055694, "learning_rate": 3.0135230245403176e-05, "loss": 0.2963, "step": 884 }, { "epoch": 1.4069952305246423, "grad_norm": 1.7017998120695168, "learning_rate": 3.01361854457434e-05, "loss": 0.2172, "step": 885 }, { "epoch": 1.4085850556438793, "grad_norm": 1.2430424848140813, "learning_rate": 3.0137143984627687e-05, "loss": 0.2237, "step": 886 }, { "epoch": 1.410174880763116, "grad_norm": 1.255817553321443, "learning_rate": 3.0138105861731607e-05, "loss": 0.2524, "step": 887 }, { "epoch": 1.4117647058823528, "grad_norm": 0.9168654408152741, "learning_rate": 3.013907107672959e-05, "loss": 0.1558, "step": 888 }, { "epoch": 1.4133545310015898, "grad_norm": 2.1306284057490297, "learning_rate": 3.0140039629294952e-05, "loss": 0.2802, "step": 889 }, { "epoch": 1.4149443561208268, "grad_norm": 1.4155743544498551, "learning_rate": 3.0141011519099878e-05, "loss": 0.2404, "step": 890 }, { "epoch": 1.4165341812400636, "grad_norm": 1.5796315947385693, "learning_rate": 3.014198674581541e-05, "loss": 0.1945, "step": 891 }, { "epoch": 1.4181240063593004, "grad_norm": 1.4635847421138366, "learning_rate": 3.014296530911147e-05, "loss": 0.2789, "step": 892 }, { "epoch": 1.4197138314785374, "grad_norm": 16.435185938297618, "learning_rate": 3.014394720865685e-05, "loss": 13.6999, "step": 893 }, { "epoch": 1.4213036565977744, "grad_norm": 0.6246273850460909, "learning_rate": 3.014493244411921e-05, "loss": 0.1835, "step": 894 }, { "epoch": 1.4228934817170111, "grad_norm": 1.0282269555885089, "learning_rate": 3.0145921015165098e-05, "loss": 0.2071, "step": 895 }, { "epoch": 1.424483306836248, "grad_norm": 0.9012248805350297, "learning_rate": 3.0146912921459907e-05, "loss": 0.871, "step": 896 }, { "epoch": 1.426073131955485, "grad_norm": 0.8977785080790269, "learning_rate": 3.0147908162667912e-05, "loss": 0.238, "step": 897 }, { "epoch": 1.427662957074722, "grad_norm": 1.687812654963962, "learning_rate": 3.0148906738452266e-05, "loss": 0.338, "step": 898 }, { "epoch": 1.4292527821939587, "grad_norm": 1.336690455955879, "learning_rate": 3.0149908648474973e-05, "loss": 0.2349, "step": 899 }, { "epoch": 1.4308426073131955, "grad_norm": 1.1672328680973998, "learning_rate": 3.0150913892396944e-05, "loss": 0.2123, "step": 900 }, { "epoch": 1.4324324324324325, "grad_norm": 16.932585097407248, "learning_rate": 3.0151922469877916e-05, "loss": 20.3812, "step": 901 }, { "epoch": 1.4340222575516695, "grad_norm": 2.4584874470032, "learning_rate": 3.015293438057655e-05, "loss": 0.2275, "step": 902 }, { "epoch": 1.4356120826709062, "grad_norm": 1.1736699657732497, "learning_rate": 3.0153949624150332e-05, "loss": 0.1995, "step": 903 }, { "epoch": 1.437201907790143, "grad_norm": 1.316575957160399, "learning_rate": 3.0154968200255632e-05, "loss": 0.2001, "step": 904 }, { "epoch": 1.43879173290938, "grad_norm": 1.2790587596447025, "learning_rate": 3.0155990108547726e-05, "loss": 0.2156, "step": 905 }, { "epoch": 1.4403815580286168, "grad_norm": 1.5542338028877063, "learning_rate": 3.0157015348680703e-05, "loss": 0.3059, "step": 906 }, { "epoch": 1.4419713831478538, "grad_norm": 0.8785217394873726, "learning_rate": 3.0158043920307588e-05, "loss": 0.194, "step": 907 }, { "epoch": 1.4435612082670906, "grad_norm": 1.014232513583885, "learning_rate": 3.0159075823080216e-05, "loss": 0.2055, "step": 908 }, { "epoch": 1.4451510333863276, "grad_norm": 2.588731507982686, "learning_rate": 3.0160111056649346e-05, "loss": 0.3299, "step": 909 }, { "epoch": 1.4467408585055643, "grad_norm": 4.90682119754212, "learning_rate": 3.016114962066458e-05, "loss": 0.2438, "step": 910 }, { "epoch": 1.4483306836248013, "grad_norm": 1.0354297708483435, "learning_rate": 3.016219151477441e-05, "loss": 0.2581, "step": 911 }, { "epoch": 1.449920508744038, "grad_norm": 1.338112520760405, "learning_rate": 3.0163236738626186e-05, "loss": 0.2241, "step": 912 }, { "epoch": 1.451510333863275, "grad_norm": 0.9493098029348258, "learning_rate": 3.016428529186614e-05, "loss": 0.2269, "step": 913 }, { "epoch": 1.4531001589825119, "grad_norm": 0.9583977133614148, "learning_rate": 3.016533717413937e-05, "loss": 0.1663, "step": 914 }, { "epoch": 1.4546899841017489, "grad_norm": 1.7063586588431303, "learning_rate": 3.0166392385089863e-05, "loss": 0.2247, "step": 915 }, { "epoch": 1.4562798092209857, "grad_norm": 0.9554173200113827, "learning_rate": 3.0167450924360454e-05, "loss": 0.1655, "step": 916 }, { "epoch": 1.4578696343402227, "grad_norm": 1.49206518916465, "learning_rate": 3.0168512791592876e-05, "loss": 0.198, "step": 917 }, { "epoch": 1.4594594594594594, "grad_norm": 1.3305470627065514, "learning_rate": 3.016957798642772e-05, "loss": 0.198, "step": 918 }, { "epoch": 1.4610492845786962, "grad_norm": 2.5890646646748885, "learning_rate": 3.017064650850446e-05, "loss": 0.36, "step": 919 }, { "epoch": 1.4626391096979332, "grad_norm": 1.3259346263383753, "learning_rate": 3.0171718357461436e-05, "loss": 0.1933, "step": 920 }, { "epoch": 1.4642289348171702, "grad_norm": 1.2617894160843537, "learning_rate": 3.0172793532935862e-05, "loss": 0.251, "step": 921 }, { "epoch": 1.465818759936407, "grad_norm": 2.421694446667709, "learning_rate": 3.0173872034563853e-05, "loss": 0.2503, "step": 922 }, { "epoch": 1.4674085850556438, "grad_norm": 0.7648883678245367, "learning_rate": 3.0174953861980344e-05, "loss": 0.1747, "step": 923 }, { "epoch": 1.4689984101748808, "grad_norm": 0.9160641425796828, "learning_rate": 3.0176039014819198e-05, "loss": 0.1673, "step": 924 }, { "epoch": 1.4705882352941178, "grad_norm": 1.1373239847956165, "learning_rate": 3.017712749271311e-05, "loss": 0.2425, "step": 925 }, { "epoch": 1.4721780604133545, "grad_norm": 0.9160398991089251, "learning_rate": 3.017821929529369e-05, "loss": 0.2094, "step": 926 }, { "epoch": 1.4737678855325913, "grad_norm": 1.0430762381227112, "learning_rate": 3.0179314422191398e-05, "loss": 0.1925, "step": 927 }, { "epoch": 1.4753577106518283, "grad_norm": 1.3191802162196546, "learning_rate": 3.0180412873035567e-05, "loss": 0.2344, "step": 928 }, { "epoch": 1.4769475357710653, "grad_norm": 4.947424583482794, "learning_rate": 3.0181514647454415e-05, "loss": 0.219, "step": 929 }, { "epoch": 1.478537360890302, "grad_norm": 1.5146220055792687, "learning_rate": 3.018261974507502e-05, "loss": 0.215, "step": 930 }, { "epoch": 1.4801271860095389, "grad_norm": 0.9962538822639876, "learning_rate": 3.018372816552336e-05, "loss": 0.2011, "step": 931 }, { "epoch": 1.4817170111287759, "grad_norm": 2.341206820361946, "learning_rate": 3.0184839908424272e-05, "loss": 0.2776, "step": 932 }, { "epoch": 1.4833068362480128, "grad_norm": 2.1388460215707865, "learning_rate": 3.0185954973401477e-05, "loss": 0.2068, "step": 933 }, { "epoch": 1.4848966613672496, "grad_norm": 3.694706601643514, "learning_rate": 3.0187073360077545e-05, "loss": 0.3092, "step": 934 }, { "epoch": 1.4864864864864864, "grad_norm": 0.9262196418337145, "learning_rate": 3.0188195068073968e-05, "loss": 0.2149, "step": 935 }, { "epoch": 1.4880763116057234, "grad_norm": 1.1163350507429823, "learning_rate": 3.018932009701107e-05, "loss": 0.2333, "step": 936 }, { "epoch": 1.4896661367249602, "grad_norm": 0.9522124896965085, "learning_rate": 3.019044844650808e-05, "loss": 0.1414, "step": 937 }, { "epoch": 1.4912559618441972, "grad_norm": 1.2939616754619978, "learning_rate": 3.019158011618309e-05, "loss": 0.2287, "step": 938 }, { "epoch": 1.492845786963434, "grad_norm": 1.0092810060608215, "learning_rate": 3.0192715105653073e-05, "loss": 0.2127, "step": 939 }, { "epoch": 1.494435612082671, "grad_norm": 1.0406147993798665, "learning_rate": 3.0193853414533866e-05, "loss": 0.2095, "step": 940 }, { "epoch": 1.4960254372019077, "grad_norm": 0.9071449289935577, "learning_rate": 3.019499504244021e-05, "loss": 0.1844, "step": 941 }, { "epoch": 1.4976152623211447, "grad_norm": 1.7153900258721835, "learning_rate": 3.0196139988985688e-05, "loss": 0.262, "step": 942 }, { "epoch": 1.4992050874403815, "grad_norm": 0.5904487087150878, "learning_rate": 3.019728825378278e-05, "loss": 0.145, "step": 943 }, { "epoch": 1.5007949125596185, "grad_norm": 1.8247822877116087, "learning_rate": 3.0198439836442845e-05, "loss": 0.193, "step": 944 }, { "epoch": 1.5023847376788553, "grad_norm": 1.579385206385149, "learning_rate": 3.019959473657612e-05, "loss": 0.276, "step": 945 }, { "epoch": 1.503974562798092, "grad_norm": 21.560067862668113, "learning_rate": 3.020075295379171e-05, "loss": 29.8694, "step": 946 }, { "epoch": 1.505564387917329, "grad_norm": 0.972419391222347, "learning_rate": 3.020191448769758e-05, "loss": 0.1602, "step": 947 }, { "epoch": 1.507154213036566, "grad_norm": 1.0211964289986513, "learning_rate": 3.020307933790062e-05, "loss": 0.1912, "step": 948 }, { "epoch": 1.5087440381558028, "grad_norm": 1.3650571017645552, "learning_rate": 3.0204247504006562e-05, "loss": 0.228, "step": 949 }, { "epoch": 1.5103338632750396, "grad_norm": 0.8363498829193096, "learning_rate": 3.020541898562001e-05, "loss": 0.1629, "step": 950 }, { "epoch": 1.5119236883942766, "grad_norm": 2.1189402580073846, "learning_rate": 3.0206593782344486e-05, "loss": 0.365, "step": 951 }, { "epoch": 1.5135135135135136, "grad_norm": 0.9685848960108028, "learning_rate": 3.0207771893782342e-05, "loss": 0.2005, "step": 952 }, { "epoch": 1.5151033386327504, "grad_norm": 1.2180447744468175, "learning_rate": 3.0208953319534837e-05, "loss": 0.2581, "step": 953 }, { "epoch": 1.5166931637519872, "grad_norm": 0.7446223748266646, "learning_rate": 3.0210138059202102e-05, "loss": 0.1906, "step": 954 }, { "epoch": 1.5182829888712241, "grad_norm": 9.531759256497438, "learning_rate": 3.021132611238315e-05, "loss": 15.5882, "step": 955 }, { "epoch": 1.5198728139904611, "grad_norm": 1.4016145484029638, "learning_rate": 3.021251747867586e-05, "loss": 0.2358, "step": 956 }, { "epoch": 1.521462639109698, "grad_norm": 2.310035025908386, "learning_rate": 3.0213712157677e-05, "loss": 0.2706, "step": 957 }, { "epoch": 1.5230524642289347, "grad_norm": 0.8749840567237552, "learning_rate": 3.021491014898221e-05, "loss": 0.2259, "step": 958 }, { "epoch": 1.5246422893481717, "grad_norm": 0.9057572896693453, "learning_rate": 3.0216111452186032e-05, "loss": 0.1754, "step": 959 }, { "epoch": 1.5262321144674087, "grad_norm": 1.3417570903182994, "learning_rate": 3.021731606688185e-05, "loss": 0.2093, "step": 960 }, { "epoch": 1.5278219395866455, "grad_norm": 0.9549433719026599, "learning_rate": 3.0218523992661945e-05, "loss": 0.1812, "step": 961 }, { "epoch": 1.5294117647058822, "grad_norm": 1.52526316606716, "learning_rate": 3.021973522911749e-05, "loss": 0.2287, "step": 962 }, { "epoch": 1.5310015898251192, "grad_norm": 1.3326802009714298, "learning_rate": 3.0220949775838515e-05, "loss": 0.2242, "step": 963 }, { "epoch": 1.5325914149443562, "grad_norm": 1.1738312452307538, "learning_rate": 3.022216763241394e-05, "loss": 0.2364, "step": 964 }, { "epoch": 1.534181240063593, "grad_norm": 2.1191503748880214, "learning_rate": 3.0223388798431565e-05, "loss": 0.2455, "step": 965 }, { "epoch": 1.5357710651828298, "grad_norm": 1.6451229321504048, "learning_rate": 3.0224613273478083e-05, "loss": 0.2382, "step": 966 }, { "epoch": 1.5373608903020668, "grad_norm": 1.1760832431101025, "learning_rate": 3.0225841057139037e-05, "loss": 0.2042, "step": 967 }, { "epoch": 1.5389507154213038, "grad_norm": 0.9870826186562225, "learning_rate": 3.0227072148998876e-05, "loss": 0.1567, "step": 968 }, { "epoch": 1.5405405405405406, "grad_norm": 0.8098166878215339, "learning_rate": 3.0228306548640926e-05, "loss": 0.1664, "step": 969 }, { "epoch": 1.5421303656597773, "grad_norm": 0.9277490490796043, "learning_rate": 3.022954425564736e-05, "loss": 0.193, "step": 970 }, { "epoch": 1.5437201907790143, "grad_norm": 0.938893933094649, "learning_rate": 3.0230785269599295e-05, "loss": 0.1918, "step": 971 }, { "epoch": 1.5453100158982513, "grad_norm": 1.3099503737392495, "learning_rate": 3.0232029590076657e-05, "loss": 0.1976, "step": 972 }, { "epoch": 1.5468998410174881, "grad_norm": 0.8096349538133699, "learning_rate": 3.0233277216658317e-05, "loss": 0.2131, "step": 973 }, { "epoch": 1.548489666136725, "grad_norm": 1.1287311944686815, "learning_rate": 3.0234528148922e-05, "loss": 0.2728, "step": 974 }, { "epoch": 1.550079491255962, "grad_norm": 1.7202651937718745, "learning_rate": 3.023578238644428e-05, "loss": 0.2265, "step": 975 }, { "epoch": 1.551669316375199, "grad_norm": 1.0953245974833956, "learning_rate": 3.023703992880067e-05, "loss": 0.2155, "step": 976 }, { "epoch": 1.5532591414944354, "grad_norm": 1.070851671683841, "learning_rate": 3.0238300775565523e-05, "loss": 0.1758, "step": 977 }, { "epoch": 1.5548489666136724, "grad_norm": 1.0680349712292683, "learning_rate": 3.0239564926312096e-05, "loss": 0.219, "step": 978 }, { "epoch": 1.5564387917329094, "grad_norm": 2.095857618985453, "learning_rate": 3.024083238061253e-05, "loss": 0.2736, "step": 979 }, { "epoch": 1.5580286168521462, "grad_norm": 1.2518626882494475, "learning_rate": 3.0242103138037816e-05, "loss": 0.1724, "step": 980 }, { "epoch": 1.559618441971383, "grad_norm": 0.9340994254212922, "learning_rate": 3.0243377198157862e-05, "loss": 0.2236, "step": 981 }, { "epoch": 1.56120826709062, "grad_norm": 1.345376697276294, "learning_rate": 3.0244654560541437e-05, "loss": 0.2625, "step": 982 }, { "epoch": 1.562798092209857, "grad_norm": 0.7581661333420866, "learning_rate": 3.0245935224756205e-05, "loss": 0.1954, "step": 983 }, { "epoch": 1.5643879173290938, "grad_norm": 0.7877809864590615, "learning_rate": 3.0247219190368703e-05, "loss": 0.2382, "step": 984 }, { "epoch": 1.5659777424483305, "grad_norm": 0.7596260285949576, "learning_rate": 3.0248506456944368e-05, "loss": 0.1981, "step": 985 }, { "epoch": 1.5675675675675675, "grad_norm": 1.3782681509129533, "learning_rate": 3.0249797024047494e-05, "loss": 0.1895, "step": 986 }, { "epoch": 1.5691573926868045, "grad_norm": 1.6252659542161834, "learning_rate": 3.0251090891241272e-05, "loss": 0.224, "step": 987 }, { "epoch": 1.5707472178060413, "grad_norm": 1.5071942409217602, "learning_rate": 3.0252388058087784e-05, "loss": 0.2437, "step": 988 }, { "epoch": 1.572337042925278, "grad_norm": 1.3492742680725291, "learning_rate": 3.0253688524147967e-05, "loss": 0.2216, "step": 989 }, { "epoch": 1.573926868044515, "grad_norm": 1.0371093488450234, "learning_rate": 3.0254992288981687e-05, "loss": 0.1956, "step": 990 }, { "epoch": 1.575516693163752, "grad_norm": 1.0109677958154635, "learning_rate": 3.0256299352147643e-05, "loss": 0.195, "step": 991 }, { "epoch": 1.5771065182829889, "grad_norm": 1.3165468239691032, "learning_rate": 3.0257609713203464e-05, "loss": 0.3447, "step": 992 }, { "epoch": 1.5786963434022256, "grad_norm": 1.2345693498313024, "learning_rate": 3.0258923371705615e-05, "loss": 0.2736, "step": 993 }, { "epoch": 1.5802861685214626, "grad_norm": 0.7140179416371315, "learning_rate": 3.026024032720948e-05, "loss": 0.1818, "step": 994 }, { "epoch": 1.5818759936406996, "grad_norm": 0.8561793343494055, "learning_rate": 3.0261560579269328e-05, "loss": 0.2059, "step": 995 }, { "epoch": 1.5834658187599364, "grad_norm": 0.6707325810119518, "learning_rate": 3.0262884127438286e-05, "loss": 0.1424, "step": 996 }, { "epoch": 1.5850556438791732, "grad_norm": 1.1564566657592257, "learning_rate": 3.02642109712684e-05, "loss": 0.1924, "step": 997 }, { "epoch": 1.5866454689984102, "grad_norm": 2.7521413005165845, "learning_rate": 3.0265541110310563e-05, "loss": 0.3609, "step": 998 }, { "epoch": 1.5882352941176472, "grad_norm": 20.195928288242115, "learning_rate": 3.0266874544114577e-05, "loss": 28.6857, "step": 999 }, { "epoch": 1.589825119236884, "grad_norm": 0.798479618225531, "learning_rate": 3.026821127222912e-05, "loss": 0.2308, "step": 1000 }, { "epoch": 1.5914149443561207, "grad_norm": 0.7347165195035272, "learning_rate": 3.026955129420176e-05, "loss": 0.1767, "step": 1001 }, { "epoch": 1.5930047694753577, "grad_norm": 1.6135172401601705, "learning_rate": 3.0270894609578962e-05, "loss": 0.2331, "step": 1002 }, { "epoch": 1.5945945945945947, "grad_norm": 1.3851580712301266, "learning_rate": 3.0272241217906033e-05, "loss": 0.1947, "step": 1003 }, { "epoch": 1.5961844197138315, "grad_norm": 1.198578547234958, "learning_rate": 3.0273591118727226e-05, "loss": 0.1809, "step": 1004 }, { "epoch": 1.5977742448330683, "grad_norm": 0.6671479533763305, "learning_rate": 3.0274944311585624e-05, "loss": 0.139, "step": 1005 }, { "epoch": 1.5993640699523053, "grad_norm": 1.1322573374797236, "learning_rate": 3.0276300796023234e-05, "loss": 0.2419, "step": 1006 }, { "epoch": 1.6009538950715423, "grad_norm": 0.8070991166539896, "learning_rate": 3.0277660571580933e-05, "loss": 0.1624, "step": 1007 }, { "epoch": 1.602543720190779, "grad_norm": 1.1182059640193802, "learning_rate": 3.027902363779848e-05, "loss": 0.2045, "step": 1008 }, { "epoch": 1.6041335453100158, "grad_norm": 1.0690797330581425, "learning_rate": 3.0280389994214533e-05, "loss": 0.244, "step": 1009 }, { "epoch": 1.6057233704292528, "grad_norm": 0.6576839834045582, "learning_rate": 3.028175964036664e-05, "loss": 0.127, "step": 1010 }, { "epoch": 1.6073131955484896, "grad_norm": 1.116952111367835, "learning_rate": 3.02831325757912e-05, "loss": 0.1877, "step": 1011 }, { "epoch": 1.6089030206677264, "grad_norm": 0.7363357072866892, "learning_rate": 3.0284508800023537e-05, "loss": 0.1853, "step": 1012 }, { "epoch": 1.6104928457869634, "grad_norm": 1.3332374340482385, "learning_rate": 3.0285888312597856e-05, "loss": 0.2172, "step": 1013 }, { "epoch": 1.6120826709062004, "grad_norm": 1.0798458422470192, "learning_rate": 3.0287271113047227e-05, "loss": 0.1868, "step": 1014 }, { "epoch": 1.6136724960254372, "grad_norm": 1.4191422200107795, "learning_rate": 3.028865720090364e-05, "loss": 0.3528, "step": 1015 }, { "epoch": 1.615262321144674, "grad_norm": 22.39968352243893, "learning_rate": 3.0290046575697942e-05, "loss": 29.3057, "step": 1016 }, { "epoch": 1.616852146263911, "grad_norm": 0.8251643810932937, "learning_rate": 3.0291439236959885e-05, "loss": 0.216, "step": 1017 }, { "epoch": 1.618441971383148, "grad_norm": 1.0129065451329513, "learning_rate": 3.0292835184218094e-05, "loss": 0.2122, "step": 1018 }, { "epoch": 1.6200317965023847, "grad_norm": 1.3383227005604625, "learning_rate": 3.02942344170001e-05, "loss": 0.2426, "step": 1019 }, { "epoch": 1.6216216216216215, "grad_norm": 1.039568889190428, "learning_rate": 3.0295636934832317e-05, "loss": 0.2075, "step": 1020 }, { "epoch": 1.6232114467408585, "grad_norm": 1.4551943038682469, "learning_rate": 3.029704273724004e-05, "loss": 0.2207, "step": 1021 }, { "epoch": 1.6248012718600955, "grad_norm": 1.7130016701184296, "learning_rate": 3.029845182374745e-05, "loss": 0.2328, "step": 1022 }, { "epoch": 1.6263910969793323, "grad_norm": 1.3177197780658378, "learning_rate": 3.029986419387762e-05, "loss": 0.2341, "step": 1023 }, { "epoch": 1.627980922098569, "grad_norm": 1.1661081801724766, "learning_rate": 3.030127984715253e-05, "loss": 0.1661, "step": 1024 }, { "epoch": 1.629570747217806, "grad_norm": 1.2114763315599728, "learning_rate": 3.0302698783093024e-05, "loss": 0.2163, "step": 1025 }, { "epoch": 1.631160572337043, "grad_norm": 1.7429045215392118, "learning_rate": 3.0304121001218837e-05, "loss": 0.2368, "step": 1026 }, { "epoch": 1.6327503974562798, "grad_norm": 1.229976150320729, "learning_rate": 3.0305546501048617e-05, "loss": 0.3057, "step": 1027 }, { "epoch": 1.6343402225755166, "grad_norm": 0.8277196856122726, "learning_rate": 3.030697528209986e-05, "loss": 0.1898, "step": 1028 }, { "epoch": 1.6359300476947536, "grad_norm": 1.2518459872273497, "learning_rate": 3.0308407343888985e-05, "loss": 0.1998, "step": 1029 }, { "epoch": 1.6375198728139906, "grad_norm": 1.1442554900078146, "learning_rate": 3.0309842685931303e-05, "loss": 0.2276, "step": 1030 }, { "epoch": 1.6391096979332274, "grad_norm": 0.999737171772209, "learning_rate": 3.0311281307740995e-05, "loss": 0.1835, "step": 1031 }, { "epoch": 1.6406995230524641, "grad_norm": 0.8762389730055639, "learning_rate": 3.0312723208831133e-05, "loss": 0.2216, "step": 1032 }, { "epoch": 1.6422893481717011, "grad_norm": 0.8722779791702073, "learning_rate": 3.0314168388713687e-05, "loss": 0.2085, "step": 1033 }, { "epoch": 1.6438791732909381, "grad_norm": 1.651788641872156, "learning_rate": 3.031561684689953e-05, "loss": 0.9461, "step": 1034 }, { "epoch": 1.645468998410175, "grad_norm": 1.1482000486638182, "learning_rate": 3.0317068582898385e-05, "loss": 0.3155, "step": 1035 }, { "epoch": 1.6470588235294117, "grad_norm": 1.365565134471463, "learning_rate": 3.031852359621892e-05, "loss": 0.2357, "step": 1036 }, { "epoch": 1.6486486486486487, "grad_norm": 1.0540632683825748, "learning_rate": 3.031998188636865e-05, "loss": 0.2552, "step": 1037 }, { "epoch": 1.6502384737678857, "grad_norm": 0.8327786071028546, "learning_rate": 3.032144345285401e-05, "loss": 0.1849, "step": 1038 }, { "epoch": 1.6518282988871225, "grad_norm": 0.7438323277877603, "learning_rate": 3.032290829518029e-05, "loss": 0.1984, "step": 1039 }, { "epoch": 1.6534181240063592, "grad_norm": 0.7180055674139668, "learning_rate": 3.0324376412851707e-05, "loss": 0.1646, "step": 1040 }, { "epoch": 1.6550079491255962, "grad_norm": 1.026385899745576, "learning_rate": 3.032584780537136e-05, "loss": 0.2127, "step": 1041 }, { "epoch": 1.6565977742448332, "grad_norm": 0.9562139614886938, "learning_rate": 3.0327322472241228e-05, "loss": 0.2218, "step": 1042 }, { "epoch": 1.6581875993640698, "grad_norm": 6.774052589047219, "learning_rate": 3.0328800412962206e-05, "loss": 0.3589, "step": 1043 }, { "epoch": 1.6597774244833068, "grad_norm": 0.9231033126659791, "learning_rate": 3.0330281627034043e-05, "loss": 0.1542, "step": 1044 }, { "epoch": 1.6613672496025438, "grad_norm": 0.961980245210236, "learning_rate": 3.0331766113955405e-05, "loss": 0.2483, "step": 1045 }, { "epoch": 1.6629570747217806, "grad_norm": 1.8238879872432594, "learning_rate": 3.033325387322386e-05, "loss": 0.2628, "step": 1046 }, { "epoch": 1.6645468998410173, "grad_norm": 1.2353631896320034, "learning_rate": 3.0334744904335844e-05, "loss": 0.1984, "step": 1047 }, { "epoch": 1.6661367249602543, "grad_norm": 1.5247606680022396, "learning_rate": 3.03362392067867e-05, "loss": 0.2095, "step": 1048 }, { "epoch": 1.6677265500794913, "grad_norm": 1.6499831849247582, "learning_rate": 3.033773678007067e-05, "loss": 0.1796, "step": 1049 }, { "epoch": 1.669316375198728, "grad_norm": 1.4813913359466746, "learning_rate": 3.0339237623680876e-05, "loss": 0.2163, "step": 1050 }, { "epoch": 1.6709062003179649, "grad_norm": 7.596141787336853, "learning_rate": 3.0340741737109322e-05, "loss": 9.2847, "step": 1051 }, { "epoch": 1.6724960254372019, "grad_norm": 1.0369600353881152, "learning_rate": 3.034224911984693e-05, "loss": 0.2121, "step": 1052 }, { "epoch": 1.6740858505564389, "grad_norm": 1.1020226538041997, "learning_rate": 3.034375977138351e-05, "loss": 0.1804, "step": 1053 }, { "epoch": 1.6756756756756757, "grad_norm": 0.908365170734913, "learning_rate": 3.0345273691207747e-05, "loss": 0.1553, "step": 1054 }, { "epoch": 1.6772655007949124, "grad_norm": 1.2695625802478705, "learning_rate": 3.034679087880726e-05, "loss": 0.2499, "step": 1055 }, { "epoch": 1.6788553259141494, "grad_norm": 0.8283985508808241, "learning_rate": 3.0348311333668503e-05, "loss": 0.1774, "step": 1056 }, { "epoch": 1.6804451510333864, "grad_norm": 1.719289282047525, "learning_rate": 3.0349835055276883e-05, "loss": 0.2125, "step": 1057 }, { "epoch": 1.6820349761526232, "grad_norm": 1.3724527186516684, "learning_rate": 3.035136204311667e-05, "loss": 0.2118, "step": 1058 }, { "epoch": 1.68362480127186, "grad_norm": 1.199393190286099, "learning_rate": 3.035289229667102e-05, "loss": 0.2746, "step": 1059 }, { "epoch": 1.685214626391097, "grad_norm": 1.4838050003907521, "learning_rate": 3.0354425815422017e-05, "loss": 0.2221, "step": 1060 }, { "epoch": 1.686804451510334, "grad_norm": 0.9449772673819271, "learning_rate": 3.035596259885061e-05, "loss": 0.2046, "step": 1061 }, { "epoch": 1.6883942766295708, "grad_norm": 0.7772298693527745, "learning_rate": 3.0357502646436654e-05, "loss": 0.1979, "step": 1062 }, { "epoch": 1.6899841017488075, "grad_norm": 1.7922532106276083, "learning_rate": 3.03590459576589e-05, "loss": 0.1719, "step": 1063 }, { "epoch": 1.6915739268680445, "grad_norm": 1.4589296485518637, "learning_rate": 3.036059253199499e-05, "loss": 0.1684, "step": 1064 }, { "epoch": 1.6931637519872815, "grad_norm": 1.0788749078493898, "learning_rate": 3.0362142368921467e-05, "loss": 0.1816, "step": 1065 }, { "epoch": 1.6947535771065183, "grad_norm": 1.7689777578839445, "learning_rate": 3.036369546791377e-05, "loss": 0.2081, "step": 1066 }, { "epoch": 1.696343402225755, "grad_norm": 1.0230698090068941, "learning_rate": 3.0365251828446224e-05, "loss": 0.1671, "step": 1067 }, { "epoch": 1.697933227344992, "grad_norm": 0.7742377541212698, "learning_rate": 3.0366811449992066e-05, "loss": 0.2232, "step": 1068 }, { "epoch": 1.699523052464229, "grad_norm": 1.0728562233160406, "learning_rate": 3.0368374332023418e-05, "loss": 0.2004, "step": 1069 }, { "epoch": 1.7011128775834659, "grad_norm": 1.6340510251172695, "learning_rate": 3.03699404740113e-05, "loss": 0.1809, "step": 1070 }, { "epoch": 1.7027027027027026, "grad_norm": 2.5581660674212094, "learning_rate": 3.037150987542562e-05, "loss": 0.2397, "step": 1071 }, { "epoch": 1.7042925278219396, "grad_norm": 0.8876447250480038, "learning_rate": 3.0373082535735213e-05, "loss": 0.1879, "step": 1072 }, { "epoch": 1.7058823529411766, "grad_norm": 2.2660333787437197, "learning_rate": 3.037465845440777e-05, "loss": 0.3689, "step": 1073 }, { "epoch": 1.7074721780604134, "grad_norm": 1.081687108896482, "learning_rate": 3.0376237630909923e-05, "loss": 0.1641, "step": 1074 }, { "epoch": 1.7090620031796502, "grad_norm": 1.1329291134230637, "learning_rate": 3.0377820064707148e-05, "loss": 0.2323, "step": 1075 }, { "epoch": 1.7106518282988872, "grad_norm": 1.3131083197263513, "learning_rate": 3.0379405755263873e-05, "loss": 0.1821, "step": 1076 }, { "epoch": 1.712241653418124, "grad_norm": 1.263863001755885, "learning_rate": 3.0380994702043387e-05, "loss": 0.2343, "step": 1077 }, { "epoch": 1.7138314785373607, "grad_norm": 0.9150777263038812, "learning_rate": 3.0382586904507885e-05, "loss": 0.1948, "step": 1078 }, { "epoch": 1.7154213036565977, "grad_norm": 1.015222000314037, "learning_rate": 3.0384182362118484e-05, "loss": 0.2387, "step": 1079 }, { "epoch": 1.7170111287758347, "grad_norm": 2.411797433549411, "learning_rate": 3.0385781074335162e-05, "loss": 0.3066, "step": 1080 }, { "epoch": 1.7186009538950715, "grad_norm": 0.9377602754823323, "learning_rate": 3.0387383040616815e-05, "loss": 0.2277, "step": 1081 }, { "epoch": 1.7201907790143083, "grad_norm": 0.8059865844581234, "learning_rate": 3.0388988260421242e-05, "loss": 0.1716, "step": 1082 }, { "epoch": 1.7217806041335453, "grad_norm": 1.6384585871448132, "learning_rate": 3.039059673320513e-05, "loss": 0.2632, "step": 1083 }, { "epoch": 1.7233704292527823, "grad_norm": 0.7980509939317432, "learning_rate": 3.0392208458424052e-05, "loss": 0.2545, "step": 1084 }, { "epoch": 1.724960254372019, "grad_norm": 1.104975954260964, "learning_rate": 3.0393823435532537e-05, "loss": 0.1616, "step": 1085 }, { "epoch": 1.7265500794912558, "grad_norm": 0.8893150566169342, "learning_rate": 3.039544166398395e-05, "loss": 0.2233, "step": 1086 }, { "epoch": 1.7281399046104928, "grad_norm": 1.28439427250214, "learning_rate": 3.0397063143230567e-05, "loss": 0.259, "step": 1087 }, { "epoch": 1.7297297297297298, "grad_norm": 0.9780766769616641, "learning_rate": 3.0398687872723604e-05, "loss": 0.139, "step": 1088 }, { "epoch": 1.7313195548489666, "grad_norm": 0.7245102415263498, "learning_rate": 3.0400315851913126e-05, "loss": 0.1496, "step": 1089 }, { "epoch": 1.7329093799682034, "grad_norm": 0.8705576855126401, "learning_rate": 3.0401947080248135e-05, "loss": 0.2256, "step": 1090 }, { "epoch": 1.7344992050874404, "grad_norm": 0.7183604194049938, "learning_rate": 3.040358155717651e-05, "loss": 0.1597, "step": 1091 }, { "epoch": 1.7360890302066774, "grad_norm": 0.709877694558199, "learning_rate": 3.0405219282145045e-05, "loss": 0.1465, "step": 1092 }, { "epoch": 1.7376788553259142, "grad_norm": 1.8034292202089164, "learning_rate": 3.040686025459943e-05, "loss": 0.2167, "step": 1093 }, { "epoch": 1.739268680445151, "grad_norm": 1.0610575768570916, "learning_rate": 3.0408504473984248e-05, "loss": 0.2708, "step": 1094 }, { "epoch": 1.740858505564388, "grad_norm": 0.7450606370961692, "learning_rate": 3.0410151939742995e-05, "loss": 0.19, "step": 1095 }, { "epoch": 1.742448330683625, "grad_norm": 0.6325677092298524, "learning_rate": 3.0411802651318065e-05, "loss": 0.1757, "step": 1096 }, { "epoch": 1.7440381558028617, "grad_norm": 0.82653665205964, "learning_rate": 3.041345660815076e-05, "loss": 0.1904, "step": 1097 }, { "epoch": 1.7456279809220985, "grad_norm": 1.0873877231830769, "learning_rate": 3.0415113809681256e-05, "loss": 0.1822, "step": 1098 }, { "epoch": 1.7472178060413355, "grad_norm": 0.9919977370324129, "learning_rate": 3.041677425534867e-05, "loss": 0.1911, "step": 1099 }, { "epoch": 1.7488076311605725, "grad_norm": 0.589273071212331, "learning_rate": 3.0418437944590988e-05, "loss": 0.1832, "step": 1100 }, { "epoch": 1.7503974562798092, "grad_norm": 1.0208086383484447, "learning_rate": 3.042010487684511e-05, "loss": 0.2155, "step": 1101 }, { "epoch": 1.751987281399046, "grad_norm": 1.0176715594852468, "learning_rate": 3.042177505154685e-05, "loss": 0.1866, "step": 1102 }, { "epoch": 1.753577106518283, "grad_norm": 1.0756777747046793, "learning_rate": 3.042344846813091e-05, "loss": 0.2109, "step": 1103 }, { "epoch": 1.75516693163752, "grad_norm": 0.8227900305398175, "learning_rate": 3.0425125126030896e-05, "loss": 0.1818, "step": 1104 }, { "epoch": 1.7567567567567568, "grad_norm": 0.9301873768339785, "learning_rate": 3.0426805024679327e-05, "loss": 0.2194, "step": 1105 }, { "epoch": 1.7583465818759936, "grad_norm": 1.5434973299121142, "learning_rate": 3.042848816350761e-05, "loss": 0.4487, "step": 1106 }, { "epoch": 1.7599364069952306, "grad_norm": 1.1556004683668828, "learning_rate": 3.0430174541946077e-05, "loss": 0.2309, "step": 1107 }, { "epoch": 1.7615262321144676, "grad_norm": 0.870734955334347, "learning_rate": 3.0431864159423924e-05, "loss": 0.2494, "step": 1108 }, { "epoch": 1.7631160572337043, "grad_norm": 0.8468628260534098, "learning_rate": 3.043355701536931e-05, "loss": 0.2457, "step": 1109 }, { "epoch": 1.7647058823529411, "grad_norm": 1.1019902423387296, "learning_rate": 3.043525310920923e-05, "loss": 0.176, "step": 1110 }, { "epoch": 1.7662957074721781, "grad_norm": 1.115427506141178, "learning_rate": 3.0436952440369646e-05, "loss": 0.2072, "step": 1111 }, { "epoch": 1.767885532591415, "grad_norm": 0.6635894962732646, "learning_rate": 3.0438655008275384e-05, "loss": 0.2005, "step": 1112 }, { "epoch": 1.7694753577106517, "grad_norm": 0.9838085526265301, "learning_rate": 3.044036081235019e-05, "loss": 0.2019, "step": 1113 }, { "epoch": 1.7710651828298887, "grad_norm": 1.0367992713995047, "learning_rate": 3.0442069852016696e-05, "loss": 0.2042, "step": 1114 }, { "epoch": 1.7726550079491257, "grad_norm": 0.9270555243151636, "learning_rate": 3.0443782126696473e-05, "loss": 0.1869, "step": 1115 }, { "epoch": 1.7742448330683624, "grad_norm": 0.7373418222253314, "learning_rate": 3.0445497635809985e-05, "loss": 0.1739, "step": 1116 }, { "epoch": 1.7758346581875992, "grad_norm": 0.9030385129023926, "learning_rate": 3.0447216378776562e-05, "loss": 0.2012, "step": 1117 }, { "epoch": 1.7774244833068362, "grad_norm": 0.9298645783725915, "learning_rate": 3.0448938355014496e-05, "loss": 0.1638, "step": 1118 }, { "epoch": 1.7790143084260732, "grad_norm": 1.0646618244471782, "learning_rate": 3.045066356394096e-05, "loss": 0.1644, "step": 1119 }, { "epoch": 1.78060413354531, "grad_norm": 0.6626942473834987, "learning_rate": 3.045239200497202e-05, "loss": 0.1658, "step": 1120 }, { "epoch": 1.7821939586645468, "grad_norm": 0.5893559076292791, "learning_rate": 3.045412367752268e-05, "loss": 0.1768, "step": 1121 }, { "epoch": 1.7837837837837838, "grad_norm": 0.7776302265151359, "learning_rate": 3.045585858100682e-05, "loss": 0.1573, "step": 1122 }, { "epoch": 1.7853736089030208, "grad_norm": 1.0535407381034128, "learning_rate": 3.0457596714837234e-05, "loss": 0.2013, "step": 1123 }, { "epoch": 1.7869634340222575, "grad_norm": 1.4270550339893346, "learning_rate": 3.0459338078425624e-05, "loss": 0.287, "step": 1124 }, { "epoch": 1.7885532591414943, "grad_norm": 0.9610717184811765, "learning_rate": 3.046108267118263e-05, "loss": 0.2835, "step": 1125 }, { "epoch": 1.7901430842607313, "grad_norm": 0.6208644091114227, "learning_rate": 3.0462830492517734e-05, "loss": 0.152, "step": 1126 }, { "epoch": 1.7917329093799683, "grad_norm": 1.747713820343999, "learning_rate": 3.046458154183938e-05, "loss": 0.4747, "step": 1127 }, { "epoch": 1.793322734499205, "grad_norm": 1.1523204040244563, "learning_rate": 3.04663358185549e-05, "loss": 0.2613, "step": 1128 }, { "epoch": 1.7949125596184419, "grad_norm": 1.5645458998789783, "learning_rate": 3.0468093322070527e-05, "loss": 0.218, "step": 1129 }, { "epoch": 1.7965023847376789, "grad_norm": 1.0264419892393362, "learning_rate": 3.0469854051791432e-05, "loss": 0.1817, "step": 1130 }, { "epoch": 1.7980922098569159, "grad_norm": 1.2162302125395081, "learning_rate": 3.047161800712164e-05, "loss": 1.0549, "step": 1131 }, { "epoch": 1.7996820349761526, "grad_norm": 1.0745801815243352, "learning_rate": 3.0473385187464133e-05, "loss": 0.2527, "step": 1132 }, { "epoch": 1.8012718600953894, "grad_norm": 0.728424129168066, "learning_rate": 3.0475155592220794e-05, "loss": 0.1685, "step": 1133 }, { "epoch": 1.8028616852146264, "grad_norm": 14.728235801658727, "learning_rate": 3.0476929220792394e-05, "loss": 20.7016, "step": 1134 }, { "epoch": 1.8044515103338634, "grad_norm": 2.1920150345067864, "learning_rate": 3.0478706072578618e-05, "loss": 0.186, "step": 1135 }, { "epoch": 1.8060413354531002, "grad_norm": 1.0751156691894526, "learning_rate": 3.0480486146978074e-05, "loss": 0.1747, "step": 1136 }, { "epoch": 1.807631160572337, "grad_norm": 0.8908957051791061, "learning_rate": 3.048226944338827e-05, "loss": 0.1716, "step": 1137 }, { "epoch": 1.809220985691574, "grad_norm": 1.8754035281827828, "learning_rate": 3.0484055961205618e-05, "loss": 0.2283, "step": 1138 }, { "epoch": 1.810810810810811, "grad_norm": 0.9899432694217898, "learning_rate": 3.0485845699825457e-05, "loss": 0.1687, "step": 1139 }, { "epoch": 1.8124006359300477, "grad_norm": 4.106341204357688, "learning_rate": 3.0487638658642025e-05, "loss": 0.3294, "step": 1140 }, { "epoch": 1.8139904610492845, "grad_norm": 0.6275997073949349, "learning_rate": 3.0489434837048468e-05, "loss": 0.18, "step": 1141 }, { "epoch": 1.8155802861685215, "grad_norm": 0.8831001047730209, "learning_rate": 3.049123423443684e-05, "loss": 0.215, "step": 1142 }, { "epoch": 1.8171701112877583, "grad_norm": 0.7183832030294394, "learning_rate": 3.0493036850198112e-05, "loss": 0.1682, "step": 1143 }, { "epoch": 1.818759936406995, "grad_norm": 1.9953112788424914, "learning_rate": 3.0494842683722162e-05, "loss": 0.2591, "step": 1144 }, { "epoch": 1.820349761526232, "grad_norm": 1.3171750197182506, "learning_rate": 3.049665173439779e-05, "loss": 0.1961, "step": 1145 }, { "epoch": 1.821939586645469, "grad_norm": 0.8114874081953771, "learning_rate": 3.049846400161269e-05, "loss": 0.1802, "step": 1146 }, { "epoch": 1.8235294117647058, "grad_norm": 3.2290139570364276, "learning_rate": 3.0500279484753472e-05, "loss": 0.2699, "step": 1147 }, { "epoch": 1.8251192368839426, "grad_norm": 1.3570592685888634, "learning_rate": 3.0502098183205673e-05, "loss": 0.2247, "step": 1148 }, { "epoch": 1.8267090620031796, "grad_norm": 0.8368270553854885, "learning_rate": 3.0503920096353727e-05, "loss": 0.157, "step": 1149 }, { "epoch": 1.8282988871224166, "grad_norm": 0.9969744333628553, "learning_rate": 3.0505745223580955e-05, "loss": 0.1836, "step": 1150 }, { "epoch": 1.8298887122416534, "grad_norm": 1.234835673420031, "learning_rate": 3.0507573564269658e-05, "loss": 0.1584, "step": 1151 }, { "epoch": 1.8314785373608902, "grad_norm": 27.351762679014104, "learning_rate": 3.0509405117800992e-05, "loss": 28.4039, "step": 1152 }, { "epoch": 1.8330683624801272, "grad_norm": 1.000350779444888, "learning_rate": 3.0511239883555036e-05, "loss": 0.1827, "step": 1153 }, { "epoch": 1.8346581875993642, "grad_norm": 0.8498434814140499, "learning_rate": 3.051307786091079e-05, "loss": 0.1638, "step": 1154 }, { "epoch": 1.836248012718601, "grad_norm": 0.8981793988731618, "learning_rate": 3.051491904924617e-05, "loss": 0.2264, "step": 1155 }, { "epoch": 1.8378378378378377, "grad_norm": 0.8397168290103142, "learning_rate": 3.0516763447938013e-05, "loss": 0.2034, "step": 1156 }, { "epoch": 1.8394276629570747, "grad_norm": 1.1000143944864458, "learning_rate": 3.0518611056362026e-05, "loss": 0.2631, "step": 1157 }, { "epoch": 1.8410174880763117, "grad_norm": 0.7999298844899428, "learning_rate": 3.052046187389289e-05, "loss": 0.2086, "step": 1158 }, { "epoch": 1.8426073131955485, "grad_norm": 1.3985210103400227, "learning_rate": 3.052231589990414e-05, "loss": 0.2358, "step": 1159 }, { "epoch": 1.8441971383147853, "grad_norm": 0.7009887118889492, "learning_rate": 3.052417313376829e-05, "loss": 0.1429, "step": 1160 }, { "epoch": 1.8457869634340223, "grad_norm": 0.8599550223032569, "learning_rate": 3.0526033574856707e-05, "loss": 0.1608, "step": 1161 }, { "epoch": 1.8473767885532593, "grad_norm": 0.8294913983800233, "learning_rate": 3.052789722253971e-05, "loss": 0.1515, "step": 1162 }, { "epoch": 1.848966613672496, "grad_norm": 1.0388904243923291, "learning_rate": 3.052976407618652e-05, "loss": 0.3636, "step": 1163 }, { "epoch": 1.8505564387917328, "grad_norm": 6.67236447135243, "learning_rate": 3.0531634135165287e-05, "loss": 39.2069, "step": 1164 }, { "epoch": 1.8521462639109698, "grad_norm": 1.1797548102934703, "learning_rate": 3.0533507398843035e-05, "loss": 0.3433, "step": 1165 }, { "epoch": 1.8537360890302068, "grad_norm": 0.9093756763452346, "learning_rate": 3.053538386658576e-05, "loss": 0.1486, "step": 1166 }, { "epoch": 1.8553259141494436, "grad_norm": 2.1277926822781215, "learning_rate": 3.053726353775832e-05, "loss": 0.2027, "step": 1167 }, { "epoch": 1.8569157392686804, "grad_norm": 1.3430101632288174, "learning_rate": 3.053914641172455e-05, "loss": 0.2591, "step": 1168 }, { "epoch": 1.8585055643879174, "grad_norm": 1.4628339002093025, "learning_rate": 3.0541032487847134e-05, "loss": 0.1841, "step": 1169 }, { "epoch": 1.8600953895071544, "grad_norm": 1.2276040061269424, "learning_rate": 3.05429217654877e-05, "loss": 0.2073, "step": 1170 }, { "epoch": 1.8616852146263911, "grad_norm": 1.3848795173779, "learning_rate": 3.0544814244006825e-05, "loss": 0.2098, "step": 1171 }, { "epoch": 1.863275039745628, "grad_norm": 1.3364132832604656, "learning_rate": 3.054670992276397e-05, "loss": 0.2028, "step": 1172 }, { "epoch": 1.864864864864865, "grad_norm": 0.68593815657712, "learning_rate": 3.054860880111748e-05, "loss": 0.166, "step": 1173 }, { "epoch": 1.866454689984102, "grad_norm": 1.0344260040302988, "learning_rate": 3.055051087842469e-05, "loss": 0.2093, "step": 1174 }, { "epoch": 1.8680445151033387, "grad_norm": 1.9159640101298376, "learning_rate": 3.0552416154041804e-05, "loss": 0.2449, "step": 1175 }, { "epoch": 1.8696343402225755, "grad_norm": 1.318210499922234, "learning_rate": 3.055432462732395e-05, "loss": 0.1904, "step": 1176 }, { "epoch": 1.8712241653418125, "grad_norm": 2.0388627574881166, "learning_rate": 3.0556236297625195e-05, "loss": 0.2366, "step": 1177 }, { "epoch": 1.8728139904610492, "grad_norm": 0.9143372839341986, "learning_rate": 3.055815116429849e-05, "loss": 0.1673, "step": 1178 }, { "epoch": 1.874403815580286, "grad_norm": 0.8141043688671673, "learning_rate": 3.056006922669572e-05, "loss": 0.1781, "step": 1179 }, { "epoch": 1.875993640699523, "grad_norm": 1.0319270471681976, "learning_rate": 3.056199048416771e-05, "loss": 0.1863, "step": 1180 }, { "epoch": 1.87758346581876, "grad_norm": 1.2351467416731574, "learning_rate": 3.0563914936064166e-05, "loss": 0.2838, "step": 1181 }, { "epoch": 1.8791732909379968, "grad_norm": 1.5447496122492863, "learning_rate": 3.0565842581733744e-05, "loss": 0.2709, "step": 1182 }, { "epoch": 1.8807631160572336, "grad_norm": 0.9662387779468968, "learning_rate": 3.0567773420523996e-05, "loss": 0.2416, "step": 1183 }, { "epoch": 1.8823529411764706, "grad_norm": 1.1894142022997076, "learning_rate": 3.05697074517814e-05, "loss": 0.2758, "step": 1184 }, { "epoch": 1.8839427662957076, "grad_norm": 1.5308181855530882, "learning_rate": 3.057164467485137e-05, "loss": 0.2031, "step": 1185 }, { "epoch": 1.8855325914149443, "grad_norm": 1.2721557156335144, "learning_rate": 3.0573585089078214e-05, "loss": 0.1981, "step": 1186 }, { "epoch": 1.8871224165341811, "grad_norm": 1.0550759938044105, "learning_rate": 3.0575528693805184e-05, "loss": 0.1587, "step": 1187 }, { "epoch": 1.8887122416534181, "grad_norm": 1.2722089586666157, "learning_rate": 3.057747548837443e-05, "loss": 0.1592, "step": 1188 }, { "epoch": 1.890302066772655, "grad_norm": 1.0533233246999543, "learning_rate": 3.057942547212703e-05, "loss": 0.2189, "step": 1189 }, { "epoch": 1.8918918918918919, "grad_norm": 1.052896557333416, "learning_rate": 3.0581378644403e-05, "loss": 0.1603, "step": 1190 }, { "epoch": 1.8934817170111287, "grad_norm": 0.8152404984290667, "learning_rate": 3.0583335004541253e-05, "loss": 0.1689, "step": 1191 }, { "epoch": 1.8950715421303657, "grad_norm": 0.8616097768579959, "learning_rate": 3.058529455187962e-05, "loss": 0.1801, "step": 1192 }, { "epoch": 1.8966613672496027, "grad_norm": 0.8114540104972947, "learning_rate": 3.0587257285754886e-05, "loss": 0.1542, "step": 1193 }, { "epoch": 1.8982511923688394, "grad_norm": 0.8255352330421866, "learning_rate": 3.058922320550273e-05, "loss": 0.175, "step": 1194 }, { "epoch": 1.8998410174880762, "grad_norm": 1.255939338168114, "learning_rate": 3.059119231045774e-05, "loss": 0.2363, "step": 1195 }, { "epoch": 1.9014308426073132, "grad_norm": 1.1534406008715932, "learning_rate": 3.0593164599953476e-05, "loss": 0.2003, "step": 1196 }, { "epoch": 1.9030206677265502, "grad_norm": 0.841267698651767, "learning_rate": 3.0595140073322374e-05, "loss": 0.1791, "step": 1197 }, { "epoch": 1.904610492845787, "grad_norm": 23.111491486757316, "learning_rate": 3.0597118729895814e-05, "loss": 28.0035, "step": 1198 }, { "epoch": 1.9062003179650238, "grad_norm": 22.984836443191032, "learning_rate": 3.059910056900408e-05, "loss": 27.4725, "step": 1199 }, { "epoch": 1.9077901430842608, "grad_norm": 2.383496193752526, "learning_rate": 3.06010855899764e-05, "loss": 0.2763, "step": 1200 }, { "epoch": 1.9093799682034978, "grad_norm": 1.0386300945084397, "learning_rate": 3.0603073792140914e-05, "loss": 0.2217, "step": 1201 }, { "epoch": 1.9109697933227345, "grad_norm": 2.1819630915536083, "learning_rate": 3.0605065174824694e-05, "loss": 0.2774, "step": 1202 }, { "epoch": 1.9125596184419713, "grad_norm": 1.116663408805725, "learning_rate": 3.060705973735372e-05, "loss": 0.1846, "step": 1203 }, { "epoch": 1.9141494435612083, "grad_norm": 1.8400553429080386, "learning_rate": 3.0609057479052914e-05, "loss": 0.2305, "step": 1204 }, { "epoch": 1.9157392686804453, "grad_norm": 1.3384370966855554, "learning_rate": 3.06110583992461e-05, "loss": 0.2272, "step": 1205 }, { "epoch": 1.917329093799682, "grad_norm": 0.9916247321354082, "learning_rate": 3.061306249725604e-05, "loss": 0.1669, "step": 1206 }, { "epoch": 1.9189189189189189, "grad_norm": 2.1532884746270304, "learning_rate": 3.0615069772404445e-05, "loss": 0.2051, "step": 1207 }, { "epoch": 1.9205087440381559, "grad_norm": 1.1550652922094646, "learning_rate": 3.061708022401189e-05, "loss": 0.1715, "step": 1208 }, { "epoch": 1.9220985691573926, "grad_norm": 1.7970752445970744, "learning_rate": 3.061909385139793e-05, "loss": 0.2117, "step": 1209 }, { "epoch": 1.9236883942766294, "grad_norm": 2.7436833634249336, "learning_rate": 3.062111065388102e-05, "loss": 0.2071, "step": 1210 }, { "epoch": 1.9252782193958664, "grad_norm": 0.9887203777722529, "learning_rate": 3.062313063077855e-05, "loss": 0.1485, "step": 1211 }, { "epoch": 1.9268680445151034, "grad_norm": 0.8717724199440463, "learning_rate": 3.0625153781406824e-05, "loss": 0.1809, "step": 1212 }, { "epoch": 1.9284578696343402, "grad_norm": 1.5202963704600487, "learning_rate": 3.062718010508108e-05, "loss": 0.2287, "step": 1213 }, { "epoch": 1.930047694753577, "grad_norm": 1.0846154052420984, "learning_rate": 3.06292096011155e-05, "loss": 0.167, "step": 1214 }, { "epoch": 1.931637519872814, "grad_norm": 1.3130975733395767, "learning_rate": 3.0631242268823125e-05, "loss": 0.2015, "step": 1215 }, { "epoch": 1.933227344992051, "grad_norm": 1.422738872683954, "learning_rate": 3.063327810751602e-05, "loss": 0.2304, "step": 1216 }, { "epoch": 1.9348171701112877, "grad_norm": 0.7513321902431526, "learning_rate": 3.0635317116505114e-05, "loss": 0.1633, "step": 1217 }, { "epoch": 1.9364069952305245, "grad_norm": 1.0408888186528638, "learning_rate": 3.063735929510026e-05, "loss": 0.1904, "step": 1218 }, { "epoch": 1.9379968203497615, "grad_norm": 1.2013757121906299, "learning_rate": 3.063940464261026e-05, "loss": 0.2122, "step": 1219 }, { "epoch": 1.9395866454689985, "grad_norm": 2.0673398921607804, "learning_rate": 3.0641453158342855e-05, "loss": 0.1858, "step": 1220 }, { "epoch": 1.9411764705882353, "grad_norm": 1.2237917388343624, "learning_rate": 3.064350484160468e-05, "loss": 0.2016, "step": 1221 }, { "epoch": 1.942766295707472, "grad_norm": 1.3159782878890969, "learning_rate": 3.064555969170132e-05, "loss": 0.1893, "step": 1222 }, { "epoch": 1.944356120826709, "grad_norm": 0.8139917527390591, "learning_rate": 3.06476177079373e-05, "loss": 0.1429, "step": 1223 }, { "epoch": 1.945945945945946, "grad_norm": 1.3520108765734955, "learning_rate": 3.064967888961601e-05, "loss": 0.1843, "step": 1224 }, { "epoch": 1.9475357710651828, "grad_norm": 1.0314267662277503, "learning_rate": 3.065174323603986e-05, "loss": 0.1893, "step": 1225 }, { "epoch": 1.9491255961844196, "grad_norm": 0.9901778127937841, "learning_rate": 3.0653810746510115e-05, "loss": 0.1796, "step": 1226 }, { "epoch": 1.9507154213036566, "grad_norm": 1.2652652435030727, "learning_rate": 3.065588142032702e-05, "loss": 0.1685, "step": 1227 }, { "epoch": 1.9523052464228936, "grad_norm": 1.230162636423344, "learning_rate": 3.0657955256789714e-05, "loss": 0.2624, "step": 1228 }, { "epoch": 1.9538950715421304, "grad_norm": 25.80489300702429, "learning_rate": 3.066003225519627e-05, "loss": 29.0589, "step": 1229 }, { "epoch": 1.9554848966613672, "grad_norm": 0.861437387897021, "learning_rate": 3.066211241484371e-05, "loss": 0.2161, "step": 1230 }, { "epoch": 1.9570747217806042, "grad_norm": 0.9196769074957234, "learning_rate": 3.066419573502798e-05, "loss": 0.1982, "step": 1231 }, { "epoch": 1.9586645468998412, "grad_norm": 27.02897188598479, "learning_rate": 3.066628221504396e-05, "loss": 28.2177, "step": 1232 }, { "epoch": 1.960254372019078, "grad_norm": 1.0694183116944747, "learning_rate": 3.066837185418541e-05, "loss": 0.1928, "step": 1233 }, { "epoch": 1.9618441971383147, "grad_norm": 25.387448672311354, "learning_rate": 3.0670464651745116e-05, "loss": 29.1451, "step": 1234 }, { "epoch": 1.9634340222575517, "grad_norm": 2.7887665419062166, "learning_rate": 3.0672560607014695e-05, "loss": 0.2766, "step": 1235 }, { "epoch": 1.9650238473767887, "grad_norm": 1.2931539957994642, "learning_rate": 3.067465971928478e-05, "loss": 0.1806, "step": 1236 }, { "epoch": 1.9666136724960255, "grad_norm": 1.661704258988567, "learning_rate": 3.067676198784488e-05, "loss": 0.251, "step": 1237 }, { "epoch": 1.9682034976152623, "grad_norm": 1.4693391465834271, "learning_rate": 3.067886741198345e-05, "loss": 0.1431, "step": 1238 }, { "epoch": 1.9697933227344993, "grad_norm": 1.7196498024808196, "learning_rate": 3.068097599098789e-05, "loss": 0.2262, "step": 1239 }, { "epoch": 1.9713831478537363, "grad_norm": 0.9381067480056908, "learning_rate": 3.068308772414451e-05, "loss": 0.1605, "step": 1240 }, { "epoch": 1.972972972972973, "grad_norm": 1.0975186508458736, "learning_rate": 3.068520261073857e-05, "loss": 0.204, "step": 1241 }, { "epoch": 1.9745627980922098, "grad_norm": 1.242065103804873, "learning_rate": 3.0687320650054265e-05, "loss": 0.2281, "step": 1242 }, { "epoch": 1.9761526232114468, "grad_norm": 1.2945520946449744, "learning_rate": 3.068944184137471e-05, "loss": 0.154, "step": 1243 }, { "epoch": 1.9777424483306836, "grad_norm": 1.1931142929894483, "learning_rate": 3.069156618398196e-05, "loss": 0.1531, "step": 1244 }, { "epoch": 1.9793322734499204, "grad_norm": 1.2589152711075082, "learning_rate": 3.069369367715701e-05, "loss": 0.2512, "step": 1245 }, { "epoch": 1.9809220985691574, "grad_norm": 1.2612485440947874, "learning_rate": 3.069582432017975e-05, "loss": 0.1843, "step": 1246 }, { "epoch": 1.9825119236883944, "grad_norm": 1.2883629283670672, "learning_rate": 3.069795811232907e-05, "loss": 0.1846, "step": 1247 }, { "epoch": 1.9841017488076311, "grad_norm": 1.347256843993211, "learning_rate": 3.070009505288274e-05, "loss": 0.2223, "step": 1248 }, { "epoch": 1.985691573926868, "grad_norm": 1.7677287754457878, "learning_rate": 3.0702235141117486e-05, "loss": 0.2138, "step": 1249 }, { "epoch": 1.987281399046105, "grad_norm": 1.6566777398256423, "learning_rate": 3.0704378376308966e-05, "loss": 0.2761, "step": 1250 }, { "epoch": 1.988871224165342, "grad_norm": 1.3827746323573025, "learning_rate": 3.0706524757731775e-05, "loss": 0.1946, "step": 1251 }, { "epoch": 1.9904610492845787, "grad_norm": 1.5813255890648994, "learning_rate": 3.0708674284659444e-05, "loss": 0.1801, "step": 1252 }, { "epoch": 1.9920508744038155, "grad_norm": 1.6070535806157233, "learning_rate": 3.071082695636442e-05, "loss": 0.2014, "step": 1253 }, { "epoch": 1.9936406995230525, "grad_norm": 2.02366144478435, "learning_rate": 3.0712982772118114e-05, "loss": 0.4164, "step": 1254 }, { "epoch": 1.9952305246422894, "grad_norm": 1.5040536599606593, "learning_rate": 3.0715141731190864e-05, "loss": 0.189, "step": 1255 }, { "epoch": 1.9968203497615262, "grad_norm": 1.1947304883545589, "learning_rate": 3.071730383285194e-05, "loss": 0.2146, "step": 1256 }, { "epoch": 1.998410174880763, "grad_norm": 1.7214270799293931, "learning_rate": 3.0719469076369525e-05, "loss": 0.1878, "step": 1257 }, { "epoch": 2.0, "grad_norm": 1.0157704917836166, "learning_rate": 3.0721637461010796e-05, "loss": 0.187, "step": 1258 }, { "epoch": 2.001589825119237, "grad_norm": 1.6285401869986473, "learning_rate": 3.0723808986041815e-05, "loss": 0.2418, "step": 1259 }, { "epoch": 2.0031796502384736, "grad_norm": 1.4518016869277235, "learning_rate": 3.072598365072761e-05, "loss": 0.2176, "step": 1260 }, { "epoch": 2.0047694753577106, "grad_norm": 2.3885563057829784, "learning_rate": 3.072816145433213e-05, "loss": 0.2059, "step": 1261 }, { "epoch": 2.0063593004769475, "grad_norm": 1.0261735852628373, "learning_rate": 3.073034239611826e-05, "loss": 0.1838, "step": 1262 }, { "epoch": 2.0079491255961845, "grad_norm": 1.5374133458675676, "learning_rate": 3.073252647534784e-05, "loss": 0.2478, "step": 1263 }, { "epoch": 2.009538950715421, "grad_norm": 0.8128358265322938, "learning_rate": 3.073471369128163e-05, "loss": 0.1714, "step": 1264 }, { "epoch": 2.011128775834658, "grad_norm": 0.8482097073946372, "learning_rate": 3.0736904043179346e-05, "loss": 0.1727, "step": 1265 }, { "epoch": 2.012718600953895, "grad_norm": 0.8648768881046509, "learning_rate": 3.0739097530299624e-05, "loss": 0.1765, "step": 1266 }, { "epoch": 2.014308426073132, "grad_norm": 1.107564625767209, "learning_rate": 3.074129415190006e-05, "loss": 0.1614, "step": 1267 }, { "epoch": 2.0158982511923687, "grad_norm": 1.1544088229763227, "learning_rate": 3.074349390723716e-05, "loss": 0.1799, "step": 1268 }, { "epoch": 2.0174880763116056, "grad_norm": 1.485674176504748, "learning_rate": 3.07456967955664e-05, "loss": 0.21, "step": 1269 }, { "epoch": 2.0190779014308426, "grad_norm": 0.8111920965928121, "learning_rate": 3.074790281614218e-05, "loss": 0.1911, "step": 1270 }, { "epoch": 2.0206677265500796, "grad_norm": 1.6692129560972535, "learning_rate": 3.075011196821784e-05, "loss": 0.2232, "step": 1271 }, { "epoch": 2.022257551669316, "grad_norm": 0.9910243616953991, "learning_rate": 3.0752324251045664e-05, "loss": 0.2019, "step": 1272 }, { "epoch": 2.023847376788553, "grad_norm": 1.851604710969435, "learning_rate": 3.075453966387686e-05, "loss": 0.1962, "step": 1273 }, { "epoch": 2.02543720190779, "grad_norm": 1.6315246047077385, "learning_rate": 3.0756758205961626e-05, "loss": 0.2336, "step": 1274 }, { "epoch": 2.027027027027027, "grad_norm": 1.1332389023345852, "learning_rate": 3.0758979876549034e-05, "loss": 0.1975, "step": 1275 }, { "epoch": 2.0286168521462637, "grad_norm": 1.3854639647356968, "learning_rate": 3.076120467488714e-05, "loss": 0.1859, "step": 1276 }, { "epoch": 2.0302066772655007, "grad_norm": 27.627483185333432, "learning_rate": 3.0763432600222913e-05, "loss": 30.238, "step": 1277 }, { "epoch": 2.0317965023847377, "grad_norm": 1.3216702131635056, "learning_rate": 3.076566365180232e-05, "loss": 0.2133, "step": 1278 }, { "epoch": 2.0333863275039747, "grad_norm": 2.0037253060939353, "learning_rate": 3.076789782887019e-05, "loss": 0.1948, "step": 1279 }, { "epoch": 2.0349761526232113, "grad_norm": 0.8846634052804119, "learning_rate": 3.077013513067036e-05, "loss": 0.2177, "step": 1280 }, { "epoch": 2.0365659777424483, "grad_norm": 1.3785150586824486, "learning_rate": 3.077237555644558e-05, "loss": 0.1767, "step": 1281 }, { "epoch": 2.0381558028616853, "grad_norm": 1.14319076750038, "learning_rate": 3.077461910543754e-05, "loss": 0.1793, "step": 1282 }, { "epoch": 2.0397456279809223, "grad_norm": 1.3058599818187528, "learning_rate": 3.077686577688689e-05, "loss": 0.2194, "step": 1283 }, { "epoch": 2.041335453100159, "grad_norm": 1.1608873509256605, "learning_rate": 3.077911557003319e-05, "loss": 0.1365, "step": 1284 }, { "epoch": 2.042925278219396, "grad_norm": 2.2215253383767357, "learning_rate": 3.0781368484114995e-05, "loss": 0.318, "step": 1285 }, { "epoch": 2.044515103338633, "grad_norm": 1.2337850885799377, "learning_rate": 3.0783624518369764e-05, "loss": 0.2016, "step": 1286 }, { "epoch": 2.04610492845787, "grad_norm": 1.4386787568710655, "learning_rate": 3.078588367203391e-05, "loss": 0.1446, "step": 1287 }, { "epoch": 2.0476947535771064, "grad_norm": 1.3686187612205987, "learning_rate": 3.078814594434279e-05, "loss": 0.2448, "step": 1288 }, { "epoch": 2.0492845786963434, "grad_norm": 1.5259367157698687, "learning_rate": 3.079041133453071e-05, "loss": 0.2683, "step": 1289 }, { "epoch": 2.0508744038155804, "grad_norm": 2.3301193196051186, "learning_rate": 3.0792679841830915e-05, "loss": 0.2524, "step": 1290 }, { "epoch": 2.0524642289348174, "grad_norm": 2.2628810487868543, "learning_rate": 3.07949514654756e-05, "loss": 0.288, "step": 1291 }, { "epoch": 2.054054054054054, "grad_norm": 1.3029376832118935, "learning_rate": 3.0797226204695895e-05, "loss": 0.1805, "step": 1292 }, { "epoch": 2.055643879173291, "grad_norm": 0.8003632043606429, "learning_rate": 3.0799504058721894e-05, "loss": 0.143, "step": 1293 }, { "epoch": 2.057233704292528, "grad_norm": 0.830571401756114, "learning_rate": 3.080178502678262e-05, "loss": 0.1529, "step": 1294 }, { "epoch": 2.0588235294117645, "grad_norm": 28.12440574477103, "learning_rate": 3.080406910810606e-05, "loss": 26.6375, "step": 1295 }, { "epoch": 2.0604133545310015, "grad_norm": 1.4661546883797156, "learning_rate": 3.080635630191911e-05, "loss": 0.2362, "step": 1296 }, { "epoch": 2.0620031796502385, "grad_norm": 0.9270319189359216, "learning_rate": 3.080864660744766e-05, "loss": 0.2385, "step": 1297 }, { "epoch": 2.0635930047694755, "grad_norm": 1.1312945465551818, "learning_rate": 3.0810940023916513e-05, "loss": 0.2121, "step": 1298 }, { "epoch": 2.065182829888712, "grad_norm": 1.235096449226978, "learning_rate": 3.0813236550549424e-05, "loss": 0.2027, "step": 1299 }, { "epoch": 2.066772655007949, "grad_norm": 1.2968467014482736, "learning_rate": 3.0815536186569125e-05, "loss": 0.2507, "step": 1300 }, { "epoch": 2.068362480127186, "grad_norm": 0.9252848126570336, "learning_rate": 3.081783893119726e-05, "loss": 0.1492, "step": 1301 }, { "epoch": 2.069952305246423, "grad_norm": 1.6222466340123076, "learning_rate": 3.082014478365443e-05, "loss": 0.1601, "step": 1302 }, { "epoch": 2.0715421303656596, "grad_norm": 1.592797926521896, "learning_rate": 3.0822453743160196e-05, "loss": 0.2032, "step": 1303 }, { "epoch": 2.0731319554848966, "grad_norm": 1.3739962578434508, "learning_rate": 3.082476580893305e-05, "loss": 0.239, "step": 1304 }, { "epoch": 2.0747217806041336, "grad_norm": 0.7306467378447722, "learning_rate": 3.082708098019043e-05, "loss": 0.1721, "step": 1305 }, { "epoch": 2.0763116057233706, "grad_norm": 1.542581159735289, "learning_rate": 3.0829399256148764e-05, "loss": 0.1936, "step": 1306 }, { "epoch": 2.077901430842607, "grad_norm": 0.8831623975443234, "learning_rate": 3.083172063602337e-05, "loss": 0.1978, "step": 1307 }, { "epoch": 2.079491255961844, "grad_norm": 0.9217243949168382, "learning_rate": 3.083404511902857e-05, "loss": 0.162, "step": 1308 }, { "epoch": 2.081081081081081, "grad_norm": 0.8569836982226501, "learning_rate": 3.08363727043776e-05, "loss": 0.1854, "step": 1309 }, { "epoch": 2.082670906200318, "grad_norm": 1.199840413070969, "learning_rate": 3.0838703391282664e-05, "loss": 0.209, "step": 1310 }, { "epoch": 2.0842607313195547, "grad_norm": 1.160933636914246, "learning_rate": 3.0841037178954886e-05, "loss": 0.1708, "step": 1311 }, { "epoch": 2.0858505564387917, "grad_norm": 1.2875084460130533, "learning_rate": 3.0843374066604395e-05, "loss": 0.2431, "step": 1312 }, { "epoch": 2.0874403815580287, "grad_norm": 0.9992658402973111, "learning_rate": 3.084571405344021e-05, "loss": 0.1902, "step": 1313 }, { "epoch": 2.0890302066772657, "grad_norm": 1.3380287852543482, "learning_rate": 3.084805713867034e-05, "loss": 0.2166, "step": 1314 }, { "epoch": 2.0906200317965022, "grad_norm": 1.0043156815831882, "learning_rate": 3.085040332150176e-05, "loss": 0.1766, "step": 1315 }, { "epoch": 2.0922098569157392, "grad_norm": 1.5805326822218397, "learning_rate": 3.0852752601140325e-05, "loss": 0.2404, "step": 1316 }, { "epoch": 2.0937996820349762, "grad_norm": 1.8861521541509516, "learning_rate": 3.0855104976790934e-05, "loss": 0.2548, "step": 1317 }, { "epoch": 2.0953895071542132, "grad_norm": 1.1831321120577094, "learning_rate": 3.085746044765737e-05, "loss": 0.2035, "step": 1318 }, { "epoch": 2.09697933227345, "grad_norm": 1.3843009415573677, "learning_rate": 3.0859819012942376e-05, "loss": 0.2024, "step": 1319 }, { "epoch": 2.098569157392687, "grad_norm": 1.2148524106972587, "learning_rate": 3.0862180671847705e-05, "loss": 0.223, "step": 1320 }, { "epoch": 2.100158982511924, "grad_norm": 0.9085094684700772, "learning_rate": 3.0864545423573996e-05, "loss": 0.1804, "step": 1321 }, { "epoch": 2.101748807631161, "grad_norm": 0.7691273452199137, "learning_rate": 3.086691326732086e-05, "loss": 0.2157, "step": 1322 }, { "epoch": 2.1033386327503973, "grad_norm": 1.3821088501911378, "learning_rate": 3.086928420228688e-05, "loss": 0.1955, "step": 1323 }, { "epoch": 2.1049284578696343, "grad_norm": 3.761306881942381, "learning_rate": 3.087165822766958e-05, "loss": 0.2686, "step": 1324 }, { "epoch": 2.1065182829888713, "grad_norm": 0.8839596204913662, "learning_rate": 3.0874035342665416e-05, "loss": 0.1476, "step": 1325 }, { "epoch": 2.108108108108108, "grad_norm": 0.9452616510008075, "learning_rate": 3.087641554646986e-05, "loss": 0.1355, "step": 1326 }, { "epoch": 2.109697933227345, "grad_norm": 0.995148220704798, "learning_rate": 3.087879883827727e-05, "loss": 0.1535, "step": 1327 }, { "epoch": 2.111287758346582, "grad_norm": 1.1429686926063791, "learning_rate": 3.0881185217281e-05, "loss": 0.2246, "step": 1328 }, { "epoch": 2.112877583465819, "grad_norm": 2.1319652522118706, "learning_rate": 3.0883574682673345e-05, "loss": 0.2647, "step": 1329 }, { "epoch": 2.1144674085850554, "grad_norm": 1.1920604635860412, "learning_rate": 3.088596723364555e-05, "loss": 0.2352, "step": 1330 }, { "epoch": 2.1160572337042924, "grad_norm": 1.1387436937305278, "learning_rate": 3.088836286938783e-05, "loss": 0.155, "step": 1331 }, { "epoch": 2.1176470588235294, "grad_norm": 1.652141133560016, "learning_rate": 3.089076158908935e-05, "loss": 0.1905, "step": 1332 }, { "epoch": 2.1192368839427664, "grad_norm": 1.5354167292005385, "learning_rate": 3.0893163391938235e-05, "loss": 0.1922, "step": 1333 }, { "epoch": 2.120826709062003, "grad_norm": 4.1241360067695965, "learning_rate": 3.089556827712155e-05, "loss": 0.5256, "step": 1334 }, { "epoch": 2.12241653418124, "grad_norm": 0.645114217644848, "learning_rate": 3.089797624382533e-05, "loss": 0.1425, "step": 1335 }, { "epoch": 2.124006359300477, "grad_norm": 1.3938989618247661, "learning_rate": 3.090038729123457e-05, "loss": 0.2083, "step": 1336 }, { "epoch": 2.125596184419714, "grad_norm": 1.7730708341280415, "learning_rate": 3.090280141853322e-05, "loss": 0.196, "step": 1337 }, { "epoch": 2.1271860095389505, "grad_norm": 0.7394137132287154, "learning_rate": 3.090521862490418e-05, "loss": 0.1577, "step": 1338 }, { "epoch": 2.1287758346581875, "grad_norm": 1.0336190678270467, "learning_rate": 3.090763890952931e-05, "loss": 0.1801, "step": 1339 }, { "epoch": 2.1303656597774245, "grad_norm": 0.9576929373168033, "learning_rate": 3.091006227158945e-05, "loss": 0.1855, "step": 1340 }, { "epoch": 2.1319554848966615, "grad_norm": 1.775426378239908, "learning_rate": 3.091248871026436e-05, "loss": 0.2412, "step": 1341 }, { "epoch": 2.133545310015898, "grad_norm": 0.7418853133482567, "learning_rate": 3.091491822473278e-05, "loss": 0.1824, "step": 1342 }, { "epoch": 2.135135135135135, "grad_norm": 1.0328649429102252, "learning_rate": 3.091735081417242e-05, "loss": 0.2358, "step": 1343 }, { "epoch": 2.136724960254372, "grad_norm": 0.7623939676804489, "learning_rate": 3.091978647775993e-05, "loss": 0.1741, "step": 1344 }, { "epoch": 2.138314785373609, "grad_norm": 0.8738992243465746, "learning_rate": 3.092222521467092e-05, "loss": 0.177, "step": 1345 }, { "epoch": 2.1399046104928456, "grad_norm": 0.804723471310374, "learning_rate": 3.092466702407996e-05, "loss": 0.1416, "step": 1346 }, { "epoch": 2.1414944356120826, "grad_norm": 1.5047521938901964, "learning_rate": 3.092711190516062e-05, "loss": 0.1858, "step": 1347 }, { "epoch": 2.1430842607313196, "grad_norm": 1.2461860763926413, "learning_rate": 3.0929559857085355e-05, "loss": 0.1795, "step": 1348 }, { "epoch": 2.1446740858505566, "grad_norm": 21.777714304884103, "learning_rate": 3.093201087902565e-05, "loss": 19.7598, "step": 1349 }, { "epoch": 2.146263910969793, "grad_norm": 0.9756223762482187, "learning_rate": 3.093446497015189e-05, "loss": 0.1724, "step": 1350 }, { "epoch": 2.14785373608903, "grad_norm": 0.8967810526427992, "learning_rate": 3.09369221296335e-05, "loss": 0.2258, "step": 1351 }, { "epoch": 2.149443561208267, "grad_norm": 1.022614819271286, "learning_rate": 3.0939382356638785e-05, "loss": 0.2299, "step": 1352 }, { "epoch": 2.151033386327504, "grad_norm": 1.1832029028290982, "learning_rate": 3.094184565033508e-05, "loss": 0.2154, "step": 1353 }, { "epoch": 2.1526232114467407, "grad_norm": 1.0559999502153423, "learning_rate": 3.094431200988861e-05, "loss": 0.2091, "step": 1354 }, { "epoch": 2.1542130365659777, "grad_norm": 1.654526643164502, "learning_rate": 3.094678143446462e-05, "loss": 0.2314, "step": 1355 }, { "epoch": 2.1558028616852147, "grad_norm": 1.020730279015888, "learning_rate": 3.09492539232273e-05, "loss": 0.1849, "step": 1356 }, { "epoch": 2.1573926868044513, "grad_norm": 0.8790264686352803, "learning_rate": 3.095172947533981e-05, "loss": 0.1258, "step": 1357 }, { "epoch": 2.1589825119236883, "grad_norm": 0.7786196705654902, "learning_rate": 3.095420808996425e-05, "loss": 0.1793, "step": 1358 }, { "epoch": 2.1605723370429253, "grad_norm": 1.4067081602417018, "learning_rate": 3.09566897662617e-05, "loss": 0.2755, "step": 1359 }, { "epoch": 2.1621621621621623, "grad_norm": 0.9830172756733001, "learning_rate": 3.095917450339221e-05, "loss": 0.0915, "step": 1360 }, { "epoch": 2.1637519872813993, "grad_norm": 1.7798638063114107, "learning_rate": 3.09616623005148e-05, "loss": 0.2358, "step": 1361 }, { "epoch": 2.165341812400636, "grad_norm": 1.089155433635473, "learning_rate": 3.0964153156787414e-05, "loss": 0.2222, "step": 1362 }, { "epoch": 2.166931637519873, "grad_norm": 1.0603013444145613, "learning_rate": 3.0966647071366996e-05, "loss": 0.1896, "step": 1363 }, { "epoch": 2.16852146263911, "grad_norm": 1.1657683753221677, "learning_rate": 3.0969144043409444e-05, "loss": 0.2287, "step": 1364 }, { "epoch": 2.1701112877583464, "grad_norm": 1.1150513983382129, "learning_rate": 3.0971644072069636e-05, "loss": 0.1709, "step": 1365 }, { "epoch": 2.1717011128775834, "grad_norm": 1.413617806629585, "learning_rate": 3.09741471565014e-05, "loss": 0.1981, "step": 1366 }, { "epoch": 2.1732909379968204, "grad_norm": 0.7387448412322494, "learning_rate": 3.097665329585752e-05, "loss": 0.1522, "step": 1367 }, { "epoch": 2.1748807631160574, "grad_norm": 1.6831016678797022, "learning_rate": 3.097916248928976e-05, "loss": 0.1816, "step": 1368 }, { "epoch": 2.176470588235294, "grad_norm": 0.7478624113142754, "learning_rate": 3.098167473594886e-05, "loss": 0.1838, "step": 1369 }, { "epoch": 2.178060413354531, "grad_norm": 1.5408256716519728, "learning_rate": 3.0984190034984514e-05, "loss": 0.1894, "step": 1370 }, { "epoch": 2.179650238473768, "grad_norm": 0.9093900350749017, "learning_rate": 3.0986708385545384e-05, "loss": 0.2198, "step": 1371 }, { "epoch": 2.181240063593005, "grad_norm": 1.018773190322291, "learning_rate": 3.0989229786779086e-05, "loss": 0.1618, "step": 1372 }, { "epoch": 2.1828298887122415, "grad_norm": 1.0429560744402189, "learning_rate": 3.099175423783223e-05, "loss": 0.2116, "step": 1373 }, { "epoch": 2.1844197138314785, "grad_norm": 1.6267291985483283, "learning_rate": 3.0994281737850384e-05, "loss": 0.2586, "step": 1374 }, { "epoch": 2.1860095389507155, "grad_norm": 0.9472913844800965, "learning_rate": 3.099681228597806e-05, "loss": 0.1786, "step": 1375 }, { "epoch": 2.1875993640699525, "grad_norm": 0.9482453520338047, "learning_rate": 3.0999345881358784e-05, "loss": 0.183, "step": 1376 }, { "epoch": 2.189189189189189, "grad_norm": 1.4272694389725795, "learning_rate": 3.100188252313501e-05, "loss": 0.3825, "step": 1377 }, { "epoch": 2.190779014308426, "grad_norm": 1.3120816831611022, "learning_rate": 3.10044222104482e-05, "loss": 0.1812, "step": 1378 }, { "epoch": 2.192368839427663, "grad_norm": 0.8936454200936483, "learning_rate": 3.1006964942438725e-05, "loss": 0.1752, "step": 1379 }, { "epoch": 2.1939586645469, "grad_norm": 1.0308275926343504, "learning_rate": 3.100951071824599e-05, "loss": 0.1979, "step": 1380 }, { "epoch": 2.1955484896661366, "grad_norm": 2.121736356022081, "learning_rate": 3.101205953700833e-05, "loss": 0.1966, "step": 1381 }, { "epoch": 2.1971383147853736, "grad_norm": 1.361500398221776, "learning_rate": 3.101461139786307e-05, "loss": 0.1865, "step": 1382 }, { "epoch": 2.1987281399046106, "grad_norm": 1.0551844275054563, "learning_rate": 3.101716629994648e-05, "loss": 0.1408, "step": 1383 }, { "epoch": 2.2003179650238476, "grad_norm": 1.286749262176366, "learning_rate": 3.101972424239384e-05, "loss": 0.2008, "step": 1384 }, { "epoch": 2.201907790143084, "grad_norm": 1.1440829715368968, "learning_rate": 3.102228522433937e-05, "loss": 0.2574, "step": 1385 }, { "epoch": 2.203497615262321, "grad_norm": 1.6675692498369448, "learning_rate": 3.102484924491628e-05, "loss": 0.1788, "step": 1386 }, { "epoch": 2.205087440381558, "grad_norm": 0.9370870178549853, "learning_rate": 3.102741630325672e-05, "loss": 0.1415, "step": 1387 }, { "epoch": 2.2066772655007947, "grad_norm": 0.9637874312804572, "learning_rate": 3.102998639849185e-05, "loss": 0.1722, "step": 1388 }, { "epoch": 2.2082670906200317, "grad_norm": 1.2556838699580377, "learning_rate": 3.103255952975178e-05, "loss": 0.1686, "step": 1389 }, { "epoch": 2.2098569157392687, "grad_norm": 26.087843924102206, "learning_rate": 3.10351356961656e-05, "loss": 24.7238, "step": 1390 }, { "epoch": 2.2114467408585057, "grad_norm": 1.2306052553326015, "learning_rate": 3.103771489686136e-05, "loss": 0.2127, "step": 1391 }, { "epoch": 2.2130365659777427, "grad_norm": 1.3925769784119306, "learning_rate": 3.104029713096612e-05, "loss": 0.1536, "step": 1392 }, { "epoch": 2.2146263910969792, "grad_norm": 1.0886456722807778, "learning_rate": 3.104288239760587e-05, "loss": 0.1605, "step": 1393 }, { "epoch": 2.2162162162162162, "grad_norm": 1.9733735581123586, "learning_rate": 3.10454706959056e-05, "loss": 0.3207, "step": 1394 }, { "epoch": 2.2178060413354532, "grad_norm": 1.6391136638969375, "learning_rate": 3.104806202498924e-05, "loss": 0.2376, "step": 1395 }, { "epoch": 2.21939586645469, "grad_norm": 1.338525473265953, "learning_rate": 3.105065638397975e-05, "loss": 0.1535, "step": 1396 }, { "epoch": 2.220985691573927, "grad_norm": 1.2636598834016204, "learning_rate": 3.1053253771999016e-05, "loss": 0.2016, "step": 1397 }, { "epoch": 2.2225755166931638, "grad_norm": 2.2258591390768534, "learning_rate": 3.105585418816792e-05, "loss": 0.2009, "step": 1398 }, { "epoch": 2.2241653418124008, "grad_norm": 2.1742096409072453, "learning_rate": 3.105845763160632e-05, "loss": 0.2063, "step": 1399 }, { "epoch": 2.2257551669316373, "grad_norm": 1.4436772759142573, "learning_rate": 3.106106410143304e-05, "loss": 0.2107, "step": 1400 }, { "epoch": 2.2273449920508743, "grad_norm": 1.8040607557530957, "learning_rate": 3.106367359676588e-05, "loss": 0.8534, "step": 1401 }, { "epoch": 2.2289348171701113, "grad_norm": 9.457669343721667, "learning_rate": 3.106628611672163e-05, "loss": 7.0631, "step": 1402 }, { "epoch": 2.2305246422893483, "grad_norm": 2.24096394779888, "learning_rate": 3.106890166041604e-05, "loss": 0.2575, "step": 1403 }, { "epoch": 2.232114467408585, "grad_norm": 29.494664006121866, "learning_rate": 3.107152022696384e-05, "loss": 25.3667, "step": 1404 }, { "epoch": 2.233704292527822, "grad_norm": 2.3338503820567937, "learning_rate": 3.107414181547875e-05, "loss": 0.2012, "step": 1405 }, { "epoch": 2.235294117647059, "grad_norm": 1.2334326685402701, "learning_rate": 3.107676642507345e-05, "loss": 0.1994, "step": 1406 }, { "epoch": 2.236883942766296, "grad_norm": 1.3481853582852126, "learning_rate": 3.10793940548596e-05, "loss": 0.2168, "step": 1407 }, { "epoch": 2.2384737678855324, "grad_norm": 1.449509800560888, "learning_rate": 3.108202470394786e-05, "loss": 0.1952, "step": 1408 }, { "epoch": 2.2400635930047694, "grad_norm": 1.8300766146417315, "learning_rate": 3.108465837144784e-05, "loss": 0.1902, "step": 1409 }, { "epoch": 2.2416534181240064, "grad_norm": 1.769688151517097, "learning_rate": 3.108729505646813e-05, "loss": 0.176, "step": 1410 }, { "epoch": 2.2432432432432434, "grad_norm": 1.4020338307777473, "learning_rate": 3.1089934758116316e-05, "loss": 0.1532, "step": 1411 }, { "epoch": 2.24483306836248, "grad_norm": 1.4975358502023568, "learning_rate": 3.1092577475498965e-05, "loss": 0.2138, "step": 1412 }, { "epoch": 2.246422893481717, "grad_norm": 1.4443855686176623, "learning_rate": 3.10952232077216e-05, "loss": 0.1922, "step": 1413 }, { "epoch": 2.248012718600954, "grad_norm": 2.054295804059935, "learning_rate": 3.109787195388874e-05, "loss": 0.146, "step": 1414 }, { "epoch": 2.249602543720191, "grad_norm": 1.4258794369642493, "learning_rate": 3.110052371310387e-05, "loss": 0.2534, "step": 1415 }, { "epoch": 2.2511923688394275, "grad_norm": 1.2449009899718306, "learning_rate": 3.110317848446948e-05, "loss": 0.1951, "step": 1416 }, { "epoch": 2.2527821939586645, "grad_norm": 4.49563701828563, "learning_rate": 3.110583626708703e-05, "loss": 0.2438, "step": 1417 }, { "epoch": 2.2543720190779015, "grad_norm": 1.121447415027865, "learning_rate": 3.110849706005694e-05, "loss": 0.197, "step": 1418 }, { "epoch": 2.255961844197138, "grad_norm": 2.3025981361064303, "learning_rate": 3.111116086247864e-05, "loss": 0.212, "step": 1419 }, { "epoch": 2.257551669316375, "grad_norm": 1.6567050848724652, "learning_rate": 3.111382767345051e-05, "loss": 0.2821, "step": 1420 }, { "epoch": 2.259141494435612, "grad_norm": 1.65254330309698, "learning_rate": 3.1116497492069965e-05, "loss": 0.1978, "step": 1421 }, { "epoch": 2.260731319554849, "grad_norm": 0.9328317215158144, "learning_rate": 3.111917031743333e-05, "loss": 0.1586, "step": 1422 }, { "epoch": 2.262321144674086, "grad_norm": 3.9300092505161173, "learning_rate": 3.112184614863599e-05, "loss": 0.2695, "step": 1423 }, { "epoch": 2.2639109697933226, "grad_norm": 1.1284450729805386, "learning_rate": 3.1124524984772236e-05, "loss": 0.2547, "step": 1424 }, { "epoch": 2.2655007949125596, "grad_norm": 1.1805631868626294, "learning_rate": 3.112720682493541e-05, "loss": 0.1871, "step": 1425 }, { "epoch": 2.2670906200317966, "grad_norm": 0.8668640445889406, "learning_rate": 3.1129891668217784e-05, "loss": 0.1531, "step": 1426 }, { "epoch": 2.268680445151033, "grad_norm": 1.192741584684383, "learning_rate": 3.113257951371064e-05, "loss": 0.1734, "step": 1427 }, { "epoch": 2.27027027027027, "grad_norm": 1.138895563046949, "learning_rate": 3.1135270360504254e-05, "loss": 0.1943, "step": 1428 }, { "epoch": 2.271860095389507, "grad_norm": 1.1522008460207795, "learning_rate": 3.113796420768786e-05, "loss": 0.2182, "step": 1429 }, { "epoch": 2.273449920508744, "grad_norm": 0.8063615699751957, "learning_rate": 3.1140661054349684e-05, "loss": 0.1991, "step": 1430 }, { "epoch": 2.275039745627981, "grad_norm": 1.6500668772821634, "learning_rate": 3.114336089957694e-05, "loss": 0.1959, "step": 1431 }, { "epoch": 2.2766295707472177, "grad_norm": 0.911827838077677, "learning_rate": 3.114606374245584e-05, "loss": 0.1994, "step": 1432 }, { "epoch": 2.2782193958664547, "grad_norm": 1.1650142407170838, "learning_rate": 3.114876958207157e-05, "loss": 0.172, "step": 1433 }, { "epoch": 2.2798092209856917, "grad_norm": 1.1770580807873885, "learning_rate": 3.115147841750829e-05, "loss": 0.2057, "step": 1434 }, { "epoch": 2.2813990461049283, "grad_norm": 1.029010642784493, "learning_rate": 3.115419024784916e-05, "loss": 0.1747, "step": 1435 }, { "epoch": 2.2829888712241653, "grad_norm": 1.3165787016867403, "learning_rate": 3.1156905072176335e-05, "loss": 0.1619, "step": 1436 }, { "epoch": 2.2845786963434023, "grad_norm": 3.9147528399400153, "learning_rate": 3.115962288957092e-05, "loss": 0.2271, "step": 1437 }, { "epoch": 2.2861685214626393, "grad_norm": 0.9777591782951672, "learning_rate": 3.116234369911307e-05, "loss": 0.1541, "step": 1438 }, { "epoch": 2.287758346581876, "grad_norm": 1.495771594125863, "learning_rate": 3.1165067499881854e-05, "loss": 0.2517, "step": 1439 }, { "epoch": 2.289348171701113, "grad_norm": 13.44438587844138, "learning_rate": 3.116779429095538e-05, "loss": 13.3598, "step": 1440 }, { "epoch": 2.29093799682035, "grad_norm": 0.7960710683865352, "learning_rate": 3.117052407141073e-05, "loss": 0.1753, "step": 1441 }, { "epoch": 2.292527821939587, "grad_norm": 1.3534450241088505, "learning_rate": 3.117325684032397e-05, "loss": 0.1832, "step": 1442 }, { "epoch": 2.2941176470588234, "grad_norm": 1.613919394331104, "learning_rate": 3.117599259677015e-05, "loss": 0.2604, "step": 1443 }, { "epoch": 2.2957074721780604, "grad_norm": 1.52159447878477, "learning_rate": 3.117873133982332e-05, "loss": 0.1978, "step": 1444 }, { "epoch": 2.2972972972972974, "grad_norm": 1.0298035677691213, "learning_rate": 3.118147306855653e-05, "loss": 0.1993, "step": 1445 }, { "epoch": 2.2988871224165344, "grad_norm": 1.6253228994815916, "learning_rate": 3.118421778204176e-05, "loss": 0.1908, "step": 1446 }, { "epoch": 2.300476947535771, "grad_norm": 1.5852938545374853, "learning_rate": 3.118696547935008e-05, "loss": 0.1759, "step": 1447 }, { "epoch": 2.302066772655008, "grad_norm": 1.476417529322313, "learning_rate": 3.118971615955146e-05, "loss": 0.1843, "step": 1448 }, { "epoch": 2.303656597774245, "grad_norm": 1.3566778600157008, "learning_rate": 3.1192469821714894e-05, "loss": 0.1337, "step": 1449 }, { "epoch": 2.3052464228934815, "grad_norm": 1.241183457423938, "learning_rate": 3.119522646490838e-05, "loss": 0.258, "step": 1450 }, { "epoch": 2.3068362480127185, "grad_norm": 1.0807210567397258, "learning_rate": 3.119798608819889e-05, "loss": 0.2256, "step": 1451 }, { "epoch": 2.3084260731319555, "grad_norm": 0.8586670655980936, "learning_rate": 3.120074869065238e-05, "loss": 0.1914, "step": 1452 }, { "epoch": 2.3100158982511925, "grad_norm": 1.9377225721977147, "learning_rate": 3.120351427133383e-05, "loss": 0.2266, "step": 1453 }, { "epoch": 2.3116057233704295, "grad_norm": 1.074866897397345, "learning_rate": 3.120628282930719e-05, "loss": 0.1655, "step": 1454 }, { "epoch": 2.313195548489666, "grad_norm": 1.9670171516378898, "learning_rate": 3.120905436363537e-05, "loss": 0.2426, "step": 1455 }, { "epoch": 2.314785373608903, "grad_norm": 2.139449917829324, "learning_rate": 3.1211828873380356e-05, "loss": 0.1572, "step": 1456 }, { "epoch": 2.31637519872814, "grad_norm": 2.063541341841825, "learning_rate": 3.121460635760302e-05, "loss": 0.2264, "step": 1457 }, { "epoch": 2.3179650238473766, "grad_norm": 2.4224371702730187, "learning_rate": 3.121738681536333e-05, "loss": 0.2049, "step": 1458 }, { "epoch": 2.3195548489666136, "grad_norm": 3.62872067157744, "learning_rate": 3.12201702457202e-05, "loss": 0.2589, "step": 1459 }, { "epoch": 2.3211446740858506, "grad_norm": 1.4396023571567318, "learning_rate": 3.122295664773151e-05, "loss": 0.18, "step": 1460 }, { "epoch": 2.3227344992050876, "grad_norm": 3.103519046325617, "learning_rate": 3.122574602045418e-05, "loss": 0.261, "step": 1461 }, { "epoch": 2.3243243243243246, "grad_norm": 2.5467438977510026, "learning_rate": 3.1228538362944116e-05, "loss": 0.1827, "step": 1462 }, { "epoch": 2.325914149443561, "grad_norm": 1.9904982085596932, "learning_rate": 3.1231333674256194e-05, "loss": 0.2013, "step": 1463 }, { "epoch": 2.327503974562798, "grad_norm": 1.373483581934219, "learning_rate": 3.123413195344432e-05, "loss": 0.1924, "step": 1464 }, { "epoch": 2.329093799682035, "grad_norm": 1.1076693552105425, "learning_rate": 3.123693319956137e-05, "loss": 0.1964, "step": 1465 }, { "epoch": 2.3306836248012717, "grad_norm": 3.007703511021293, "learning_rate": 3.123973741165922e-05, "loss": 0.2712, "step": 1466 }, { "epoch": 2.3322734499205087, "grad_norm": 1.7664803715549058, "learning_rate": 3.124254458878874e-05, "loss": 0.1569, "step": 1467 }, { "epoch": 2.3338632750397457, "grad_norm": 1.7564175490661225, "learning_rate": 3.124535472999982e-05, "loss": 0.2135, "step": 1468 }, { "epoch": 2.3354531001589827, "grad_norm": 1.4712268378911442, "learning_rate": 3.1248167834341324e-05, "loss": 0.2375, "step": 1469 }, { "epoch": 2.337042925278219, "grad_norm": 1.3563940373821437, "learning_rate": 3.125098390086111e-05, "loss": 0.252, "step": 1470 }, { "epoch": 2.338632750397456, "grad_norm": 1.465444794929157, "learning_rate": 3.125380292860604e-05, "loss": 0.2565, "step": 1471 }, { "epoch": 2.340222575516693, "grad_norm": 1.723767544562639, "learning_rate": 3.125662491662199e-05, "loss": 0.1954, "step": 1472 }, { "epoch": 2.34181240063593, "grad_norm": 1.6142840453062304, "learning_rate": 3.125944986395381e-05, "loss": 0.1923, "step": 1473 }, { "epoch": 2.3434022257551668, "grad_norm": 1.2379579648768835, "learning_rate": 3.1262277769645345e-05, "loss": 0.1736, "step": 1474 }, { "epoch": 2.3449920508744038, "grad_norm": 1.2350038003794837, "learning_rate": 3.1265108632739475e-05, "loss": 0.1613, "step": 1475 }, { "epoch": 2.3465818759936408, "grad_norm": 1.3831981324879117, "learning_rate": 3.126794245227805e-05, "loss": 0.2191, "step": 1476 }, { "epoch": 2.3481717011128778, "grad_norm": 2.1957354071496913, "learning_rate": 3.12707792273019e-05, "loss": 0.219, "step": 1477 }, { "epoch": 2.3497615262321143, "grad_norm": 1.878883005683085, "learning_rate": 3.127361895685091e-05, "loss": 0.2139, "step": 1478 }, { "epoch": 2.3513513513513513, "grad_norm": 0.8345237246048842, "learning_rate": 3.127646163996393e-05, "loss": 0.1472, "step": 1479 }, { "epoch": 2.3529411764705883, "grad_norm": 1.2364929495587558, "learning_rate": 3.1279307275678795e-05, "loss": 0.1458, "step": 1480 }, { "epoch": 2.3545310015898253, "grad_norm": 0.9867351119302167, "learning_rate": 3.128215586303238e-05, "loss": 0.234, "step": 1481 }, { "epoch": 2.356120826709062, "grad_norm": 1.223627732869825, "learning_rate": 3.128500740106052e-05, "loss": 0.1667, "step": 1482 }, { "epoch": 2.357710651828299, "grad_norm": 1.2289676538415812, "learning_rate": 3.1287861888798105e-05, "loss": 0.1516, "step": 1483 }, { "epoch": 2.359300476947536, "grad_norm": 1.319048088212486, "learning_rate": 3.1290719325278975e-05, "loss": 0.1729, "step": 1484 }, { "epoch": 2.360890302066773, "grad_norm": 1.0350250071026756, "learning_rate": 3.1293579709535983e-05, "loss": 0.1457, "step": 1485 }, { "epoch": 2.3624801271860094, "grad_norm": 1.953397548119439, "learning_rate": 3.1296443040601005e-05, "loss": 0.1837, "step": 1486 }, { "epoch": 2.3640699523052464, "grad_norm": 1.0521509408943108, "learning_rate": 3.12993093175049e-05, "loss": 0.1388, "step": 1487 }, { "epoch": 2.3656597774244834, "grad_norm": 1.26756396836801, "learning_rate": 3.130217853927755e-05, "loss": 0.1912, "step": 1488 }, { "epoch": 2.36724960254372, "grad_norm": 1.9305598386292797, "learning_rate": 3.130505070494781e-05, "loss": 0.231, "step": 1489 }, { "epoch": 2.368839427662957, "grad_norm": 7.252905728105691, "learning_rate": 3.130792581354357e-05, "loss": 0.9758, "step": 1490 }, { "epoch": 2.370429252782194, "grad_norm": 1.5431665380552189, "learning_rate": 3.1310803864091696e-05, "loss": 0.2032, "step": 1491 }, { "epoch": 2.372019077901431, "grad_norm": 2.656444420676739, "learning_rate": 3.1313684855618095e-05, "loss": 0.2217, "step": 1492 }, { "epoch": 2.373608903020668, "grad_norm": 1.556059919802715, "learning_rate": 3.1316568787147627e-05, "loss": 0.1659, "step": 1493 }, { "epoch": 2.3751987281399045, "grad_norm": 1.3698426597390472, "learning_rate": 3.1319455657704205e-05, "loss": 0.1839, "step": 1494 }, { "epoch": 2.3767885532591415, "grad_norm": 3.0423826028284724, "learning_rate": 3.132234546631072e-05, "loss": 0.3538, "step": 1495 }, { "epoch": 2.3783783783783785, "grad_norm": 1.4872500970599571, "learning_rate": 3.132523821198908e-05, "loss": 0.147, "step": 1496 }, { "epoch": 2.379968203497615, "grad_norm": 1.3747266674234895, "learning_rate": 3.132813389376019e-05, "loss": 0.1832, "step": 1497 }, { "epoch": 2.381558028616852, "grad_norm": 1.5486370602213821, "learning_rate": 3.133103251064397e-05, "loss": 0.2028, "step": 1498 }, { "epoch": 2.383147853736089, "grad_norm": 1.3118235203310677, "learning_rate": 3.1333934061659345e-05, "loss": 0.1406, "step": 1499 }, { "epoch": 2.384737678855326, "grad_norm": 0.8265749362964889, "learning_rate": 3.1336838545824255e-05, "loss": 0.1292, "step": 1500 }, { "epoch": 2.3863275039745626, "grad_norm": 1.1049670753997374, "learning_rate": 3.133974596215561e-05, "loss": 0.1853, "step": 1501 }, { "epoch": 2.3879173290937996, "grad_norm": 1.52507113499075, "learning_rate": 3.1342656309669384e-05, "loss": 0.1474, "step": 1502 }, { "epoch": 2.3895071542130366, "grad_norm": 1.0958351723191888, "learning_rate": 3.134556958738051e-05, "loss": 0.1782, "step": 1503 }, { "epoch": 2.3910969793322736, "grad_norm": 1.2887028391507356, "learning_rate": 3.1348485794302956e-05, "loss": 0.1773, "step": 1504 }, { "epoch": 2.39268680445151, "grad_norm": 0.9680746236960877, "learning_rate": 3.13514049294497e-05, "loss": 0.2013, "step": 1505 }, { "epoch": 2.394276629570747, "grad_norm": 3.4367125383495685, "learning_rate": 3.135432699183269e-05, "loss": 0.1734, "step": 1506 }, { "epoch": 2.395866454689984, "grad_norm": 1.057120088249028, "learning_rate": 3.1357251980462956e-05, "loss": 0.2048, "step": 1507 }, { "epoch": 2.397456279809221, "grad_norm": 1.1650375662031185, "learning_rate": 3.1360179894350465e-05, "loss": 0.1992, "step": 1508 }, { "epoch": 2.3990461049284577, "grad_norm": 1.7979175555589268, "learning_rate": 3.136311073250424e-05, "loss": 0.1266, "step": 1509 }, { "epoch": 2.4006359300476947, "grad_norm": 1.1944252438466139, "learning_rate": 3.136604449393228e-05, "loss": 0.1409, "step": 1510 }, { "epoch": 2.4022257551669317, "grad_norm": 30.518064575713485, "learning_rate": 3.136898117764164e-05, "loss": 26.6968, "step": 1511 }, { "epoch": 2.4038155802861687, "grad_norm": 28.21925598666198, "learning_rate": 3.1371920782638336e-05, "loss": 26.635, "step": 1512 }, { "epoch": 2.4054054054054053, "grad_norm": 2.0987458298031303, "learning_rate": 3.137486330792742e-05, "loss": 0.2136, "step": 1513 }, { "epoch": 2.4069952305246423, "grad_norm": 1.0021039433603938, "learning_rate": 3.137780875251297e-05, "loss": 0.2344, "step": 1514 }, { "epoch": 2.4085850556438793, "grad_norm": 1.1394833635766866, "learning_rate": 3.138075711539805e-05, "loss": 0.1472, "step": 1515 }, { "epoch": 2.4101748807631163, "grad_norm": 0.9624810031376568, "learning_rate": 3.138370839558474e-05, "loss": 0.18, "step": 1516 }, { "epoch": 2.411764705882353, "grad_norm": 1.2790589209620769, "learning_rate": 3.138666259207415e-05, "loss": 0.2613, "step": 1517 }, { "epoch": 2.41335453100159, "grad_norm": 1.1680169508386977, "learning_rate": 3.138961970386638e-05, "loss": 0.1454, "step": 1518 }, { "epoch": 2.414944356120827, "grad_norm": 0.8273120941151018, "learning_rate": 3.1392579729960564e-05, "loss": 0.1644, "step": 1519 }, { "epoch": 2.4165341812400634, "grad_norm": 1.150587108819813, "learning_rate": 3.139554266935484e-05, "loss": 0.1579, "step": 1520 }, { "epoch": 2.4181240063593004, "grad_norm": 34.95835080594549, "learning_rate": 3.1398508521046344e-05, "loss": 25.178, "step": 1521 }, { "epoch": 2.4197138314785374, "grad_norm": 0.8969091537680663, "learning_rate": 3.1401477284031273e-05, "loss": 0.1639, "step": 1522 }, { "epoch": 2.4213036565977744, "grad_norm": 34.49501627345301, "learning_rate": 3.140444895730478e-05, "loss": 24.3633, "step": 1523 }, { "epoch": 2.4228934817170114, "grad_norm": 0.8764485747777683, "learning_rate": 3.140742353986106e-05, "loss": 0.1497, "step": 1524 }, { "epoch": 2.424483306836248, "grad_norm": 0.6552112156199094, "learning_rate": 3.141040103069335e-05, "loss": 0.1249, "step": 1525 }, { "epoch": 2.426073131955485, "grad_norm": 1.1845888791516181, "learning_rate": 3.141338142879387e-05, "loss": 0.1936, "step": 1526 }, { "epoch": 2.427662957074722, "grad_norm": 2.4087522854115533, "learning_rate": 3.141636473315384e-05, "loss": 0.2414, "step": 1527 }, { "epoch": 2.4292527821939585, "grad_norm": 1.0040074881775796, "learning_rate": 3.1419350942763557e-05, "loss": 0.2171, "step": 1528 }, { "epoch": 2.4308426073131955, "grad_norm": 1.130652196648176, "learning_rate": 3.142234005661226e-05, "loss": 0.191, "step": 1529 }, { "epoch": 2.4324324324324325, "grad_norm": 1.140773132991218, "learning_rate": 3.142533207368826e-05, "loss": 0.1944, "step": 1530 }, { "epoch": 2.4340222575516695, "grad_norm": 0.9822099132244579, "learning_rate": 3.1428326992978875e-05, "loss": 0.2321, "step": 1531 }, { "epoch": 2.435612082670906, "grad_norm": 0.9373274885012641, "learning_rate": 3.143132481347042e-05, "loss": 0.1313, "step": 1532 }, { "epoch": 2.437201907790143, "grad_norm": 0.8130735396798979, "learning_rate": 3.143432553414825e-05, "loss": 0.1926, "step": 1533 }, { "epoch": 2.43879173290938, "grad_norm": 2.3315274090712337, "learning_rate": 3.143732915399672e-05, "loss": 0.3846, "step": 1534 }, { "epoch": 2.440381558028617, "grad_norm": 1.0935092122841343, "learning_rate": 3.144033567199922e-05, "loss": 0.1562, "step": 1535 }, { "epoch": 2.4419713831478536, "grad_norm": 1.43334673121355, "learning_rate": 3.1443345087138156e-05, "loss": 0.188, "step": 1536 }, { "epoch": 2.4435612082670906, "grad_norm": 0.9968599019221744, "learning_rate": 3.144635739839493e-05, "loss": 0.1672, "step": 1537 }, { "epoch": 2.4451510333863276, "grad_norm": 1.319604075396449, "learning_rate": 3.144937260475001e-05, "loss": 0.1569, "step": 1538 }, { "epoch": 2.4467408585055646, "grad_norm": 1.127694113037551, "learning_rate": 3.145239070518285e-05, "loss": 0.1736, "step": 1539 }, { "epoch": 2.448330683624801, "grad_norm": 1.312037599884244, "learning_rate": 3.145541169867192e-05, "loss": 0.1597, "step": 1540 }, { "epoch": 2.449920508744038, "grad_norm": 2.1475541168287817, "learning_rate": 3.145843558419474e-05, "loss": 0.2302, "step": 1541 }, { "epoch": 2.451510333863275, "grad_norm": 1.4954514035933968, "learning_rate": 3.146146236072783e-05, "loss": 0.2064, "step": 1542 }, { "epoch": 2.453100158982512, "grad_norm": 1.0531233797293011, "learning_rate": 3.1464492027246734e-05, "loss": 0.1396, "step": 1543 }, { "epoch": 2.4546899841017487, "grad_norm": 1.2449728626300245, "learning_rate": 3.1467524582726e-05, "loss": 0.1279, "step": 1544 }, { "epoch": 2.4562798092209857, "grad_norm": 1.1274244747338644, "learning_rate": 3.147056002613925e-05, "loss": 0.1711, "step": 1545 }, { "epoch": 2.4578696343402227, "grad_norm": 1.7982378255027534, "learning_rate": 3.147359835645908e-05, "loss": 0.169, "step": 1546 }, { "epoch": 2.4594594594594597, "grad_norm": 0.8689095376740332, "learning_rate": 3.1476639572657125e-05, "loss": 0.1984, "step": 1547 }, { "epoch": 2.461049284578696, "grad_norm": 1.0218448093979877, "learning_rate": 3.147968367370404e-05, "loss": 0.1839, "step": 1548 }, { "epoch": 2.462639109697933, "grad_norm": 1.5260519817874239, "learning_rate": 3.1482730658569524e-05, "loss": 0.19, "step": 1549 }, { "epoch": 2.46422893481717, "grad_norm": 27.353099202814303, "learning_rate": 3.148578052622227e-05, "loss": 25.1552, "step": 1550 }, { "epoch": 2.4658187599364068, "grad_norm": 1.2370950473362627, "learning_rate": 3.148883327563e-05, "loss": 0.1732, "step": 1551 }, { "epoch": 2.4674085850556438, "grad_norm": 1.782406424734523, "learning_rate": 3.1491888905759486e-05, "loss": 0.2094, "step": 1552 }, { "epoch": 2.4689984101748808, "grad_norm": 4.058523803805113, "learning_rate": 3.1494947415576506e-05, "loss": 0.3051, "step": 1553 }, { "epoch": 2.4705882352941178, "grad_norm": 1.7735133259534277, "learning_rate": 3.1498008804045846e-05, "loss": 0.1888, "step": 1554 }, { "epoch": 2.4721780604133547, "grad_norm": 1.3410614826828207, "learning_rate": 3.150107307013136e-05, "loss": 0.1547, "step": 1555 }, { "epoch": 2.4737678855325913, "grad_norm": 3.1654860002357523, "learning_rate": 3.150414021279589e-05, "loss": 0.2041, "step": 1556 }, { "epoch": 2.4753577106518283, "grad_norm": 2.1675021409470148, "learning_rate": 3.150721023100133e-05, "loss": 0.2697, "step": 1557 }, { "epoch": 2.4769475357710653, "grad_norm": 1.359752284577795, "learning_rate": 3.151028312370859e-05, "loss": 0.1854, "step": 1558 }, { "epoch": 2.478537360890302, "grad_norm": 1.7383788340055064, "learning_rate": 3.151335888987759e-05, "loss": 0.1989, "step": 1559 }, { "epoch": 2.480127186009539, "grad_norm": 1.836117580380345, "learning_rate": 3.1516437528467315e-05, "loss": 0.1769, "step": 1560 }, { "epoch": 2.481717011128776, "grad_norm": 1.8333286667868176, "learning_rate": 3.151951903843574e-05, "loss": 0.1817, "step": 1561 }, { "epoch": 2.483306836248013, "grad_norm": 2.120279631244857, "learning_rate": 3.15226034187399e-05, "loss": 0.1772, "step": 1562 }, { "epoch": 2.48489666136725, "grad_norm": 2.741404407263717, "learning_rate": 3.152569066833584e-05, "loss": 0.1939, "step": 1563 }, { "epoch": 2.4864864864864864, "grad_norm": 1.8914845041859072, "learning_rate": 3.1528780786178625e-05, "loss": 0.146, "step": 1564 }, { "epoch": 2.4880763116057234, "grad_norm": 1.5340942829081892, "learning_rate": 3.153187377122239e-05, "loss": 0.1939, "step": 1565 }, { "epoch": 2.4896661367249604, "grad_norm": 1.7439342919233336, "learning_rate": 3.153496962242024e-05, "loss": 0.1876, "step": 1566 }, { "epoch": 2.491255961844197, "grad_norm": 1.61316043881466, "learning_rate": 3.1538068338724364e-05, "loss": 0.1702, "step": 1567 }, { "epoch": 2.492845786963434, "grad_norm": 2.6287178758888725, "learning_rate": 3.1541169919085945e-05, "loss": 0.2057, "step": 1568 }, { "epoch": 2.494435612082671, "grad_norm": 1.3821009699608557, "learning_rate": 3.1544274362455215e-05, "loss": 0.1686, "step": 1569 }, { "epoch": 2.496025437201908, "grad_norm": 2.3644522206373892, "learning_rate": 3.154738166778144e-05, "loss": 0.2875, "step": 1570 }, { "epoch": 2.4976152623211445, "grad_norm": 1.824177878410893, "learning_rate": 3.1550491834012896e-05, "loss": 0.2011, "step": 1571 }, { "epoch": 2.4992050874403815, "grad_norm": 1.7540490819874688, "learning_rate": 3.155360486009692e-05, "loss": 0.1459, "step": 1572 }, { "epoch": 2.5007949125596185, "grad_norm": 1.1836154367270562, "learning_rate": 3.1556720744979846e-05, "loss": 0.174, "step": 1573 }, { "epoch": 2.502384737678855, "grad_norm": 1.971247818786658, "learning_rate": 3.155983948760708e-05, "loss": 0.164, "step": 1574 }, { "epoch": 2.503974562798092, "grad_norm": 2.259937106561657, "learning_rate": 3.1562961086923025e-05, "loss": 0.1838, "step": 1575 }, { "epoch": 2.505564387917329, "grad_norm": 1.3556363011844528, "learning_rate": 3.156608554187115e-05, "loss": 0.159, "step": 1576 }, { "epoch": 2.507154213036566, "grad_norm": 1.6360385009731324, "learning_rate": 3.1569212851393915e-05, "loss": 0.221, "step": 1577 }, { "epoch": 2.508744038155803, "grad_norm": 2.086187548519359, "learning_rate": 3.157234301443286e-05, "loss": 0.1374, "step": 1578 }, { "epoch": 2.5103338632750396, "grad_norm": 1.0095470304857215, "learning_rate": 3.1575476029928524e-05, "loss": 0.1724, "step": 1579 }, { "epoch": 2.5119236883942766, "grad_norm": 1.6433520231243424, "learning_rate": 3.157861189682051e-05, "loss": 0.232, "step": 1580 }, { "epoch": 2.5135135135135136, "grad_norm": 1.0328133474390058, "learning_rate": 3.1581750614047434e-05, "loss": 0.1761, "step": 1581 }, { "epoch": 2.51510333863275, "grad_norm": 1.2690146738591945, "learning_rate": 3.158489218054693e-05, "loss": 0.2007, "step": 1582 }, { "epoch": 2.516693163751987, "grad_norm": 1.246144047347006, "learning_rate": 3.1588036595255746e-05, "loss": 0.1999, "step": 1583 }, { "epoch": 2.518282988871224, "grad_norm": 1.0850788097137907, "learning_rate": 3.159118385710955e-05, "loss": 0.1901, "step": 1584 }, { "epoch": 2.519872813990461, "grad_norm": 0.8284641994965761, "learning_rate": 3.159433396504316e-05, "loss": 0.1658, "step": 1585 }, { "epoch": 2.521462639109698, "grad_norm": 0.9720658936117254, "learning_rate": 3.1597486917990346e-05, "loss": 0.1578, "step": 1586 }, { "epoch": 2.5230524642289347, "grad_norm": 1.840114119924228, "learning_rate": 3.1600642714883954e-05, "loss": 0.221, "step": 1587 }, { "epoch": 2.5246422893481717, "grad_norm": 2.377086664673885, "learning_rate": 3.1603801354655866e-05, "loss": 0.1853, "step": 1588 }, { "epoch": 2.5262321144674087, "grad_norm": 2.0315889551445405, "learning_rate": 3.1606962836237004e-05, "loss": 0.1776, "step": 1589 }, { "epoch": 2.5278219395866453, "grad_norm": 1.9891518136174857, "learning_rate": 3.1610127158557295e-05, "loss": 0.1738, "step": 1590 }, { "epoch": 2.5294117647058822, "grad_norm": 1.8688675727531665, "learning_rate": 3.161329432054576e-05, "loss": 0.2659, "step": 1591 }, { "epoch": 2.5310015898251192, "grad_norm": 1.7867401394045739, "learning_rate": 3.161646432113042e-05, "loss": 0.1975, "step": 1592 }, { "epoch": 2.5325914149443562, "grad_norm": 2.171421626126862, "learning_rate": 3.161963715923833e-05, "loss": 0.242, "step": 1593 }, { "epoch": 2.5341812400635932, "grad_norm": 2.6943614232804096, "learning_rate": 3.1622812833795616e-05, "loss": 0.2038, "step": 1594 }, { "epoch": 2.53577106518283, "grad_norm": 0.8935098063562663, "learning_rate": 3.1625991343727414e-05, "loss": 0.2242, "step": 1595 }, { "epoch": 2.537360890302067, "grad_norm": 1.7616556618538113, "learning_rate": 3.162917268795793e-05, "loss": 0.1934, "step": 1596 }, { "epoch": 2.538950715421304, "grad_norm": 1.6185284022114828, "learning_rate": 3.163235686541038e-05, "loss": 0.1785, "step": 1597 }, { "epoch": 2.5405405405405403, "grad_norm": 2.3720044892681322, "learning_rate": 3.163554387500705e-05, "loss": 0.2387, "step": 1598 }, { "epoch": 2.5421303656597773, "grad_norm": 1.1746212556196753, "learning_rate": 3.1638733715669226e-05, "loss": 0.1546, "step": 1599 }, { "epoch": 2.5437201907790143, "grad_norm": 2.0582307775588706, "learning_rate": 3.16419263863173e-05, "loss": 0.1576, "step": 1600 }, { "epoch": 2.5453100158982513, "grad_norm": 1.425021499342463, "learning_rate": 3.164512188587064e-05, "loss": 0.15, "step": 1601 }, { "epoch": 2.5468998410174883, "grad_norm": 4.712356314712078, "learning_rate": 3.164832021324768e-05, "loss": 0.5045, "step": 1602 }, { "epoch": 2.548489666136725, "grad_norm": 1.1145134267667107, "learning_rate": 3.165152136736593e-05, "loss": 0.1196, "step": 1603 }, { "epoch": 2.550079491255962, "grad_norm": 1.5850706192031563, "learning_rate": 3.16547253471419e-05, "loss": 0.2033, "step": 1604 }, { "epoch": 2.551669316375199, "grad_norm": 1.1366371495805372, "learning_rate": 3.165793215149116e-05, "loss": 0.1293, "step": 1605 }, { "epoch": 2.5532591414944354, "grad_norm": 0.9952876982851481, "learning_rate": 3.1661141779328316e-05, "loss": 0.1959, "step": 1606 }, { "epoch": 2.5548489666136724, "grad_norm": 1.2205563642082213, "learning_rate": 3.1664354229567046e-05, "loss": 0.1932, "step": 1607 }, { "epoch": 2.5564387917329094, "grad_norm": 1.2206997124155936, "learning_rate": 3.1667569501120016e-05, "loss": 0.2295, "step": 1608 }, { "epoch": 2.5580286168521464, "grad_norm": 0.8037883586730251, "learning_rate": 3.167078759289901e-05, "loss": 0.2214, "step": 1609 }, { "epoch": 2.559618441971383, "grad_norm": 0.9903893497716555, "learning_rate": 3.1674008503814794e-05, "loss": 0.1528, "step": 1610 }, { "epoch": 2.56120826709062, "grad_norm": 1.1079250554104005, "learning_rate": 3.167723223277722e-05, "loss": 0.2337, "step": 1611 }, { "epoch": 2.562798092209857, "grad_norm": 0.805272238731337, "learning_rate": 3.168045877869518e-05, "loss": 0.2316, "step": 1612 }, { "epoch": 2.5643879173290935, "grad_norm": 0.9296334785312509, "learning_rate": 3.168368814047658e-05, "loss": 0.146, "step": 1613 }, { "epoch": 2.5659777424483305, "grad_norm": 1.1095332688567827, "learning_rate": 3.168692031702842e-05, "loss": 0.137, "step": 1614 }, { "epoch": 2.5675675675675675, "grad_norm": 1.195186347935812, "learning_rate": 3.169015530725672e-05, "loss": 0.1781, "step": 1615 }, { "epoch": 2.5691573926868045, "grad_norm": 1.2188215958285262, "learning_rate": 3.169339311006655e-05, "loss": 0.2506, "step": 1616 }, { "epoch": 2.5707472178060415, "grad_norm": 1.8952347764631603, "learning_rate": 3.1696633724362025e-05, "loss": 0.1658, "step": 1617 }, { "epoch": 2.572337042925278, "grad_norm": 1.1303299911350377, "learning_rate": 3.1699877149046325e-05, "loss": 0.1467, "step": 1618 }, { "epoch": 2.573926868044515, "grad_norm": 1.272771800812583, "learning_rate": 3.1703123383021666e-05, "loss": 0.1791, "step": 1619 }, { "epoch": 2.575516693163752, "grad_norm": 1.477385369713666, "learning_rate": 3.170637242518931e-05, "loss": 0.1206, "step": 1620 }, { "epoch": 2.5771065182829886, "grad_norm": 1.1719991964586132, "learning_rate": 3.170962427444958e-05, "loss": 0.14, "step": 1621 }, { "epoch": 2.5786963434022256, "grad_norm": 1.9668466686565567, "learning_rate": 3.1712878929701844e-05, "loss": 0.2481, "step": 1622 }, { "epoch": 2.5802861685214626, "grad_norm": 0.7302787952789077, "learning_rate": 3.171613638984451e-05, "loss": 0.1544, "step": 1623 }, { "epoch": 2.5818759936406996, "grad_norm": 0.9105475085363616, "learning_rate": 3.171939665377506e-05, "loss": 0.1962, "step": 1624 }, { "epoch": 2.5834658187599366, "grad_norm": 1.2853315261327518, "learning_rate": 3.172265972039e-05, "loss": 0.1766, "step": 1625 }, { "epoch": 2.585055643879173, "grad_norm": 0.7011321641667215, "learning_rate": 3.17259255885849e-05, "loss": 0.1593, "step": 1626 }, { "epoch": 2.58664546899841, "grad_norm": 4.172950592456332, "learning_rate": 3.172919425725438e-05, "loss": 0.3516, "step": 1627 }, { "epoch": 2.588235294117647, "grad_norm": 0.6970200702320729, "learning_rate": 3.1732465725292126e-05, "loss": 0.1466, "step": 1628 }, { "epoch": 2.5898251192368837, "grad_norm": 1.3745930167631675, "learning_rate": 3.173573999159086e-05, "loss": 0.2465, "step": 1629 }, { "epoch": 2.5914149443561207, "grad_norm": 21.45415587211635, "learning_rate": 3.1739017055042365e-05, "loss": 12.3912, "step": 1630 }, { "epoch": 2.5930047694753577, "grad_norm": 0.6469654376167354, "learning_rate": 3.174229691453746e-05, "loss": 0.1966, "step": 1631 }, { "epoch": 2.5945945945945947, "grad_norm": 1.2012334629515304, "learning_rate": 3.174557956896604e-05, "loss": 0.1595, "step": 1632 }, { "epoch": 2.5961844197138317, "grad_norm": 1.067169130312683, "learning_rate": 3.174886501721705e-05, "loss": 0.1718, "step": 1633 }, { "epoch": 2.5977742448330683, "grad_norm": 1.1754927638047044, "learning_rate": 3.175215325817848e-05, "loss": 0.161, "step": 1634 }, { "epoch": 2.5993640699523053, "grad_norm": 33.8228033276979, "learning_rate": 3.1755444290737376e-05, "loss": 23.8961, "step": 1635 }, { "epoch": 2.6009538950715423, "grad_norm": 1.389730707345103, "learning_rate": 3.175873811377985e-05, "loss": 0.1651, "step": 1636 }, { "epoch": 2.602543720190779, "grad_norm": 1.032629610400898, "learning_rate": 3.176203472619105e-05, "loss": 0.2002, "step": 1637 }, { "epoch": 2.604133545310016, "grad_norm": 1.8994430286847295, "learning_rate": 3.176533412685521e-05, "loss": 0.1939, "step": 1638 }, { "epoch": 2.605723370429253, "grad_norm": 1.3019931066620578, "learning_rate": 3.176863631465559e-05, "loss": 0.1492, "step": 1639 }, { "epoch": 2.60731319554849, "grad_norm": 5.501726750465694, "learning_rate": 3.177194128847451e-05, "loss": 0.3948, "step": 1640 }, { "epoch": 2.6089030206677264, "grad_norm": 0.8538889161897999, "learning_rate": 3.177524904719337e-05, "loss": 0.1582, "step": 1641 }, { "epoch": 2.6104928457869634, "grad_norm": 0.6587350500729588, "learning_rate": 3.177855958969263e-05, "loss": 0.1789, "step": 1642 }, { "epoch": 2.6120826709062004, "grad_norm": 2.3652042751342557, "learning_rate": 3.1781872914851756e-05, "loss": 0.2941, "step": 1643 }, { "epoch": 2.613672496025437, "grad_norm": 1.486225739504669, "learning_rate": 3.178518902154933e-05, "loss": 0.1594, "step": 1644 }, { "epoch": 2.615262321144674, "grad_norm": 1.8505554207598498, "learning_rate": 3.178850790866296e-05, "loss": 0.1976, "step": 1645 }, { "epoch": 2.616852146263911, "grad_norm": 1.1723249654038448, "learning_rate": 3.179182957506933e-05, "loss": 0.1911, "step": 1646 }, { "epoch": 2.618441971383148, "grad_norm": 1.3728987602444567, "learning_rate": 3.179515401964417e-05, "loss": 0.249, "step": 1647 }, { "epoch": 2.620031796502385, "grad_norm": 1.0593486516109556, "learning_rate": 3.1798481241262284e-05, "loss": 0.1723, "step": 1648 }, { "epoch": 2.6216216216216215, "grad_norm": 1.281494858602922, "learning_rate": 3.1801811238797515e-05, "loss": 0.2096, "step": 1649 }, { "epoch": 2.6232114467408585, "grad_norm": 1.5552829966348858, "learning_rate": 3.1805144011122795e-05, "loss": 0.2733, "step": 1650 }, { "epoch": 2.6248012718600955, "grad_norm": 0.8680567447248302, "learning_rate": 3.180847955711008e-05, "loss": 0.1482, "step": 1651 }, { "epoch": 2.626391096979332, "grad_norm": 1.6052048022167813, "learning_rate": 3.181181787563043e-05, "loss": 0.1218, "step": 1652 }, { "epoch": 2.627980922098569, "grad_norm": 1.5389332831609386, "learning_rate": 3.181515896555394e-05, "loss": 0.196, "step": 1653 }, { "epoch": 2.629570747217806, "grad_norm": 1.4382613782415516, "learning_rate": 3.181850282574977e-05, "loss": 0.1883, "step": 1654 }, { "epoch": 2.631160572337043, "grad_norm": 1.78131454821202, "learning_rate": 3.182184945508613e-05, "loss": 0.2171, "step": 1655 }, { "epoch": 2.63275039745628, "grad_norm": 1.5959026050327163, "learning_rate": 3.182519885243033e-05, "loss": 0.167, "step": 1656 }, { "epoch": 2.6343402225755166, "grad_norm": 1.4506501146841084, "learning_rate": 3.182855101664872e-05, "loss": 0.1826, "step": 1657 }, { "epoch": 2.6359300476947536, "grad_norm": 1.2336988576159258, "learning_rate": 3.183190594660669e-05, "loss": 0.19, "step": 1658 }, { "epoch": 2.6375198728139906, "grad_norm": 1.7568890220037732, "learning_rate": 3.183526364116874e-05, "loss": 0.2242, "step": 1659 }, { "epoch": 2.639109697933227, "grad_norm": 1.9760264122780529, "learning_rate": 3.1838624099198396e-05, "loss": 0.2328, "step": 1660 }, { "epoch": 2.640699523052464, "grad_norm": 2.0053734765637574, "learning_rate": 3.184198731955827e-05, "loss": 0.1501, "step": 1661 }, { "epoch": 2.642289348171701, "grad_norm": 2.258285985704613, "learning_rate": 3.184535330111005e-05, "loss": 0.185, "step": 1662 }, { "epoch": 2.643879173290938, "grad_norm": 2.3558932735456786, "learning_rate": 3.1848722042714454e-05, "loss": 0.2384, "step": 1663 }, { "epoch": 2.645468998410175, "grad_norm": 0.8834694005323286, "learning_rate": 3.185209354323129e-05, "loss": 0.1489, "step": 1664 }, { "epoch": 2.6470588235294117, "grad_norm": 1.0315247898130053, "learning_rate": 3.185546780151943e-05, "loss": 0.1556, "step": 1665 }, { "epoch": 2.6486486486486487, "grad_norm": 1.414992440838794, "learning_rate": 3.1858844816436804e-05, "loss": 0.1532, "step": 1666 }, { "epoch": 2.6502384737678857, "grad_norm": 1.543054409974727, "learning_rate": 3.1862224586840425e-05, "loss": 0.1717, "step": 1667 }, { "epoch": 2.6518282988871222, "grad_norm": 1.2348687419858817, "learning_rate": 3.186560711158635e-05, "loss": 0.1859, "step": 1668 }, { "epoch": 2.6534181240063592, "grad_norm": 1.0777269835329621, "learning_rate": 3.186899238952972e-05, "loss": 0.1785, "step": 1669 }, { "epoch": 2.6550079491255962, "grad_norm": 1.650655142566561, "learning_rate": 3.1872380419524755e-05, "loss": 0.1778, "step": 1670 }, { "epoch": 2.6565977742448332, "grad_norm": 1.7929541694054434, "learning_rate": 3.187577120042471e-05, "loss": 0.1782, "step": 1671 }, { "epoch": 2.65818759936407, "grad_norm": 1.435896564322184, "learning_rate": 3.1879164731081936e-05, "loss": 0.1707, "step": 1672 }, { "epoch": 2.659777424483307, "grad_norm": 1.7621159292367217, "learning_rate": 3.188256101034785e-05, "loss": 0.1856, "step": 1673 }, { "epoch": 2.661367249602544, "grad_norm": 1.3976184816737527, "learning_rate": 3.1885960037072935e-05, "loss": 0.2045, "step": 1674 }, { "epoch": 2.6629570747217803, "grad_norm": 1.2330924249145283, "learning_rate": 3.1889361810106736e-05, "loss": 0.1586, "step": 1675 }, { "epoch": 2.6645468998410173, "grad_norm": 1.3158885034398211, "learning_rate": 3.1892766328297876e-05, "loss": 0.1507, "step": 1676 }, { "epoch": 2.6661367249602543, "grad_norm": 1.1200144607379563, "learning_rate": 3.189617359049406e-05, "loss": 0.1689, "step": 1677 }, { "epoch": 2.6677265500794913, "grad_norm": 1.7236185066275653, "learning_rate": 3.189958359554204e-05, "loss": 0.2984, "step": 1678 }, { "epoch": 2.6693163751987283, "grad_norm": 1.355741214581686, "learning_rate": 3.1902996342287666e-05, "loss": 0.2031, "step": 1679 }, { "epoch": 2.670906200317965, "grad_norm": 1.5073381933331391, "learning_rate": 3.1906411829575835e-05, "loss": 0.1832, "step": 1680 }, { "epoch": 2.672496025437202, "grad_norm": 0.8702560004838332, "learning_rate": 3.190983005625053e-05, "loss": 0.1488, "step": 1681 }, { "epoch": 2.674085850556439, "grad_norm": 1.03058745025465, "learning_rate": 3.191325102115481e-05, "loss": 0.1792, "step": 1682 }, { "epoch": 2.6756756756756754, "grad_norm": 1.923291188689511, "learning_rate": 3.19166747231308e-05, "loss": 0.2635, "step": 1683 }, { "epoch": 2.6772655007949124, "grad_norm": 1.6836533719738307, "learning_rate": 3.19201011610197e-05, "loss": 0.249, "step": 1684 }, { "epoch": 2.6788553259141494, "grad_norm": 1.964818303051795, "learning_rate": 3.1923530333661784e-05, "loss": 0.1682, "step": 1685 }, { "epoch": 2.6804451510333864, "grad_norm": 1.3778456952100422, "learning_rate": 3.1926962239896404e-05, "loss": 0.2093, "step": 1686 }, { "epoch": 2.6820349761526234, "grad_norm": 1.27779326758411, "learning_rate": 3.193039687856198e-05, "loss": 0.1741, "step": 1687 }, { "epoch": 2.68362480127186, "grad_norm": 1.4621100523109438, "learning_rate": 3.1933834248496016e-05, "loss": 0.1598, "step": 1688 }, { "epoch": 2.685214626391097, "grad_norm": 0.8665263222298191, "learning_rate": 3.193727434853508e-05, "loss": 0.1512, "step": 1689 }, { "epoch": 2.686804451510334, "grad_norm": 1.7393494446613103, "learning_rate": 3.194071717751484e-05, "loss": 0.2215, "step": 1690 }, { "epoch": 2.6883942766295705, "grad_norm": 1.423530279081555, "learning_rate": 3.1944162734270007e-05, "loss": 0.1694, "step": 1691 }, { "epoch": 2.6899841017488075, "grad_norm": 1.0113582197331754, "learning_rate": 3.194761101763439e-05, "loss": 0.1793, "step": 1692 }, { "epoch": 2.6915739268680445, "grad_norm": 1.6972460013817485, "learning_rate": 3.1951062026440854e-05, "loss": 0.1699, "step": 1693 }, { "epoch": 2.6931637519872815, "grad_norm": 1.4275489492514701, "learning_rate": 3.195451575952138e-05, "loss": 0.2191, "step": 1694 }, { "epoch": 2.6947535771065185, "grad_norm": 1.122093489103576, "learning_rate": 3.1957972215707e-05, "loss": 0.2306, "step": 1695 }, { "epoch": 2.696343402225755, "grad_norm": 0.7456333724225834, "learning_rate": 3.196143139382783e-05, "loss": 0.155, "step": 1696 }, { "epoch": 2.697933227344992, "grad_norm": 1.477067741269703, "learning_rate": 3.196489329271305e-05, "loss": 0.1922, "step": 1697 }, { "epoch": 2.699523052464229, "grad_norm": 1.355615214597007, "learning_rate": 3.1968357911190936e-05, "loss": 0.191, "step": 1698 }, { "epoch": 2.7011128775834656, "grad_norm": 1.0408488873943331, "learning_rate": 3.1971825248088855e-05, "loss": 0.1966, "step": 1699 }, { "epoch": 2.7027027027027026, "grad_norm": 1.13717420157594, "learning_rate": 3.197529530223323e-05, "loss": 0.1942, "step": 1700 }, { "epoch": 2.7042925278219396, "grad_norm": 2.2414440722425377, "learning_rate": 3.197876807244956e-05, "loss": 0.2886, "step": 1701 }, { "epoch": 2.7058823529411766, "grad_norm": 1.3631627310146235, "learning_rate": 3.198224355756246e-05, "loss": 0.2118, "step": 1702 }, { "epoch": 2.7074721780604136, "grad_norm": 1.6438113179195133, "learning_rate": 3.1985721756395596e-05, "loss": 0.2801, "step": 1703 }, { "epoch": 2.70906200317965, "grad_norm": 1.296013225608353, "learning_rate": 3.198920266777171e-05, "loss": 0.1591, "step": 1704 }, { "epoch": 2.710651828298887, "grad_norm": 1.1902862599409112, "learning_rate": 3.199268629051267e-05, "loss": 0.2281, "step": 1705 }, { "epoch": 2.7122416534181237, "grad_norm": 1.3033252061979184, "learning_rate": 3.1996172623439363e-05, "loss": 0.2097, "step": 1706 }, { "epoch": 2.7138314785373607, "grad_norm": 1.3567354808550032, "learning_rate": 3.1999661665371815e-05, "loss": 0.1386, "step": 1707 }, { "epoch": 2.7154213036565977, "grad_norm": 1.1980831364476576, "learning_rate": 3.2003153415129096e-05, "loss": 0.2669, "step": 1708 }, { "epoch": 2.7170111287758347, "grad_norm": 0.8758886678648288, "learning_rate": 3.2006647871529385e-05, "loss": 0.1794, "step": 1709 }, { "epoch": 2.7186009538950717, "grad_norm": 2.2770119920425924, "learning_rate": 3.201014503338993e-05, "loss": 0.1909, "step": 1710 }, { "epoch": 2.7201907790143083, "grad_norm": 1.3578763903897242, "learning_rate": 3.201364489952707e-05, "loss": 0.2351, "step": 1711 }, { "epoch": 2.7217806041335453, "grad_norm": 1.4609257473020034, "learning_rate": 3.201714746875623e-05, "loss": 0.2299, "step": 1712 }, { "epoch": 2.7233704292527823, "grad_norm": 1.282090891499825, "learning_rate": 3.202065273989192e-05, "loss": 0.2211, "step": 1713 }, { "epoch": 2.724960254372019, "grad_norm": 0.8559316511104884, "learning_rate": 3.202416071174771e-05, "loss": 0.1496, "step": 1714 }, { "epoch": 2.726550079491256, "grad_norm": 32.53751570442245, "learning_rate": 3.202767138313631e-05, "loss": 23.3768, "step": 1715 }, { "epoch": 2.728139904610493, "grad_norm": 1.3776538876684823, "learning_rate": 3.203118475286947e-05, "loss": 0.1701, "step": 1716 }, { "epoch": 2.72972972972973, "grad_norm": 1.187766910298497, "learning_rate": 3.2034700819758046e-05, "loss": 0.2013, "step": 1717 }, { "epoch": 2.731319554848967, "grad_norm": 1.567732228678719, "learning_rate": 3.203821958261196e-05, "loss": 0.189, "step": 1718 }, { "epoch": 2.7329093799682034, "grad_norm": 1.1703829067350333, "learning_rate": 3.204174104024026e-05, "loss": 0.1996, "step": 1719 }, { "epoch": 2.7344992050874404, "grad_norm": 0.8792507982491055, "learning_rate": 3.204526519145105e-05, "loss": 0.1807, "step": 1720 }, { "epoch": 2.7360890302066774, "grad_norm": 1.8268282056630911, "learning_rate": 3.204879203505152e-05, "loss": 0.1598, "step": 1721 }, { "epoch": 2.737678855325914, "grad_norm": 1.6029457919985353, "learning_rate": 3.205232156984799e-05, "loss": 0.1829, "step": 1722 }, { "epoch": 2.739268680445151, "grad_norm": 1.2319465015828268, "learning_rate": 3.205585379464582e-05, "loss": 0.1588, "step": 1723 }, { "epoch": 2.740858505564388, "grad_norm": 1.7712858001605818, "learning_rate": 3.2059388708249486e-05, "loss": 0.2265, "step": 1724 }, { "epoch": 2.742448330683625, "grad_norm": 0.8573729718573424, "learning_rate": 3.206292630946255e-05, "loss": 0.1771, "step": 1725 }, { "epoch": 2.744038155802862, "grad_norm": 1.4809619002048446, "learning_rate": 3.206646659708765e-05, "loss": 0.1957, "step": 1726 }, { "epoch": 2.7456279809220985, "grad_norm": 1.8338706312751636, "learning_rate": 3.2070009569926546e-05, "loss": 0.2097, "step": 1727 }, { "epoch": 2.7472178060413355, "grad_norm": 1.039101427630297, "learning_rate": 3.207355522678005e-05, "loss": 0.1587, "step": 1728 }, { "epoch": 2.7488076311605725, "grad_norm": 2.128916809261633, "learning_rate": 3.207710356644809e-05, "loss": 0.2538, "step": 1729 }, { "epoch": 2.750397456279809, "grad_norm": 0.9104875019324283, "learning_rate": 3.20806545877297e-05, "loss": 0.1329, "step": 1730 }, { "epoch": 2.751987281399046, "grad_norm": 1.2507427845750951, "learning_rate": 3.208420828942297e-05, "loss": 0.1729, "step": 1731 }, { "epoch": 2.753577106518283, "grad_norm": 1.9635739107784789, "learning_rate": 3.2087764670325106e-05, "loss": 0.2143, "step": 1732 }, { "epoch": 2.75516693163752, "grad_norm": 1.0835176712051497, "learning_rate": 3.20913237292324e-05, "loss": 0.2116, "step": 1733 }, { "epoch": 2.756756756756757, "grad_norm": 2.5062558678885667, "learning_rate": 3.2094885464940236e-05, "loss": 0.2314, "step": 1734 }, { "epoch": 2.7583465818759936, "grad_norm": 1.0641432499915056, "learning_rate": 3.2098449876243093e-05, "loss": 0.1471, "step": 1735 }, { "epoch": 2.7599364069952306, "grad_norm": 1.27405110523491, "learning_rate": 3.2102016961934575e-05, "loss": 0.2091, "step": 1736 }, { "epoch": 2.7615262321144676, "grad_norm": 2.477749790896029, "learning_rate": 3.210558672080731e-05, "loss": 0.1802, "step": 1737 }, { "epoch": 2.763116057233704, "grad_norm": 0.7760285817555416, "learning_rate": 3.21091591516531e-05, "loss": 0.1575, "step": 1738 }, { "epoch": 2.764705882352941, "grad_norm": 0.9769955512861405, "learning_rate": 3.211273425326278e-05, "loss": 0.1832, "step": 1739 }, { "epoch": 2.766295707472178, "grad_norm": 1.2731070411363863, "learning_rate": 3.2116312024426325e-05, "loss": 0.1565, "step": 1740 }, { "epoch": 2.767885532591415, "grad_norm": 1.175387201377257, "learning_rate": 3.211989246393278e-05, "loss": 0.174, "step": 1741 }, { "epoch": 2.7694753577106517, "grad_norm": 2.2529024620001, "learning_rate": 3.21234755705703e-05, "loss": 0.1835, "step": 1742 }, { "epoch": 2.7710651828298887, "grad_norm": 1.176054148207597, "learning_rate": 3.212706134312614e-05, "loss": 0.3043, "step": 1743 }, { "epoch": 2.7726550079491257, "grad_norm": 2.104039901756064, "learning_rate": 3.213064978038662e-05, "loss": 0.2276, "step": 1744 }, { "epoch": 2.7742448330683622, "grad_norm": 0.7709259002556587, "learning_rate": 3.2134240881137215e-05, "loss": 0.1119, "step": 1745 }, { "epoch": 2.7758346581875992, "grad_norm": 1.5939038386254845, "learning_rate": 3.213783464416246e-05, "loss": 0.2864, "step": 1746 }, { "epoch": 2.7774244833068362, "grad_norm": 14.80254176290554, "learning_rate": 3.2141431068245975e-05, "loss": 12.0569, "step": 1747 }, { "epoch": 2.779014308426073, "grad_norm": 1.0609416169674386, "learning_rate": 3.214503015217053e-05, "loss": 0.1465, "step": 1748 }, { "epoch": 2.78060413354531, "grad_norm": 1.5027287422318525, "learning_rate": 3.2148631894717945e-05, "loss": 0.1965, "step": 1749 }, { "epoch": 2.7821939586645468, "grad_norm": 0.7307606870505935, "learning_rate": 3.215223629466917e-05, "loss": 0.1381, "step": 1750 }, { "epoch": 2.7837837837837838, "grad_norm": 1.714743212025667, "learning_rate": 3.2155843350804244e-05, "loss": 0.2377, "step": 1751 }, { "epoch": 2.7853736089030208, "grad_norm": 34.97373885193613, "learning_rate": 3.2159453061902314e-05, "loss": 22.6905, "step": 1752 }, { "epoch": 2.7869634340222573, "grad_norm": 1.7772171006352502, "learning_rate": 3.2163065426741604e-05, "loss": 0.1993, "step": 1753 }, { "epoch": 2.7885532591414943, "grad_norm": 1.9736706194550975, "learning_rate": 3.216668044409948e-05, "loss": 0.1461, "step": 1754 }, { "epoch": 2.7901430842607313, "grad_norm": 21.846105885135277, "learning_rate": 3.217029811275238e-05, "loss": 11.3971, "step": 1755 }, { "epoch": 2.7917329093799683, "grad_norm": 3.6866855585417335, "learning_rate": 3.217391843147587e-05, "loss": 0.2437, "step": 1756 }, { "epoch": 2.7933227344992053, "grad_norm": 1.8147222778311776, "learning_rate": 3.2177541399044574e-05, "loss": 0.198, "step": 1757 }, { "epoch": 2.794912559618442, "grad_norm": 2.335280478843124, "learning_rate": 3.218116701423227e-05, "loss": 0.155, "step": 1758 }, { "epoch": 2.796502384737679, "grad_norm": 2.1947574636326537, "learning_rate": 3.218479527581182e-05, "loss": 0.1701, "step": 1759 }, { "epoch": 2.798092209856916, "grad_norm": 1.1251785657993423, "learning_rate": 3.2188426182555166e-05, "loss": 0.1652, "step": 1760 }, { "epoch": 2.7996820349761524, "grad_norm": 2.699847610986689, "learning_rate": 3.2192059733233414e-05, "loss": 0.2165, "step": 1761 }, { "epoch": 2.8012718600953894, "grad_norm": 4.306110244044309, "learning_rate": 3.21956959266167e-05, "loss": 0.2491, "step": 1762 }, { "epoch": 2.8028616852146264, "grad_norm": 1.9944981895021419, "learning_rate": 3.2199334761474334e-05, "loss": 0.146, "step": 1763 }, { "epoch": 2.8044515103338634, "grad_norm": 3.53157054045948, "learning_rate": 3.220297623657469e-05, "loss": 0.3417, "step": 1764 }, { "epoch": 2.8060413354531004, "grad_norm": 1.6904104701396765, "learning_rate": 3.220662035068526e-05, "loss": 0.1897, "step": 1765 }, { "epoch": 2.807631160572337, "grad_norm": 2.111251656890466, "learning_rate": 3.221026710257264e-05, "loss": 0.2737, "step": 1766 }, { "epoch": 2.809220985691574, "grad_norm": 1.4197709979435877, "learning_rate": 3.221391649100255e-05, "loss": 0.1191, "step": 1767 }, { "epoch": 2.810810810810811, "grad_norm": 2.1680567222244953, "learning_rate": 3.2217568514739795e-05, "loss": 0.1694, "step": 1768 }, { "epoch": 2.8124006359300475, "grad_norm": 3.7832067252603165, "learning_rate": 3.222122317254829e-05, "loss": 0.1965, "step": 1769 }, { "epoch": 2.8139904610492845, "grad_norm": 2.000114122655783, "learning_rate": 3.222488046319107e-05, "loss": 0.2326, "step": 1770 }, { "epoch": 2.8155802861685215, "grad_norm": 1.933208812867363, "learning_rate": 3.222854038543029e-05, "loss": 0.2298, "step": 1771 }, { "epoch": 2.8171701112877585, "grad_norm": 0.6352925591220043, "learning_rate": 3.2232202938027174e-05, "loss": 0.1778, "step": 1772 }, { "epoch": 2.818759936406995, "grad_norm": 2.598852653632586, "learning_rate": 3.223586811974211e-05, "loss": 0.3305, "step": 1773 }, { "epoch": 2.820349761526232, "grad_norm": 2.1144912865314804, "learning_rate": 3.2239535929334535e-05, "loss": 0.1947, "step": 1774 }, { "epoch": 2.821939586645469, "grad_norm": 2.4545852865496, "learning_rate": 3.224320636556305e-05, "loss": 0.1631, "step": 1775 }, { "epoch": 2.8235294117647056, "grad_norm": 1.7121594275242755, "learning_rate": 3.2246879427185346e-05, "loss": 0.2234, "step": 1776 }, { "epoch": 2.8251192368839426, "grad_norm": 2.3896269986296375, "learning_rate": 3.225055511295821e-05, "loss": 0.2006, "step": 1777 }, { "epoch": 2.8267090620031796, "grad_norm": 1.8310456788085938, "learning_rate": 3.2254233421637555e-05, "loss": 0.1796, "step": 1778 }, { "epoch": 2.8282988871224166, "grad_norm": 1.7401567196392833, "learning_rate": 3.225791435197842e-05, "loss": 0.1902, "step": 1779 }, { "epoch": 2.8298887122416536, "grad_norm": 2.4531004194601413, "learning_rate": 3.226159790273493e-05, "loss": 0.1912, "step": 1780 }, { "epoch": 2.83147853736089, "grad_norm": 2.832812464289055, "learning_rate": 3.2265284072660364e-05, "loss": 0.2752, "step": 1781 }, { "epoch": 2.833068362480127, "grad_norm": 1.4158067284930005, "learning_rate": 3.226897286050705e-05, "loss": 0.1989, "step": 1782 }, { "epoch": 2.834658187599364, "grad_norm": 0.9594148311195276, "learning_rate": 3.2272664265026494e-05, "loss": 0.0984, "step": 1783 }, { "epoch": 2.8362480127186007, "grad_norm": 2.3192094925385525, "learning_rate": 3.2276358284969266e-05, "loss": 0.2225, "step": 1784 }, { "epoch": 2.8378378378378377, "grad_norm": 1.234217590285289, "learning_rate": 3.22800549190851e-05, "loss": 0.1561, "step": 1785 }, { "epoch": 2.8394276629570747, "grad_norm": 1.8018200270835174, "learning_rate": 3.2283754166122804e-05, "loss": 0.1796, "step": 1786 }, { "epoch": 2.8410174880763117, "grad_norm": 1.8056419445386878, "learning_rate": 3.228745602483032e-05, "loss": 0.2091, "step": 1787 }, { "epoch": 2.8426073131955487, "grad_norm": 1.9142642816844597, "learning_rate": 3.2291160493954694e-05, "loss": 0.1931, "step": 1788 }, { "epoch": 2.8441971383147853, "grad_norm": 1.1120128883972513, "learning_rate": 3.229486757224211e-05, "loss": 0.13, "step": 1789 }, { "epoch": 2.8457869634340223, "grad_norm": 1.2667038640437513, "learning_rate": 3.229857725843785e-05, "loss": 0.1931, "step": 1790 }, { "epoch": 2.8473767885532593, "grad_norm": 3.200711906687901, "learning_rate": 3.230228955128632e-05, "loss": 0.1997, "step": 1791 }, { "epoch": 2.848966613672496, "grad_norm": 1.6766016177150809, "learning_rate": 3.230600444953105e-05, "loss": 0.1777, "step": 1792 }, { "epoch": 2.850556438791733, "grad_norm": 1.228569454723208, "learning_rate": 3.230972195191467e-05, "loss": 0.1608, "step": 1793 }, { "epoch": 2.85214626391097, "grad_norm": 2.0462899069052356, "learning_rate": 3.231344205717895e-05, "loss": 0.2354, "step": 1794 }, { "epoch": 2.853736089030207, "grad_norm": 8.921863436997553, "learning_rate": 3.2317164764064773e-05, "loss": 6.8511, "step": 1795 }, { "epoch": 2.855325914149444, "grad_norm": 2.9048163074029616, "learning_rate": 3.232089007131212e-05, "loss": 0.1997, "step": 1796 }, { "epoch": 2.8569157392686804, "grad_norm": 1.1714846662154808, "learning_rate": 3.232461797766011e-05, "loss": 0.1418, "step": 1797 }, { "epoch": 2.8585055643879174, "grad_norm": 1.4078822473512569, "learning_rate": 3.2328348481847e-05, "loss": 0.2506, "step": 1798 }, { "epoch": 2.8600953895071544, "grad_norm": 1.8884875908647951, "learning_rate": 3.233208158261014e-05, "loss": 0.184, "step": 1799 }, { "epoch": 2.861685214626391, "grad_norm": 1.6350291491581712, "learning_rate": 3.233581727868601e-05, "loss": 0.1682, "step": 1800 }, { "epoch": 2.863275039745628, "grad_norm": 1.3381182017079336, "learning_rate": 3.2339555568810225e-05, "loss": 0.156, "step": 1801 }, { "epoch": 2.864864864864865, "grad_norm": 0.9944119796506846, "learning_rate": 3.234329645171748e-05, "loss": 0.1731, "step": 1802 }, { "epoch": 2.866454689984102, "grad_norm": 1.6876037753574757, "learning_rate": 3.234703992614165e-05, "loss": 0.2124, "step": 1803 }, { "epoch": 2.868044515103339, "grad_norm": 1.7791816064524173, "learning_rate": 3.235078599081568e-05, "loss": 0.1865, "step": 1804 }, { "epoch": 2.8696343402225755, "grad_norm": 1.606348302069339, "learning_rate": 3.235453464447169e-05, "loss": 0.158, "step": 1805 }, { "epoch": 2.8712241653418125, "grad_norm": 1.7817509217750394, "learning_rate": 3.235828588584088e-05, "loss": 0.1462, "step": 1806 }, { "epoch": 2.872813990461049, "grad_norm": 1.1831815548218376, "learning_rate": 3.2362039713653576e-05, "loss": 0.1494, "step": 1807 }, { "epoch": 2.874403815580286, "grad_norm": 1.3333737045906051, "learning_rate": 3.236579612663928e-05, "loss": 0.19, "step": 1808 }, { "epoch": 2.875993640699523, "grad_norm": 33.6998921425052, "learning_rate": 3.236955512352655e-05, "loss": 24.7118, "step": 1809 }, { "epoch": 2.87758346581876, "grad_norm": 1.6581471959725742, "learning_rate": 3.237331670304312e-05, "loss": 0.1311, "step": 1810 }, { "epoch": 2.879173290937997, "grad_norm": 2.0458793441517122, "learning_rate": 3.2377080863915816e-05, "loss": 0.1345, "step": 1811 }, { "epoch": 2.8807631160572336, "grad_norm": 1.8223951957318947, "learning_rate": 3.238084760487063e-05, "loss": 0.208, "step": 1812 }, { "epoch": 2.8823529411764706, "grad_norm": 3.002320276478578, "learning_rate": 3.2384616924632636e-05, "loss": 0.3845, "step": 1813 }, { "epoch": 2.8839427662957076, "grad_norm": 1.3747817159913949, "learning_rate": 3.238838882192606e-05, "loss": 0.1457, "step": 1814 }, { "epoch": 2.885532591414944, "grad_norm": 1.298999626216307, "learning_rate": 3.2392163295474254e-05, "loss": 0.1486, "step": 1815 }, { "epoch": 2.887122416534181, "grad_norm": 34.44194534170242, "learning_rate": 3.239594034399969e-05, "loss": 22.3721, "step": 1816 }, { "epoch": 2.888712241653418, "grad_norm": 2.346599093155901, "learning_rate": 3.239971996622398e-05, "loss": 0.2209, "step": 1817 }, { "epoch": 2.890302066772655, "grad_norm": 2.8246101554821577, "learning_rate": 3.2403502160867855e-05, "loss": 0.1743, "step": 1818 }, { "epoch": 2.891891891891892, "grad_norm": 2.1762712110482787, "learning_rate": 3.24072869266512e-05, "loss": 0.1789, "step": 1819 }, { "epoch": 2.8934817170111287, "grad_norm": 1.1837876290290728, "learning_rate": 3.241107426229296e-05, "loss": 0.1827, "step": 1820 }, { "epoch": 2.8950715421303657, "grad_norm": 2.092027378061255, "learning_rate": 3.241486416651131e-05, "loss": 0.2095, "step": 1821 }, { "epoch": 2.8966613672496027, "grad_norm": 2.428722664628801, "learning_rate": 3.2418656638023476e-05, "loss": 0.2139, "step": 1822 }, { "epoch": 2.898251192368839, "grad_norm": 2.312577280796555, "learning_rate": 3.242245167554586e-05, "loss": 0.1966, "step": 1823 }, { "epoch": 2.899841017488076, "grad_norm": 1.5064922527069833, "learning_rate": 3.2426249277793954e-05, "loss": 0.1918, "step": 1824 }, { "epoch": 2.901430842607313, "grad_norm": 2.1226484016189935, "learning_rate": 3.243004944348244e-05, "loss": 0.1931, "step": 1825 }, { "epoch": 2.90302066772655, "grad_norm": 1.855599530271734, "learning_rate": 3.243385217132507e-05, "loss": 0.1476, "step": 1826 }, { "epoch": 2.904610492845787, "grad_norm": 3.5829060471023113, "learning_rate": 3.243765746003478e-05, "loss": 0.2053, "step": 1827 }, { "epoch": 2.9062003179650238, "grad_norm": 2.505957667632483, "learning_rate": 3.244146530832361e-05, "loss": 0.1501, "step": 1828 }, { "epoch": 2.9077901430842608, "grad_norm": 2.1146517595911694, "learning_rate": 3.2445275714902723e-05, "loss": 0.1818, "step": 1829 }, { "epoch": 2.9093799682034978, "grad_norm": 11.12380780935816, "learning_rate": 3.2449088678482465e-05, "loss": 0.4669, "step": 1830 }, { "epoch": 2.9109697933227343, "grad_norm": 26.12362543291316, "learning_rate": 3.245290419777228e-05, "loss": 15.4261, "step": 1831 }, { "epoch": 2.9125596184419713, "grad_norm": 3.780108650183511, "learning_rate": 3.245672227148074e-05, "loss": 0.1549, "step": 1832 }, { "epoch": 2.9141494435612083, "grad_norm": 36.54774952299517, "learning_rate": 3.246054289831557e-05, "loss": 21.6537, "step": 1833 }, { "epoch": 2.9157392686804453, "grad_norm": 5.580079307167936, "learning_rate": 3.246436607698361e-05, "loss": 0.3568, "step": 1834 }, { "epoch": 2.9173290937996823, "grad_norm": 2.4919037854233874, "learning_rate": 3.24681918061909e-05, "loss": 0.2161, "step": 1835 }, { "epoch": 2.918918918918919, "grad_norm": 3.1653913542509926, "learning_rate": 3.2472020084642517e-05, "loss": 0.1501, "step": 1836 }, { "epoch": 2.920508744038156, "grad_norm": 1.7680860458764274, "learning_rate": 3.247585091104276e-05, "loss": 0.2124, "step": 1837 }, { "epoch": 2.9220985691573924, "grad_norm": 2.1511797601739184, "learning_rate": 3.2479684284095016e-05, "loss": 0.1816, "step": 1838 }, { "epoch": 2.9236883942766294, "grad_norm": 3.940517095945787, "learning_rate": 3.248352020250184e-05, "loss": 0.207, "step": 1839 }, { "epoch": 2.9252782193958664, "grad_norm": 2.2781744442566954, "learning_rate": 3.248735866496489e-05, "loss": 0.19, "step": 1840 }, { "epoch": 2.9268680445151034, "grad_norm": 2.4352795233961544, "learning_rate": 3.249119967018501e-05, "loss": 0.2036, "step": 1841 }, { "epoch": 2.9284578696343404, "grad_norm": 1.590662277056064, "learning_rate": 3.249504321686215e-05, "loss": 0.16, "step": 1842 }, { "epoch": 2.930047694753577, "grad_norm": 4.0913943175004714, "learning_rate": 3.249888930369541e-05, "loss": 0.2709, "step": 1843 }, { "epoch": 2.931637519872814, "grad_norm": 2.932441191850693, "learning_rate": 3.250273792938302e-05, "loss": 0.28, "step": 1844 }, { "epoch": 2.933227344992051, "grad_norm": 2.1297286474151433, "learning_rate": 3.250658909262237e-05, "loss": 0.2142, "step": 1845 }, { "epoch": 2.9348171701112875, "grad_norm": 3.326055317374116, "learning_rate": 3.2510442792109984e-05, "loss": 0.2444, "step": 1846 }, { "epoch": 2.9364069952305245, "grad_norm": 1.6009156602104764, "learning_rate": 3.2514299026541505e-05, "loss": 0.1215, "step": 1847 }, { "epoch": 2.9379968203497615, "grad_norm": 2.966842325325179, "learning_rate": 3.251815779461175e-05, "loss": 0.1817, "step": 1848 }, { "epoch": 2.9395866454689985, "grad_norm": 2.262328560679966, "learning_rate": 3.252201909501468e-05, "loss": 0.2007, "step": 1849 }, { "epoch": 2.9411764705882355, "grad_norm": 2.4362391296064163, "learning_rate": 3.252588292644337e-05, "loss": 0.238, "step": 1850 }, { "epoch": 2.942766295707472, "grad_norm": 5.024061021007884, "learning_rate": 3.2529749287590036e-05, "loss": 0.3082, "step": 1851 }, { "epoch": 2.944356120826709, "grad_norm": 3.8806305999463158, "learning_rate": 3.2533618177146084e-05, "loss": 0.2192, "step": 1852 }, { "epoch": 2.945945945945946, "grad_norm": 5.6499319890107484, "learning_rate": 3.253748959380203e-05, "loss": 0.3267, "step": 1853 }, { "epoch": 2.9475357710651826, "grad_norm": 1.7469900567131358, "learning_rate": 3.254136353624751e-05, "loss": 0.2042, "step": 1854 }, { "epoch": 2.9491255961844196, "grad_norm": 2.2586329249031385, "learning_rate": 3.2545240003171384e-05, "loss": 0.2078, "step": 1855 }, { "epoch": 2.9507154213036566, "grad_norm": 2.993807115656151, "learning_rate": 3.2549118993261557e-05, "loss": 0.212, "step": 1856 }, { "epoch": 2.9523052464228936, "grad_norm": 3.5747388179410136, "learning_rate": 3.2553000505205176e-05, "loss": 0.295, "step": 1857 }, { "epoch": 2.9538950715421306, "grad_norm": 3.0406701446229776, "learning_rate": 3.255688453768846e-05, "loss": 0.241, "step": 1858 }, { "epoch": 2.955484896661367, "grad_norm": 2.2974389292643727, "learning_rate": 3.2560771089396815e-05, "loss": 0.2194, "step": 1859 }, { "epoch": 2.957074721780604, "grad_norm": 2.173118821995272, "learning_rate": 3.256466015901478e-05, "loss": 0.1373, "step": 1860 }, { "epoch": 2.958664546899841, "grad_norm": 2.993814847201125, "learning_rate": 3.2568551745226056e-05, "loss": 0.1772, "step": 1861 }, { "epoch": 2.9602543720190777, "grad_norm": 1.6363405578787704, "learning_rate": 3.257244584671348e-05, "loss": 0.2132, "step": 1862 }, { "epoch": 2.9618441971383147, "grad_norm": 1.0565829447716102, "learning_rate": 3.257634246215903e-05, "loss": 0.1624, "step": 1863 }, { "epoch": 2.9634340222575517, "grad_norm": 1.7515221152713383, "learning_rate": 3.258024159024383e-05, "loss": 0.1915, "step": 1864 }, { "epoch": 2.9650238473767887, "grad_norm": 77.3194986407281, "learning_rate": 3.2584143229648206e-05, "loss": 25.2183, "step": 1865 }, { "epoch": 2.9666136724960257, "grad_norm": 2.2122275501316495, "learning_rate": 3.258804737905156e-05, "loss": 0.2118, "step": 1866 }, { "epoch": 2.9682034976152623, "grad_norm": 1.4136148223961897, "learning_rate": 3.25919540371325e-05, "loss": 0.2109, "step": 1867 }, { "epoch": 2.9697933227344993, "grad_norm": 2.988176151334641, "learning_rate": 3.2595863202568745e-05, "loss": 0.242, "step": 1868 }, { "epoch": 2.9713831478537363, "grad_norm": 2.0233876787700744, "learning_rate": 3.25997748740372e-05, "loss": 0.2042, "step": 1869 }, { "epoch": 2.972972972972973, "grad_norm": 1.0966639433763725, "learning_rate": 3.26036890502139e-05, "loss": 0.1939, "step": 1870 }, { "epoch": 2.97456279809221, "grad_norm": 2.1742903420702597, "learning_rate": 3.2607605729774046e-05, "loss": 0.1475, "step": 1871 }, { "epoch": 2.976152623211447, "grad_norm": 1.5714201251870261, "learning_rate": 3.2611524911391964e-05, "loss": 0.1983, "step": 1872 }, { "epoch": 2.977742448330684, "grad_norm": 1.281674395154899, "learning_rate": 3.261544659374116e-05, "loss": 0.1106, "step": 1873 }, { "epoch": 2.9793322734499204, "grad_norm": 2.273856068176308, "learning_rate": 3.2619370775494306e-05, "loss": 0.1639, "step": 1874 }, { "epoch": 2.9809220985691574, "grad_norm": 1.3458060703066936, "learning_rate": 3.2623297455323186e-05, "loss": 0.1455, "step": 1875 }, { "epoch": 2.9825119236883944, "grad_norm": 2.459342440348716, "learning_rate": 3.2627226631898765e-05, "loss": 0.195, "step": 1876 }, { "epoch": 2.984101748807631, "grad_norm": 1.679834309306734, "learning_rate": 3.263115830389117e-05, "loss": 0.1801, "step": 1877 }, { "epoch": 2.985691573926868, "grad_norm": 2.3654151029644366, "learning_rate": 3.263509246996965e-05, "loss": 0.1819, "step": 1878 }, { "epoch": 2.987281399046105, "grad_norm": 1.4450223452783846, "learning_rate": 3.263902912880266e-05, "loss": 0.1797, "step": 1879 }, { "epoch": 2.988871224165342, "grad_norm": 1.2896168223404074, "learning_rate": 3.264296827905776e-05, "loss": 0.1561, "step": 1880 }, { "epoch": 2.990461049284579, "grad_norm": 1.1028702388559217, "learning_rate": 3.2646909919401706e-05, "loss": 0.1731, "step": 1881 }, { "epoch": 2.9920508744038155, "grad_norm": 3.222497754783046, "learning_rate": 3.2650854048500405e-05, "loss": 0.2544, "step": 1882 }, { "epoch": 2.9936406995230525, "grad_norm": 1.314969257056666, "learning_rate": 3.265480066501889e-05, "loss": 0.1414, "step": 1883 }, { "epoch": 2.9952305246422894, "grad_norm": 1.2013236612060652, "learning_rate": 3.265874976762138e-05, "loss": 0.2029, "step": 1884 }, { "epoch": 2.996820349761526, "grad_norm": 1.218399788036494, "learning_rate": 3.266270135497123e-05, "loss": 0.1361, "step": 1885 }, { "epoch": 2.998410174880763, "grad_norm": 1.2500087718695136, "learning_rate": 3.266665542573101e-05, "loss": 0.1551, "step": 1886 }, { "epoch": 3.0, "grad_norm": 0.9800135750224278, "learning_rate": 3.267061197856239e-05, "loss": 0.1434, "step": 1887 }, { "epoch": 3.001589825119237, "grad_norm": 1.1281258284629547, "learning_rate": 3.2674571012126206e-05, "loss": 0.2176, "step": 1888 }, { "epoch": 3.0031796502384736, "grad_norm": 0.8377961012707515, "learning_rate": 3.26785325250825e-05, "loss": 0.2095, "step": 1889 }, { "epoch": 3.0047694753577106, "grad_norm": 1.3870899681171056, "learning_rate": 3.268249651609041e-05, "loss": 0.2162, "step": 1890 }, { "epoch": 3.0063593004769475, "grad_norm": 1.0624178646401679, "learning_rate": 3.26864629838083e-05, "loss": 0.1484, "step": 1891 }, { "epoch": 3.0079491255961845, "grad_norm": 1.0696249564716236, "learning_rate": 3.269043192689364e-05, "loss": 0.1469, "step": 1892 }, { "epoch": 3.009538950715421, "grad_norm": 0.7928740121309226, "learning_rate": 3.269440334400309e-05, "loss": 0.2311, "step": 1893 }, { "epoch": 3.011128775834658, "grad_norm": 1.8902743625030656, "learning_rate": 3.269837723379248e-05, "loss": 0.1727, "step": 1894 }, { "epoch": 3.012718600953895, "grad_norm": 0.9340447720099906, "learning_rate": 3.270235359491678e-05, "loss": 0.1757, "step": 1895 }, { "epoch": 3.014308426073132, "grad_norm": 1.5324306122089544, "learning_rate": 3.270633242603015e-05, "loss": 0.1661, "step": 1896 }, { "epoch": 3.0158982511923687, "grad_norm": 1.202390005942924, "learning_rate": 3.2710313725785886e-05, "loss": 0.2412, "step": 1897 }, { "epoch": 3.0174880763116056, "grad_norm": 1.2100063713448455, "learning_rate": 3.271429749283647e-05, "loss": 0.1556, "step": 1898 }, { "epoch": 3.0190779014308426, "grad_norm": 2.1708437656270934, "learning_rate": 3.271828372583354e-05, "loss": 0.1829, "step": 1899 }, { "epoch": 3.0206677265500796, "grad_norm": 0.7740077062503045, "learning_rate": 3.272227242342789e-05, "loss": 0.1491, "step": 1900 }, { "epoch": 3.022257551669316, "grad_norm": 2.279173217250167, "learning_rate": 3.2726263584269514e-05, "loss": 0.2551, "step": 1901 }, { "epoch": 3.023847376788553, "grad_norm": 2.3654667747412863, "learning_rate": 3.2730257207007523e-05, "loss": 0.2887, "step": 1902 }, { "epoch": 3.02543720190779, "grad_norm": 2.313704747315006, "learning_rate": 3.273425329029024e-05, "loss": 0.1808, "step": 1903 }, { "epoch": 3.027027027027027, "grad_norm": 1.6213307420266159, "learning_rate": 3.273825183276513e-05, "loss": 0.1549, "step": 1904 }, { "epoch": 3.0286168521462637, "grad_norm": 2.011158551051491, "learning_rate": 3.274225283307881e-05, "loss": 0.2066, "step": 1905 }, { "epoch": 3.0302066772655007, "grad_norm": 2.027395998585125, "learning_rate": 3.2746256289877126e-05, "loss": 0.2191, "step": 1906 }, { "epoch": 3.0317965023847377, "grad_norm": 1.692920427575308, "learning_rate": 3.275026220180502e-05, "loss": 0.1531, "step": 1907 }, { "epoch": 3.0333863275039747, "grad_norm": 2.029454715546095, "learning_rate": 3.275427056750665e-05, "loss": 0.2158, "step": 1908 }, { "epoch": 3.0349761526232113, "grad_norm": 1.7330464379784225, "learning_rate": 3.2758281385625325e-05, "loss": 0.188, "step": 1909 }, { "epoch": 3.0365659777424483, "grad_norm": 1.893798129977745, "learning_rate": 3.2762294654803536e-05, "loss": 0.2665, "step": 1910 }, { "epoch": 3.0381558028616853, "grad_norm": 1.0591379262273728, "learning_rate": 3.2766310373682915e-05, "loss": 0.1771, "step": 1911 }, { "epoch": 3.0397456279809223, "grad_norm": 1.0126405391549058, "learning_rate": 3.277032854090433e-05, "loss": 0.186, "step": 1912 }, { "epoch": 3.041335453100159, "grad_norm": 1.1712462392496072, "learning_rate": 3.277434915510772e-05, "loss": 0.1999, "step": 1913 }, { "epoch": 3.042925278219396, "grad_norm": 0.8382379370089657, "learning_rate": 3.277837221493229e-05, "loss": 0.1686, "step": 1914 }, { "epoch": 3.044515103338633, "grad_norm": 1.2011902536329309, "learning_rate": 3.278239771901638e-05, "loss": 0.2315, "step": 1915 }, { "epoch": 3.04610492845787, "grad_norm": 2.661871345898783, "learning_rate": 3.278642566599749e-05, "loss": 0.1994, "step": 1916 }, { "epoch": 3.0476947535771064, "grad_norm": 1.324634890354758, "learning_rate": 3.27904560545123e-05, "loss": 0.1362, "step": 1917 }, { "epoch": 3.0492845786963434, "grad_norm": 1.842924550395593, "learning_rate": 3.27944888831967e-05, "loss": 0.1642, "step": 1918 }, { "epoch": 3.0508744038155804, "grad_norm": 144.16157827376344, "learning_rate": 3.279852415068569e-05, "loss": 29.0972, "step": 1919 }, { "epoch": 3.0524642289348174, "grad_norm": 1.9624553295275307, "learning_rate": 3.280256185561349e-05, "loss": 0.1752, "step": 1920 }, { "epoch": 3.054054054054054, "grad_norm": 1.3747680421603894, "learning_rate": 3.280660199661349e-05, "loss": 0.183, "step": 1921 }, { "epoch": 3.055643879173291, "grad_norm": 2.4370877991044524, "learning_rate": 3.2810644572318235e-05, "loss": 0.2116, "step": 1922 }, { "epoch": 3.057233704292528, "grad_norm": 2.2911024601773864, "learning_rate": 3.281468958135948e-05, "loss": 0.1644, "step": 1923 }, { "epoch": 3.0588235294117645, "grad_norm": 1.7023827045778943, "learning_rate": 3.281873702236811e-05, "loss": 0.153, "step": 1924 }, { "epoch": 3.0604133545310015, "grad_norm": 2.209047208715941, "learning_rate": 3.282278689397423e-05, "loss": 0.2076, "step": 1925 }, { "epoch": 3.0620031796502385, "grad_norm": 2.3500151430837874, "learning_rate": 3.282683919480711e-05, "loss": 0.1726, "step": 1926 }, { "epoch": 3.0635930047694755, "grad_norm": 2.0006679935322467, "learning_rate": 3.2830893923495166e-05, "loss": 0.1594, "step": 1927 }, { "epoch": 3.065182829888712, "grad_norm": 1.5272442036748977, "learning_rate": 3.2834951078666056e-05, "loss": 0.168, "step": 1928 }, { "epoch": 3.066772655007949, "grad_norm": 0.9167213066631827, "learning_rate": 3.283901065894655e-05, "loss": 0.1116, "step": 1929 }, { "epoch": 3.068362480127186, "grad_norm": 39.3013823353338, "learning_rate": 3.2843072662962646e-05, "loss": 25.4797, "step": 1930 }, { "epoch": 3.069952305246423, "grad_norm": 1.3991004367595645, "learning_rate": 3.284713708933948e-05, "loss": 0.2264, "step": 1931 }, { "epoch": 3.0715421303656596, "grad_norm": 1.6586053197123771, "learning_rate": 3.285120393670142e-05, "loss": 0.156, "step": 1932 }, { "epoch": 3.0731319554848966, "grad_norm": 1.195665037480131, "learning_rate": 3.285527320367196e-05, "loss": 0.1298, "step": 1933 }, { "epoch": 3.0747217806041336, "grad_norm": 1.1040711858605634, "learning_rate": 3.285934488887382e-05, "loss": 0.1526, "step": 1934 }, { "epoch": 3.0763116057233706, "grad_norm": 1.782315852979005, "learning_rate": 3.286341899092887e-05, "loss": 0.1779, "step": 1935 }, { "epoch": 3.077901430842607, "grad_norm": 1.6819257787270387, "learning_rate": 3.286749550845818e-05, "loss": 0.2027, "step": 1936 }, { "epoch": 3.079491255961844, "grad_norm": 1.5149116774749574, "learning_rate": 3.287157444008199e-05, "loss": 0.176, "step": 1937 }, { "epoch": 3.081081081081081, "grad_norm": 1.4941675057717916, "learning_rate": 3.287565578441974e-05, "loss": 0.1745, "step": 1938 }, { "epoch": 3.082670906200318, "grad_norm": 1.0002696983956485, "learning_rate": 3.287973954009003e-05, "loss": 0.1972, "step": 1939 }, { "epoch": 3.0842607313195547, "grad_norm": 0.9454272741350396, "learning_rate": 3.288382570571067e-05, "loss": 0.1629, "step": 1940 }, { "epoch": 3.0858505564387917, "grad_norm": 1.1817530758763106, "learning_rate": 3.288791427989863e-05, "loss": 0.1738, "step": 1941 }, { "epoch": 3.0874403815580287, "grad_norm": 1.2163507639562412, "learning_rate": 3.2892005261270074e-05, "loss": 0.1684, "step": 1942 }, { "epoch": 3.0890302066772657, "grad_norm": 1.10392308378242, "learning_rate": 3.289609864844037e-05, "loss": 0.1927, "step": 1943 }, { "epoch": 3.0906200317965022, "grad_norm": 1.263396106097779, "learning_rate": 3.290019444002403e-05, "loss": 0.1577, "step": 1944 }, { "epoch": 3.0922098569157392, "grad_norm": 0.9845783198187106, "learning_rate": 3.2904292634634795e-05, "loss": 0.1886, "step": 1945 }, { "epoch": 3.0937996820349762, "grad_norm": 1.0762094068529975, "learning_rate": 3.290839323088556e-05, "loss": 0.1507, "step": 1946 }, { "epoch": 3.0953895071542132, "grad_norm": 1.3904239721622995, "learning_rate": 3.2912496227388444e-05, "loss": 0.1701, "step": 1947 }, { "epoch": 3.09697933227345, "grad_norm": 1.398537309104411, "learning_rate": 3.291660162275471e-05, "loss": 0.1722, "step": 1948 }, { "epoch": 3.098569157392687, "grad_norm": 1.344297316706831, "learning_rate": 3.292070941559484e-05, "loss": 0.1552, "step": 1949 }, { "epoch": 3.100158982511924, "grad_norm": 0.841262345763392, "learning_rate": 3.292481960451849e-05, "loss": 0.1535, "step": 1950 }, { "epoch": 3.101748807631161, "grad_norm": 0.7620259295472986, "learning_rate": 3.2928932188134525e-05, "loss": 0.1569, "step": 1951 }, { "epoch": 3.1033386327503973, "grad_norm": 1.417824005812314, "learning_rate": 3.293304716505096e-05, "loss": 0.2298, "step": 1952 }, { "epoch": 3.1049284578696343, "grad_norm": 0.8344711142697316, "learning_rate": 3.293716453387505e-05, "loss": 0.1687, "step": 1953 }, { "epoch": 3.1065182829888713, "grad_norm": 1.8677102889037265, "learning_rate": 3.2941284293213186e-05, "loss": 0.2357, "step": 1954 }, { "epoch": 3.108108108108108, "grad_norm": 1.3798876719295738, "learning_rate": 3.2945406441671e-05, "loss": 0.1447, "step": 1955 }, { "epoch": 3.109697933227345, "grad_norm": 3.6113097669741996, "learning_rate": 3.29495309778533e-05, "loss": 0.1674, "step": 1956 }, { "epoch": 3.111287758346582, "grad_norm": 1.4186979959943251, "learning_rate": 3.295365790036406e-05, "loss": 0.2354, "step": 1957 }, { "epoch": 3.112877583465819, "grad_norm": 1.2580283664386984, "learning_rate": 3.2957787207806465e-05, "loss": 0.1565, "step": 1958 }, { "epoch": 3.1144674085850554, "grad_norm": 37.18545167642314, "learning_rate": 3.29619188987829e-05, "loss": 22.002, "step": 1959 }, { "epoch": 3.1160572337042924, "grad_norm": 1.1559754608869024, "learning_rate": 3.296605297189496e-05, "loss": 0.1891, "step": 1960 }, { "epoch": 3.1176470588235294, "grad_norm": 1.2714999828438585, "learning_rate": 3.297018942574338e-05, "loss": 0.2014, "step": 1961 }, { "epoch": 3.1192368839427664, "grad_norm": 1.6043778443584225, "learning_rate": 3.2974328258928137e-05, "loss": 0.1208, "step": 1962 }, { "epoch": 3.120826709062003, "grad_norm": 1.651638929725782, "learning_rate": 3.2978469470048376e-05, "loss": 0.195, "step": 1963 }, { "epoch": 3.12241653418124, "grad_norm": 1.7351014190374314, "learning_rate": 3.2982613057702446e-05, "loss": 0.1747, "step": 1964 }, { "epoch": 3.124006359300477, "grad_norm": 0.6126506364431229, "learning_rate": 3.2986759020487906e-05, "loss": 0.0996, "step": 1965 }, { "epoch": 3.125596184419714, "grad_norm": 1.609898128216873, "learning_rate": 3.299090735700149e-05, "loss": 0.175, "step": 1966 }, { "epoch": 3.1271860095389505, "grad_norm": 1.6780783515860238, "learning_rate": 3.2995058065839136e-05, "loss": 0.1567, "step": 1967 }, { "epoch": 3.1287758346581875, "grad_norm": 1.2847573814103512, "learning_rate": 3.2999211145595976e-05, "loss": 0.1419, "step": 1968 }, { "epoch": 3.1303656597774245, "grad_norm": 0.7533568806556772, "learning_rate": 3.300336659486635e-05, "loss": 0.105, "step": 1969 }, { "epoch": 3.1319554848966615, "grad_norm": 0.9644498780308932, "learning_rate": 3.300752441224378e-05, "loss": 0.2062, "step": 1970 }, { "epoch": 3.133545310015898, "grad_norm": 0.8185427540073015, "learning_rate": 3.3011684596321004e-05, "loss": 0.1449, "step": 1971 }, { "epoch": 3.135135135135135, "grad_norm": 1.0768584128413883, "learning_rate": 3.3015847145689936e-05, "loss": 0.1707, "step": 1972 }, { "epoch": 3.136724960254372, "grad_norm": 1.1263664526600041, "learning_rate": 3.302001205894173e-05, "loss": 0.1974, "step": 1973 }, { "epoch": 3.138314785373609, "grad_norm": 0.9768191162392208, "learning_rate": 3.302417933466669e-05, "loss": 0.2151, "step": 1974 }, { "epoch": 3.1399046104928456, "grad_norm": 1.3755838816110673, "learning_rate": 3.302834897145436e-05, "loss": 0.1743, "step": 1975 }, { "epoch": 3.1414944356120826, "grad_norm": 0.6798102871890868, "learning_rate": 3.303252096789345e-05, "loss": 0.1561, "step": 1976 }, { "epoch": 3.1430842607313196, "grad_norm": 0.781133820601312, "learning_rate": 3.3036695322571906e-05, "loss": 0.1196, "step": 1977 }, { "epoch": 3.1446740858505566, "grad_norm": 2.072769586559616, "learning_rate": 3.3040872034076855e-05, "loss": 0.2347, "step": 1978 }, { "epoch": 3.146263910969793, "grad_norm": 1.6849357484997305, "learning_rate": 3.3045051100994644e-05, "loss": 0.1685, "step": 1979 }, { "epoch": 3.14785373608903, "grad_norm": 2.692333163987331, "learning_rate": 3.3049232521910785e-05, "loss": 0.2578, "step": 1980 }, { "epoch": 3.149443561208267, "grad_norm": 0.9627159800236059, "learning_rate": 3.3053416295410026e-05, "loss": 0.0988, "step": 1981 }, { "epoch": 3.151033386327504, "grad_norm": 0.9776701422595108, "learning_rate": 3.3057602420076326e-05, "loss": 0.1665, "step": 1982 }, { "epoch": 3.1526232114467407, "grad_norm": 2.0385540158621125, "learning_rate": 3.306179089449282e-05, "loss": 0.1802, "step": 1983 }, { "epoch": 3.1542130365659777, "grad_norm": 1.4048896694502067, "learning_rate": 3.306598171724188e-05, "loss": 0.2071, "step": 1984 }, { "epoch": 3.1558028616852147, "grad_norm": 1.501249936428329, "learning_rate": 3.3070174886905034e-05, "loss": 0.2386, "step": 1985 }, { "epoch": 3.1573926868044513, "grad_norm": 1.587898068718713, "learning_rate": 3.3074370402063054e-05, "loss": 0.1827, "step": 1986 }, { "epoch": 3.1589825119236883, "grad_norm": 1.0132545643530786, "learning_rate": 3.307856826129593e-05, "loss": 0.1432, "step": 1987 }, { "epoch": 3.1605723370429253, "grad_norm": 1.1635612617034106, "learning_rate": 3.308276846318283e-05, "loss": 0.1548, "step": 1988 }, { "epoch": 3.1621621621621623, "grad_norm": 1.8734635262946138, "learning_rate": 3.3086971006302126e-05, "loss": 0.1398, "step": 1989 }, { "epoch": 3.1637519872813993, "grad_norm": 0.9747597540983771, "learning_rate": 3.309117588923142e-05, "loss": 0.136, "step": 1990 }, { "epoch": 3.165341812400636, "grad_norm": 32.13137658526767, "learning_rate": 3.30953831105475e-05, "loss": 20.4109, "step": 1991 }, { "epoch": 3.166931637519873, "grad_norm": 1.4277880209360654, "learning_rate": 3.3099592668826386e-05, "loss": 0.125, "step": 1992 }, { "epoch": 3.16852146263911, "grad_norm": 1.524782026029901, "learning_rate": 3.3103804562643306e-05, "loss": 0.1585, "step": 1993 }, { "epoch": 3.1701112877583464, "grad_norm": 0.861977347980029, "learning_rate": 3.310801879057266e-05, "loss": 0.1378, "step": 1994 }, { "epoch": 3.1717011128775834, "grad_norm": 1.0077426223162675, "learning_rate": 3.3112235351188087e-05, "loss": 0.1943, "step": 1995 }, { "epoch": 3.1732909379968204, "grad_norm": 2.2943093068787683, "learning_rate": 3.311645424306246e-05, "loss": 0.1592, "step": 1996 }, { "epoch": 3.1748807631160574, "grad_norm": 1.6875214584713865, "learning_rate": 3.312067546476781e-05, "loss": 0.1556, "step": 1997 }, { "epoch": 3.176470588235294, "grad_norm": 1.1806329465463816, "learning_rate": 3.3124899014875426e-05, "loss": 0.1388, "step": 1998 }, { "epoch": 3.178060413354531, "grad_norm": 56.409258678504905, "learning_rate": 3.312912489195577e-05, "loss": 33.0671, "step": 1999 }, { "epoch": 3.179650238473768, "grad_norm": 2.141471304691906, "learning_rate": 3.313335309457854e-05, "loss": 0.1528, "step": 2000 }, { "epoch": 3.181240063593005, "grad_norm": 1.6639295605976259, "learning_rate": 3.313758362131266e-05, "loss": 0.1539, "step": 2001 }, { "epoch": 3.1828298887122415, "grad_norm": 2.2856800720902926, "learning_rate": 3.314181647072623e-05, "loss": 0.255, "step": 2002 }, { "epoch": 3.1844197138314785, "grad_norm": 1.3118614331928724, "learning_rate": 3.3146051641386606e-05, "loss": 0.1373, "step": 2003 }, { "epoch": 3.1860095389507155, "grad_norm": 1.2733814165098716, "learning_rate": 3.3150289131860306e-05, "loss": 0.156, "step": 2004 }, { "epoch": 3.1875993640699525, "grad_norm": 1.508294259693326, "learning_rate": 3.315452894071311e-05, "loss": 0.2343, "step": 2005 }, { "epoch": 3.189189189189189, "grad_norm": 2.162535711209585, "learning_rate": 3.315877106651e-05, "loss": 0.1781, "step": 2006 }, { "epoch": 3.190779014308426, "grad_norm": 2.3836983581709577, "learning_rate": 3.316301550781516e-05, "loss": 0.2031, "step": 2007 }, { "epoch": 3.192368839427663, "grad_norm": 1.5693348330991597, "learning_rate": 3.316726226319201e-05, "loss": 0.1899, "step": 2008 }, { "epoch": 3.1939586645469, "grad_norm": 2.046961823376242, "learning_rate": 3.317151133120317e-05, "loss": 0.1282, "step": 2009 }, { "epoch": 3.1955484896661366, "grad_norm": 4.313264951099641, "learning_rate": 3.317576271041049e-05, "loss": 0.2294, "step": 2010 }, { "epoch": 3.1971383147853736, "grad_norm": 2.6025331577579225, "learning_rate": 3.318001639937501e-05, "loss": 0.1757, "step": 2011 }, { "epoch": 3.1987281399046106, "grad_norm": 1.8316778137434928, "learning_rate": 3.318427239665705e-05, "loss": 0.2137, "step": 2012 }, { "epoch": 3.2003179650238476, "grad_norm": 4.0184933933952225, "learning_rate": 3.318853070081608e-05, "loss": 0.2147, "step": 2013 }, { "epoch": 3.201907790143084, "grad_norm": 3.180541743830729, "learning_rate": 3.3192791310410816e-05, "loss": 0.2239, "step": 2014 }, { "epoch": 3.203497615262321, "grad_norm": 3.547151935416979, "learning_rate": 3.319705422399923e-05, "loss": 0.1764, "step": 2015 }, { "epoch": 3.205087440381558, "grad_norm": 2.708890069858936, "learning_rate": 3.3201319440138433e-05, "loss": 0.1673, "step": 2016 }, { "epoch": 3.2066772655007947, "grad_norm": 2.810242563409344, "learning_rate": 3.320558695738483e-05, "loss": 0.1769, "step": 2017 }, { "epoch": 3.2082670906200317, "grad_norm": 4.5665891109311305, "learning_rate": 3.320985677429403e-05, "loss": 0.2431, "step": 2018 }, { "epoch": 3.2098569157392687, "grad_norm": 1.6811383945656806, "learning_rate": 3.3214128889420835e-05, "loss": 0.1495, "step": 2019 }, { "epoch": 3.2114467408585057, "grad_norm": 1.5516439642875695, "learning_rate": 3.3218403301319294e-05, "loss": 0.1747, "step": 2020 }, { "epoch": 3.2130365659777427, "grad_norm": 2.1251807156694613, "learning_rate": 3.322268000854268e-05, "loss": 0.1572, "step": 2021 }, { "epoch": 3.2146263910969792, "grad_norm": 1.5052606934218604, "learning_rate": 3.322695900964348e-05, "loss": 0.1531, "step": 2022 }, { "epoch": 3.2162162162162162, "grad_norm": 2.2431616919677904, "learning_rate": 3.32312403031734e-05, "loss": 0.1463, "step": 2023 }, { "epoch": 3.2178060413354532, "grad_norm": 1.298867478561596, "learning_rate": 3.323552388768338e-05, "loss": 0.1616, "step": 2024 }, { "epoch": 3.21939586645469, "grad_norm": 1.5018859916598755, "learning_rate": 3.323980976172358e-05, "loss": 0.1349, "step": 2025 }, { "epoch": 3.220985691573927, "grad_norm": 2.982556851255493, "learning_rate": 3.32440979238434e-05, "loss": 0.1618, "step": 2026 }, { "epoch": 3.2225755166931638, "grad_norm": 1.7714178314345896, "learning_rate": 3.3248388372591435e-05, "loss": 0.1617, "step": 2027 }, { "epoch": 3.2241653418124008, "grad_norm": 2.2655010972965943, "learning_rate": 3.3252681106515534e-05, "loss": 0.1871, "step": 2028 }, { "epoch": 3.2257551669316373, "grad_norm": 2.7541267080238323, "learning_rate": 3.325697612416277e-05, "loss": 0.1438, "step": 2029 }, { "epoch": 3.2273449920508743, "grad_norm": 3.2554128941818172, "learning_rate": 3.326127342407941e-05, "loss": 0.2001, "step": 2030 }, { "epoch": 3.2289348171701113, "grad_norm": 3.0231568447528194, "learning_rate": 3.326557300481099e-05, "loss": 0.1624, "step": 2031 }, { "epoch": 3.2305246422893483, "grad_norm": 1.597346330492968, "learning_rate": 3.3269874864902266e-05, "loss": 0.183, "step": 2032 }, { "epoch": 3.232114467408585, "grad_norm": 2.208834066194717, "learning_rate": 3.32741790028972e-05, "loss": 0.1846, "step": 2033 }, { "epoch": 3.233704292527822, "grad_norm": 3.4048356114629046, "learning_rate": 3.3278485417339004e-05, "loss": 0.2179, "step": 2034 }, { "epoch": 3.235294117647059, "grad_norm": 2.3164545945181265, "learning_rate": 3.32827941067701e-05, "loss": 0.1463, "step": 2035 }, { "epoch": 3.236883942766296, "grad_norm": 1.7994079248617014, "learning_rate": 3.328710506973216e-05, "loss": 0.1761, "step": 2036 }, { "epoch": 3.2384737678855324, "grad_norm": 1.8010778234783207, "learning_rate": 3.3291418304766094e-05, "loss": 0.1471, "step": 2037 }, { "epoch": 3.2400635930047694, "grad_norm": 1.3559843917733438, "learning_rate": 3.329573381041201e-05, "loss": 0.1789, "step": 2038 }, { "epoch": 3.2416534181240064, "grad_norm": 4.1962914123268185, "learning_rate": 3.330005158520927e-05, "loss": 0.3014, "step": 2039 }, { "epoch": 3.2432432432432434, "grad_norm": 47.58509618502178, "learning_rate": 3.330437162769647e-05, "loss": 22.9623, "step": 2040 }, { "epoch": 3.24483306836248, "grad_norm": 3.7935446218238686, "learning_rate": 3.3308693936411426e-05, "loss": 0.2144, "step": 2041 }, { "epoch": 3.246422893481717, "grad_norm": 2.2487300045613847, "learning_rate": 3.331301850989118e-05, "loss": 0.1711, "step": 2042 }, { "epoch": 3.248012718600954, "grad_norm": 1.9941419530698707, "learning_rate": 3.331734534667205e-05, "loss": 0.1675, "step": 2043 }, { "epoch": 3.249602543720191, "grad_norm": 2.1261608900167204, "learning_rate": 3.3321674445289536e-05, "loss": 0.1439, "step": 2044 }, { "epoch": 3.2511923688394275, "grad_norm": 3.0045998227534962, "learning_rate": 3.3326005804278396e-05, "loss": 0.2156, "step": 2045 }, { "epoch": 3.2527821939586645, "grad_norm": 4.213304807376894, "learning_rate": 3.333033942217264e-05, "loss": 0.2483, "step": 2046 }, { "epoch": 3.2543720190779015, "grad_norm": 3.9534928376186027, "learning_rate": 3.333467529750548e-05, "loss": 0.2007, "step": 2047 }, { "epoch": 3.255961844197138, "grad_norm": 3.5995097561911384, "learning_rate": 3.333901342880937e-05, "loss": 0.2193, "step": 2048 }, { "epoch": 3.257551669316375, "grad_norm": 3.7608789956159767, "learning_rate": 3.334335381461603e-05, "loss": 0.1536, "step": 2049 }, { "epoch": 3.259141494435612, "grad_norm": 4.140493716440672, "learning_rate": 3.33476964534564e-05, "loss": 0.1823, "step": 2050 }, { "epoch": 3.260731319554849, "grad_norm": 3.2947050907811355, "learning_rate": 3.335204134386062e-05, "loss": 0.21, "step": 2051 }, { "epoch": 3.262321144674086, "grad_norm": 3.724432855304805, "learning_rate": 3.335638848435814e-05, "loss": 0.1665, "step": 2052 }, { "epoch": 3.2639109697933226, "grad_norm": 1.4390746557597143, "learning_rate": 3.336073787347759e-05, "loss": 0.2013, "step": 2053 }, { "epoch": 3.2655007949125596, "grad_norm": 2.138549521034436, "learning_rate": 3.3365089509746854e-05, "loss": 0.1581, "step": 2054 }, { "epoch": 3.2670906200317966, "grad_norm": 2.0356620517618595, "learning_rate": 3.336944339169308e-05, "loss": 0.1841, "step": 2055 }, { "epoch": 3.268680445151033, "grad_norm": 2.1871357520909402, "learning_rate": 3.337379951784262e-05, "loss": 0.1487, "step": 2056 }, { "epoch": 3.27027027027027, "grad_norm": 3.3897997812299923, "learning_rate": 3.33781578867211e-05, "loss": 0.1627, "step": 2057 }, { "epoch": 3.271860095389507, "grad_norm": 2.812923655334617, "learning_rate": 3.338251849685336e-05, "loss": 0.2008, "step": 2058 }, { "epoch": 3.273449920508744, "grad_norm": 2.1589961141789766, "learning_rate": 3.3386881346763476e-05, "loss": 0.1448, "step": 2059 }, { "epoch": 3.275039745627981, "grad_norm": 3.274878608170008, "learning_rate": 3.339124643497481e-05, "loss": 0.2327, "step": 2060 }, { "epoch": 3.2766295707472177, "grad_norm": 19.682556968853664, "learning_rate": 3.3395613760009926e-05, "loss": 0.5019, "step": 2061 }, { "epoch": 3.2782193958664547, "grad_norm": 2.375560645116687, "learning_rate": 3.339998332039063e-05, "loss": 0.175, "step": 2062 }, { "epoch": 3.2798092209856917, "grad_norm": 1.9781578974665686, "learning_rate": 3.3404355114638e-05, "loss": 0.1498, "step": 2063 }, { "epoch": 3.2813990461049283, "grad_norm": 1.3596175622607638, "learning_rate": 3.3408729141272346e-05, "loss": 0.217, "step": 2064 }, { "epoch": 3.2829888712241653, "grad_norm": 3.934068218806691, "learning_rate": 3.34131053988132e-05, "loss": 0.2102, "step": 2065 }, { "epoch": 3.2845786963434023, "grad_norm": 1.8627148598428587, "learning_rate": 3.341748388577936e-05, "loss": 0.1771, "step": 2066 }, { "epoch": 3.2861685214626393, "grad_norm": 1.5054654612110512, "learning_rate": 3.3421864600688886e-05, "loss": 0.1943, "step": 2067 }, { "epoch": 3.287758346581876, "grad_norm": 1.0810595631648718, "learning_rate": 3.342624754205905e-05, "loss": 0.1479, "step": 2068 }, { "epoch": 3.289348171701113, "grad_norm": 2.3677587201051287, "learning_rate": 3.343063270840637e-05, "loss": 0.1579, "step": 2069 }, { "epoch": 3.29093799682035, "grad_norm": 2.0453085968861364, "learning_rate": 3.3435020098246656e-05, "loss": 0.1811, "step": 2070 }, { "epoch": 3.292527821939587, "grad_norm": 1.323429309382073, "learning_rate": 3.3439409710094935e-05, "loss": 0.149, "step": 2071 }, { "epoch": 3.2941176470588234, "grad_norm": 9.7879780388433, "learning_rate": 3.3443801542465455e-05, "loss": 39.6278, "step": 2072 }, { "epoch": 3.2957074721780604, "grad_norm": 18.584709452989777, "learning_rate": 3.344819559387175e-05, "loss": 7.2133, "step": 2073 }, { "epoch": 3.2972972972972974, "grad_norm": 3.014184130755303, "learning_rate": 3.345259186282661e-05, "loss": 0.1924, "step": 2074 }, { "epoch": 3.2988871224165344, "grad_norm": 4.150033421339138, "learning_rate": 3.3456990347842036e-05, "loss": 0.1853, "step": 2075 }, { "epoch": 3.300476947535771, "grad_norm": 2.6950937769825067, "learning_rate": 3.3461391047429305e-05, "loss": 0.2621, "step": 2076 }, { "epoch": 3.302066772655008, "grad_norm": 1.845369806123608, "learning_rate": 3.3465793960098945e-05, "loss": 0.2772, "step": 2077 }, { "epoch": 3.303656597774245, "grad_norm": 4.434643034512295, "learning_rate": 3.3470199084360735e-05, "loss": 0.173, "step": 2078 }, { "epoch": 3.3052464228934815, "grad_norm": 3.9911423063317724, "learning_rate": 3.347460641872368e-05, "loss": 0.2178, "step": 2079 }, { "epoch": 3.3068362480127185, "grad_norm": 1.3368786895459137, "learning_rate": 3.3479015961696085e-05, "loss": 0.2259, "step": 2080 }, { "epoch": 3.3084260731319555, "grad_norm": 2.300668625990305, "learning_rate": 3.3483427711785454e-05, "loss": 0.1603, "step": 2081 }, { "epoch": 3.3100158982511925, "grad_norm": 2.328607631336273, "learning_rate": 3.3487841667498575e-05, "loss": 0.1426, "step": 2082 }, { "epoch": 3.3116057233704295, "grad_norm": 2.2766915190712584, "learning_rate": 3.349225782734149e-05, "loss": 0.2161, "step": 2083 }, { "epoch": 3.313195548489666, "grad_norm": 3.6611936435477963, "learning_rate": 3.349667618981949e-05, "loss": 0.1419, "step": 2084 }, { "epoch": 3.314785373608903, "grad_norm": 2.9980482234918946, "learning_rate": 3.3501096753437114e-05, "loss": 0.1203, "step": 2085 }, { "epoch": 3.31637519872814, "grad_norm": 1.7111955270348578, "learning_rate": 3.350551951669816e-05, "loss": 0.2149, "step": 2086 }, { "epoch": 3.3179650238473766, "grad_norm": 37.303963807717935, "learning_rate": 3.350994447810569e-05, "loss": 16.3468, "step": 2087 }, { "epoch": 3.3195548489666136, "grad_norm": 2.5257303017998574, "learning_rate": 3.351437163616202e-05, "loss": 0.2372, "step": 2088 }, { "epoch": 3.3211446740858506, "grad_norm": 2.959088143267741, "learning_rate": 3.351880098936869e-05, "loss": 0.2107, "step": 2089 }, { "epoch": 3.3227344992050876, "grad_norm": 1.2716736817045031, "learning_rate": 3.3523232536226546e-05, "loss": 0.1768, "step": 2090 }, { "epoch": 3.3243243243243246, "grad_norm": 1.9471410134252394, "learning_rate": 3.352766627523568e-05, "loss": 0.2063, "step": 2091 }, { "epoch": 3.325914149443561, "grad_norm": 1.2465839795045928, "learning_rate": 3.3532102204895395e-05, "loss": 0.1965, "step": 2092 }, { "epoch": 3.327503974562798, "grad_norm": 2.2432255659009672, "learning_rate": 3.3536540323704336e-05, "loss": 0.1735, "step": 2093 }, { "epoch": 3.329093799682035, "grad_norm": 20.537803716846273, "learning_rate": 3.354098063016033e-05, "loss": 9.2448, "step": 2094 }, { "epoch": 3.3306836248012717, "grad_norm": 2.557327523773593, "learning_rate": 3.35454231227605e-05, "loss": 0.1609, "step": 2095 }, { "epoch": 3.3322734499205087, "grad_norm": 1.7436180161122463, "learning_rate": 3.3549867800001224e-05, "loss": 0.1877, "step": 2096 }, { "epoch": 3.3338632750397457, "grad_norm": 3.0693422701988884, "learning_rate": 3.3554314660378134e-05, "loss": 0.1568, "step": 2097 }, { "epoch": 3.3354531001589827, "grad_norm": 1.81735322159398, "learning_rate": 3.355876370238614e-05, "loss": 0.2079, "step": 2098 }, { "epoch": 3.337042925278219, "grad_norm": 4.166750498568972, "learning_rate": 3.3563214924519395e-05, "loss": 0.1545, "step": 2099 }, { "epoch": 3.338632750397456, "grad_norm": 1.570116510322975, "learning_rate": 3.3567668325271324e-05, "loss": 0.1559, "step": 2100 }, { "epoch": 3.340222575516693, "grad_norm": 2.6392027116524024, "learning_rate": 3.3572123903134616e-05, "loss": 0.1922, "step": 2101 }, { "epoch": 3.34181240063593, "grad_norm": 1.5507919497905496, "learning_rate": 3.35765816566012e-05, "loss": 0.1611, "step": 2102 }, { "epoch": 3.3434022257551668, "grad_norm": 2.9069591648755027, "learning_rate": 3.358104158416231e-05, "loss": 0.1856, "step": 2103 }, { "epoch": 3.3449920508744038, "grad_norm": 3.1764079695465677, "learning_rate": 3.358550368430842e-05, "loss": 0.2452, "step": 2104 }, { "epoch": 3.3465818759936408, "grad_norm": 3.1109162341619863, "learning_rate": 3.358996795552926e-05, "loss": 0.1811, "step": 2105 }, { "epoch": 3.3481717011128778, "grad_norm": 2.0614628653725657, "learning_rate": 3.3594434396313846e-05, "loss": 0.2231, "step": 2106 }, { "epoch": 3.3497615262321143, "grad_norm": 3.150357181803421, "learning_rate": 3.3598903005150444e-05, "loss": 0.1274, "step": 2107 }, { "epoch": 3.3513513513513513, "grad_norm": 4.41304512253067, "learning_rate": 3.3603373780526594e-05, "loss": 0.3273, "step": 2108 }, { "epoch": 3.3529411764705883, "grad_norm": 3.1773168358969333, "learning_rate": 3.36078467209291e-05, "loss": 0.1698, "step": 2109 }, { "epoch": 3.3545310015898253, "grad_norm": 2.7264620764431924, "learning_rate": 3.3612321824844026e-05, "loss": 0.1793, "step": 2110 }, { "epoch": 3.356120826709062, "grad_norm": 3.3388799761897965, "learning_rate": 3.361679909075671e-05, "loss": 0.172, "step": 2111 }, { "epoch": 3.357710651828299, "grad_norm": 8.132202886736007, "learning_rate": 3.362127851715179e-05, "loss": 0.2245, "step": 2112 }, { "epoch": 3.359300476947536, "grad_norm": 3.0471120913808574, "learning_rate": 3.36257601025131e-05, "loss": 0.1993, "step": 2113 }, { "epoch": 3.360890302066773, "grad_norm": 38.9031732410398, "learning_rate": 3.363024384532381e-05, "loss": 14.8347, "step": 2114 }, { "epoch": 3.3624801271860094, "grad_norm": 3.9451946228445536, "learning_rate": 3.363472974406633e-05, "loss": 0.1653, "step": 2115 }, { "epoch": 3.3640699523052464, "grad_norm": 4.974225852814844, "learning_rate": 3.3639217797222356e-05, "loss": 0.2454, "step": 2116 }, { "epoch": 3.3656597774244834, "grad_norm": 2.943610320647188, "learning_rate": 3.3643708003272827e-05, "loss": 0.1967, "step": 2117 }, { "epoch": 3.36724960254372, "grad_norm": 6.199498369217142, "learning_rate": 3.364820036069799e-05, "loss": 0.294, "step": 2118 }, { "epoch": 3.368839427662957, "grad_norm": 3.876403373560721, "learning_rate": 3.365269486797733e-05, "loss": 0.1655, "step": 2119 }, { "epoch": 3.370429252782194, "grad_norm": 4.450210813779476, "learning_rate": 3.365719152358962e-05, "loss": 0.1666, "step": 2120 }, { "epoch": 3.372019077901431, "grad_norm": 6.199273332228174, "learning_rate": 3.36616903260129e-05, "loss": 0.2147, "step": 2121 }, { "epoch": 3.373608903020668, "grad_norm": 5.461239816934954, "learning_rate": 3.36661912737245e-05, "loss": 0.1692, "step": 2122 }, { "epoch": 3.3751987281399045, "grad_norm": 3.828740174271409, "learning_rate": 3.367069436520101e-05, "loss": 0.9425, "step": 2123 }, { "epoch": 3.3767885532591415, "grad_norm": 7.3116931172107735, "learning_rate": 3.367519959891829e-05, "loss": 0.3303, "step": 2124 }, { "epoch": 3.3783783783783785, "grad_norm": 8.853360203021568, "learning_rate": 3.367970697335149e-05, "loss": 0.2298, "step": 2125 }, { "epoch": 3.379968203497615, "grad_norm": 5.031469142603491, "learning_rate": 3.368421648697502e-05, "loss": 0.195, "step": 2126 }, { "epoch": 3.381558028616852, "grad_norm": 2.146405048242195, "learning_rate": 3.368872813826259e-05, "loss": 0.1756, "step": 2127 }, { "epoch": 3.383147853736089, "grad_norm": 5.034097998991767, "learning_rate": 3.3693241925687136e-05, "loss": 0.1971, "step": 2128 }, { "epoch": 3.384737678855326, "grad_norm": 3.611530120724988, "learning_rate": 3.369775784772094e-05, "loss": 0.1811, "step": 2129 }, { "epoch": 3.3863275039745626, "grad_norm": 4.111126576492514, "learning_rate": 3.3702275902835494e-05, "loss": 0.1922, "step": 2130 }, { "epoch": 3.3879173290937996, "grad_norm": 5.157071650324187, "learning_rate": 3.3706796089501624e-05, "loss": 0.1734, "step": 2131 }, { "epoch": 3.3895071542130366, "grad_norm": 1.7201039368085091, "learning_rate": 3.37113184061894e-05, "loss": 0.2418, "step": 2132 }, { "epoch": 3.3910969793322736, "grad_norm": 4.340771580040756, "learning_rate": 3.371584285136819e-05, "loss": 0.172, "step": 2133 }, { "epoch": 3.39268680445151, "grad_norm": 2.337969475780816, "learning_rate": 3.372036942350662e-05, "loss": 0.2358, "step": 2134 }, { "epoch": 3.394276629570747, "grad_norm": 3.088486372529841, "learning_rate": 3.372489812107262e-05, "loss": 0.1656, "step": 2135 }, { "epoch": 3.395866454689984, "grad_norm": 4.311901676947814, "learning_rate": 3.3729428942533384e-05, "loss": 0.2293, "step": 2136 }, { "epoch": 3.397456279809221, "grad_norm": 1.7915664512944196, "learning_rate": 3.3733961886355394e-05, "loss": 0.1557, "step": 2137 }, { "epoch": 3.3990461049284577, "grad_norm": 1.2885339333863737, "learning_rate": 3.373849695100442e-05, "loss": 0.1144, "step": 2138 }, { "epoch": 3.4006359300476947, "grad_norm": 41.737273747148684, "learning_rate": 3.374303413494549e-05, "loss": 15.6859, "step": 2139 }, { "epoch": 3.4022257551669317, "grad_norm": 3.6992944174720863, "learning_rate": 3.374757343664295e-05, "loss": 0.1643, "step": 2140 }, { "epoch": 3.4038155802861687, "grad_norm": 4.110467230295227, "learning_rate": 3.37521148545604e-05, "loss": 0.1788, "step": 2141 }, { "epoch": 3.4054054054054053, "grad_norm": 2.9958323299699128, "learning_rate": 3.3756658387160735e-05, "loss": 0.2365, "step": 2142 }, { "epoch": 3.4069952305246423, "grad_norm": 2.7046787458429256, "learning_rate": 3.3761204032906134e-05, "loss": 0.1948, "step": 2143 }, { "epoch": 3.4085850556438793, "grad_norm": 3.3276913456507016, "learning_rate": 3.3765751790258064e-05, "loss": 0.1543, "step": 2144 }, { "epoch": 3.4101748807631163, "grad_norm": 3.070162684081112, "learning_rate": 3.3770301657677275e-05, "loss": 0.1783, "step": 2145 }, { "epoch": 3.411764705882353, "grad_norm": 3.1458464718347794, "learning_rate": 3.37748536336238e-05, "loss": 0.2202, "step": 2146 }, { "epoch": 3.41335453100159, "grad_norm": 3.4623554352067476, "learning_rate": 3.377940771655696e-05, "loss": 0.1631, "step": 2147 }, { "epoch": 3.414944356120827, "grad_norm": 2.775281363278864, "learning_rate": 3.3783963904935367e-05, "loss": 0.1884, "step": 2148 }, { "epoch": 3.4165341812400634, "grad_norm": 2.8078079690465465, "learning_rate": 3.37885221972169e-05, "loss": 0.2201, "step": 2149 }, { "epoch": 3.4181240063593004, "grad_norm": 4.107895412851775, "learning_rate": 3.3793082591858753e-05, "loss": 0.2434, "step": 2150 }, { "epoch": 3.4197138314785374, "grad_norm": 3.1525698129921427, "learning_rate": 3.379764508731741e-05, "loss": 0.1979, "step": 2151 }, { "epoch": 3.4213036565977744, "grad_norm": 4.579972422764068, "learning_rate": 3.38022096820486e-05, "loss": 0.2237, "step": 2152 }, { "epoch": 3.4228934817170114, "grad_norm": 3.2064368093115676, "learning_rate": 3.3806776374507395e-05, "loss": 0.1749, "step": 2153 }, { "epoch": 3.424483306836248, "grad_norm": 3.7168287919867002, "learning_rate": 3.381134516314814e-05, "loss": 0.1725, "step": 2154 }, { "epoch": 3.426073131955485, "grad_norm": 2.9683114496060545, "learning_rate": 3.381591604642446e-05, "loss": 0.2521, "step": 2155 }, { "epoch": 3.427662957074722, "grad_norm": 3.8528097786993465, "learning_rate": 3.382048902278927e-05, "loss": 0.1841, "step": 2156 }, { "epoch": 3.4292527821939585, "grad_norm": 6.870375394785343, "learning_rate": 3.382506409069479e-05, "loss": 0.7296, "step": 2157 }, { "epoch": 3.4308426073131955, "grad_norm": 2.1184206237802536, "learning_rate": 3.382964124859252e-05, "loss": 0.1378, "step": 2158 }, { "epoch": 3.4324324324324325, "grad_norm": 3.000731446186443, "learning_rate": 3.383422049493325e-05, "loss": 0.1442, "step": 2159 }, { "epoch": 3.4340222575516695, "grad_norm": 3.2782961136649056, "learning_rate": 3.383880182816709e-05, "loss": 0.1615, "step": 2160 }, { "epoch": 3.435612082670906, "grad_norm": 4.054326351046822, "learning_rate": 3.384338524674342e-05, "loss": 0.2425, "step": 2161 }, { "epoch": 3.437201907790143, "grad_norm": 2.8977262837324553, "learning_rate": 3.384797074911091e-05, "loss": 0.1867, "step": 2162 }, { "epoch": 3.43879173290938, "grad_norm": 3.9481861133530023, "learning_rate": 3.385255833371753e-05, "loss": 0.1844, "step": 2163 }, { "epoch": 3.440381558028617, "grad_norm": 2.00942043841239, "learning_rate": 3.385714799901057e-05, "loss": 0.1502, "step": 2164 }, { "epoch": 3.4419713831478536, "grad_norm": 2.6894135580287992, "learning_rate": 3.386173974343657e-05, "loss": 0.1824, "step": 2165 }, { "epoch": 3.4435612082670906, "grad_norm": 1.5990469325129022, "learning_rate": 3.3866333565441406e-05, "loss": 0.1805, "step": 2166 }, { "epoch": 3.4451510333863276, "grad_norm": 3.2768738952473324, "learning_rate": 3.387092946347023e-05, "loss": 0.2358, "step": 2167 }, { "epoch": 3.4467408585055646, "grad_norm": 2.2199109289611307, "learning_rate": 3.387552743596751e-05, "loss": 0.1938, "step": 2168 }, { "epoch": 3.448330683624801, "grad_norm": 2.097812273139215, "learning_rate": 3.388012748137698e-05, "loss": 0.1297, "step": 2169 }, { "epoch": 3.449920508744038, "grad_norm": 3.212255973167637, "learning_rate": 3.388472959814169e-05, "loss": 0.1765, "step": 2170 }, { "epoch": 3.451510333863275, "grad_norm": 2.39076475124453, "learning_rate": 3.3889333784704e-05, "loss": 0.1489, "step": 2171 }, { "epoch": 3.453100158982512, "grad_norm": 2.781727952033124, "learning_rate": 3.389394003950556e-05, "loss": 0.2081, "step": 2172 }, { "epoch": 3.4546899841017487, "grad_norm": 2.7356780126644265, "learning_rate": 3.389854836098732e-05, "loss": 0.1744, "step": 2173 }, { "epoch": 3.4562798092209857, "grad_norm": 2.111571839276, "learning_rate": 3.3903158747589534e-05, "loss": 0.1709, "step": 2174 }, { "epoch": 3.4578696343402227, "grad_norm": 1.8040690783986737, "learning_rate": 3.390777119775174e-05, "loss": 0.1915, "step": 2175 }, { "epoch": 3.4594594594594597, "grad_norm": 1.3262928303365507, "learning_rate": 3.391238570991279e-05, "loss": 0.176, "step": 2176 }, { "epoch": 3.461049284578696, "grad_norm": 3.2926337738916747, "learning_rate": 3.3917002282510864e-05, "loss": 0.1997, "step": 2177 }, { "epoch": 3.462639109697933, "grad_norm": 2.123643365442141, "learning_rate": 3.3921620913983385e-05, "loss": 0.1991, "step": 2178 }, { "epoch": 3.46422893481717, "grad_norm": 1.6456511998951413, "learning_rate": 3.392624160276714e-05, "loss": 0.1589, "step": 2179 }, { "epoch": 3.4658187599364068, "grad_norm": 55.64065635317461, "learning_rate": 3.393086434729817e-05, "loss": 24.0611, "step": 2180 }, { "epoch": 3.4674085850556438, "grad_norm": 5.5795563955335465, "learning_rate": 3.393548914601187e-05, "loss": 0.2211, "step": 2181 }, { "epoch": 3.4689984101748808, "grad_norm": 4.4981944211527365, "learning_rate": 3.394011599734289e-05, "loss": 0.2543, "step": 2182 }, { "epoch": 3.4705882352941178, "grad_norm": 2.579640088456395, "learning_rate": 3.394474489972522e-05, "loss": 0.2024, "step": 2183 }, { "epoch": 3.4721780604133547, "grad_norm": 2.4271961898846834, "learning_rate": 3.394937585159214e-05, "loss": 0.1729, "step": 2184 }, { "epoch": 3.4737678855325913, "grad_norm": 3.5809252323718077, "learning_rate": 3.395400885137625e-05, "loss": 0.1688, "step": 2185 }, { "epoch": 3.4753577106518283, "grad_norm": 3.3666480906915566, "learning_rate": 3.395864389750944e-05, "loss": 0.1815, "step": 2186 }, { "epoch": 3.4769475357710653, "grad_norm": 1.414983950788271, "learning_rate": 3.396328098842291e-05, "loss": 0.1424, "step": 2187 }, { "epoch": 3.478537360890302, "grad_norm": 2.7099769763544432, "learning_rate": 3.396792012254718e-05, "loss": 0.2029, "step": 2188 }, { "epoch": 3.480127186009539, "grad_norm": 3.2846008613747064, "learning_rate": 3.397256129831206e-05, "loss": 0.1836, "step": 2189 }, { "epoch": 3.481717011128776, "grad_norm": 2.3755013789416926, "learning_rate": 3.3977204514146697e-05, "loss": 0.1599, "step": 2190 }, { "epoch": 3.483306836248013, "grad_norm": 3.576185000003846, "learning_rate": 3.398184976847951e-05, "loss": 0.1516, "step": 2191 }, { "epoch": 3.48489666136725, "grad_norm": 1.3608002899041598, "learning_rate": 3.3986497059738275e-05, "loss": 0.166, "step": 2192 }, { "epoch": 3.4864864864864864, "grad_norm": 2.7348729164966614, "learning_rate": 3.3991146386350036e-05, "loss": 0.2067, "step": 2193 }, { "epoch": 3.4880763116057234, "grad_norm": 3.860754630372321, "learning_rate": 3.399579774674116e-05, "loss": 0.149, "step": 2194 }, { "epoch": 3.4896661367249604, "grad_norm": 2.9614610542801074, "learning_rate": 3.400045113933734e-05, "loss": 0.234, "step": 2195 }, { "epoch": 3.491255961844197, "grad_norm": 3.950100658150459, "learning_rate": 3.4005106562563566e-05, "loss": 0.2657, "step": 2196 }, { "epoch": 3.492845786963434, "grad_norm": 1.981735627359607, "learning_rate": 3.400976401484414e-05, "loss": 0.1482, "step": 2197 }, { "epoch": 3.494435612082671, "grad_norm": 2.5275298011325527, "learning_rate": 3.40144234946027e-05, "loss": 0.199, "step": 2198 }, { "epoch": 3.496025437201908, "grad_norm": 4.411399450783077, "learning_rate": 3.401908500026217e-05, "loss": 0.1697, "step": 2199 }, { "epoch": 3.4976152623211445, "grad_norm": 2.4092006239734753, "learning_rate": 3.402374853024479e-05, "loss": 0.1184, "step": 2200 }, { "epoch": 3.4992050874403815, "grad_norm": 2.957952762723262, "learning_rate": 3.4028414082972135e-05, "loss": 0.1889, "step": 2201 }, { "epoch": 3.5007949125596185, "grad_norm": 2.788155962104228, "learning_rate": 3.4033081656865085e-05, "loss": 0.1552, "step": 2202 }, { "epoch": 3.502384737678855, "grad_norm": 2.438776921587329, "learning_rate": 3.403775125034384e-05, "loss": 0.206, "step": 2203 }, { "epoch": 3.503974562798092, "grad_norm": 3.876453117979367, "learning_rate": 3.404242286182791e-05, "loss": 0.154, "step": 2204 }, { "epoch": 3.505564387917329, "grad_norm": 3.239506735201599, "learning_rate": 3.404709648973611e-05, "loss": 0.2192, "step": 2205 }, { "epoch": 3.507154213036566, "grad_norm": 2.5948439312648413, "learning_rate": 3.4051772132486586e-05, "loss": 0.2253, "step": 2206 }, { "epoch": 3.508744038155803, "grad_norm": 4.030055172956011, "learning_rate": 3.405644978849682e-05, "loss": 0.2134, "step": 2207 }, { "epoch": 3.5103338632750396, "grad_norm": 4.175105502326585, "learning_rate": 3.4061129456183584e-05, "loss": 0.1504, "step": 2208 }, { "epoch": 3.5119236883942766, "grad_norm": 3.3917770467861166, "learning_rate": 3.406581113396298e-05, "loss": 0.21, "step": 2209 }, { "epoch": 3.5135135135135136, "grad_norm": 2.5649197187696853, "learning_rate": 3.4070494820250445e-05, "loss": 0.1793, "step": 2210 }, { "epoch": 3.51510333863275, "grad_norm": 3.4291171760907537, "learning_rate": 3.40751805134607e-05, "loss": 0.1714, "step": 2211 }, { "epoch": 3.516693163751987, "grad_norm": 2.3631258905262, "learning_rate": 3.4079868212007804e-05, "loss": 0.2271, "step": 2212 }, { "epoch": 3.518282988871224, "grad_norm": 3.3463495731738706, "learning_rate": 3.4084557914305156e-05, "loss": 0.1449, "step": 2213 }, { "epoch": 3.519872813990461, "grad_norm": 5.134986789482722, "learning_rate": 3.408924961876547e-05, "loss": 0.2584, "step": 2214 }, { "epoch": 3.521462639109698, "grad_norm": 3.363341197135916, "learning_rate": 3.4093943323800746e-05, "loss": 0.1694, "step": 2215 }, { "epoch": 3.5230524642289347, "grad_norm": 2.0588947012531795, "learning_rate": 3.4098639027822355e-05, "loss": 0.2065, "step": 2216 }, { "epoch": 3.5246422893481717, "grad_norm": 2.710768330059068, "learning_rate": 3.410333672924097e-05, "loss": 0.2022, "step": 2217 }, { "epoch": 3.5262321144674087, "grad_norm": 2.778468558206064, "learning_rate": 3.410803642646658e-05, "loss": 0.1645, "step": 2218 }, { "epoch": 3.5278219395866453, "grad_norm": 2.588865704206046, "learning_rate": 3.411273811790852e-05, "loss": 0.1556, "step": 2219 }, { "epoch": 3.5294117647058822, "grad_norm": 2.1734756855357906, "learning_rate": 3.411744180197542e-05, "loss": 0.1738, "step": 2220 }, { "epoch": 3.5310015898251192, "grad_norm": 1.859857625810375, "learning_rate": 3.412214747707527e-05, "loss": 0.2157, "step": 2221 }, { "epoch": 3.5325914149443562, "grad_norm": 2.2348037869973583, "learning_rate": 3.412685514161536e-05, "loss": 0.2166, "step": 2222 }, { "epoch": 3.5341812400635932, "grad_norm": 2.7988995354255506, "learning_rate": 3.413156479400232e-05, "loss": 0.1786, "step": 2223 }, { "epoch": 3.53577106518283, "grad_norm": 2.5668557554612383, "learning_rate": 3.413627643264211e-05, "loss": 0.1877, "step": 2224 }, { "epoch": 3.537360890302067, "grad_norm": 2.5269583639974114, "learning_rate": 3.414099005594e-05, "loss": 0.1769, "step": 2225 }, { "epoch": 3.538950715421304, "grad_norm": 4.06429581804545, "learning_rate": 3.41457056623006e-05, "loss": 0.2037, "step": 2226 }, { "epoch": 3.5405405405405403, "grad_norm": 2.518892455939632, "learning_rate": 3.415042325012785e-05, "loss": 0.1494, "step": 2227 }, { "epoch": 3.5421303656597773, "grad_norm": 1.7214389669068981, "learning_rate": 3.415514281782501e-05, "loss": 0.1276, "step": 2228 }, { "epoch": 3.5437201907790143, "grad_norm": 2.8492626005164525, "learning_rate": 3.41598643637947e-05, "loss": 0.1359, "step": 2229 }, { "epoch": 3.5453100158982513, "grad_norm": 1.7768946683979763, "learning_rate": 3.416458788643883e-05, "loss": 0.1348, "step": 2230 }, { "epoch": 3.5468998410174883, "grad_norm": 2.576159912904376, "learning_rate": 3.4169313384158655e-05, "loss": 0.2295, "step": 2231 }, { "epoch": 3.548489666136725, "grad_norm": 36.9665246530489, "learning_rate": 3.417404085535477e-05, "loss": 15.4191, "step": 2232 }, { "epoch": 3.550079491255962, "grad_norm": 3.525256989766491, "learning_rate": 3.4178770298427105e-05, "loss": 0.2494, "step": 2233 }, { "epoch": 3.551669316375199, "grad_norm": 2.859370599697808, "learning_rate": 3.418350171177491e-05, "loss": 0.1658, "step": 2234 }, { "epoch": 3.5532591414944354, "grad_norm": 1.9042829320072436, "learning_rate": 3.418823509379677e-05, "loss": 0.2296, "step": 2235 }, { "epoch": 3.5548489666136724, "grad_norm": 4.016433547259195, "learning_rate": 3.41929704428906e-05, "loss": 0.2644, "step": 2236 }, { "epoch": 3.5564387917329094, "grad_norm": 4.357613434026333, "learning_rate": 3.419770775745367e-05, "loss": 0.1314, "step": 2237 }, { "epoch": 3.5580286168521464, "grad_norm": 14.057584097209778, "learning_rate": 3.420244703588257e-05, "loss": 5.4259, "step": 2238 }, { "epoch": 3.559618441971383, "grad_norm": 2.253003561372279, "learning_rate": 3.420718827657321e-05, "loss": 0.1598, "step": 2239 }, { "epoch": 3.56120826709062, "grad_norm": 5.751152761592095, "learning_rate": 3.421193147792087e-05, "loss": 0.2441, "step": 2240 }, { "epoch": 3.562798092209857, "grad_norm": 2.6784110215551897, "learning_rate": 3.4216676638320134e-05, "loss": 0.1235, "step": 2241 }, { "epoch": 3.5643879173290935, "grad_norm": 2.114680599685505, "learning_rate": 3.422142375616495e-05, "loss": 0.1323, "step": 2242 }, { "epoch": 3.5659777424483305, "grad_norm": 4.075348934786634, "learning_rate": 3.422617282984858e-05, "loss": 0.2429, "step": 2243 }, { "epoch": 3.5675675675675675, "grad_norm": 2.409238601758043, "learning_rate": 3.4230923857763636e-05, "loss": 0.2051, "step": 2244 }, { "epoch": 3.5691573926868045, "grad_norm": 2.3893800850148454, "learning_rate": 3.4235676838302066e-05, "loss": 0.1243, "step": 2245 }, { "epoch": 3.5707472178060415, "grad_norm": 2.5854761109997764, "learning_rate": 3.4240431769855164e-05, "loss": 0.1967, "step": 2246 }, { "epoch": 3.572337042925278, "grad_norm": 2.9506156239410637, "learning_rate": 3.4245188650813566e-05, "loss": 0.1435, "step": 2247 }, { "epoch": 3.573926868044515, "grad_norm": 3.301166821257964, "learning_rate": 3.424994747956721e-05, "loss": 0.1946, "step": 2248 }, { "epoch": 3.575516693163752, "grad_norm": 3.320171001047837, "learning_rate": 3.425470825450544e-05, "loss": 0.2228, "step": 2249 }, { "epoch": 3.5771065182829886, "grad_norm": 4.566361161712173, "learning_rate": 3.4259470974016885e-05, "loss": 0.22, "step": 2250 }, { "epoch": 3.5786963434022256, "grad_norm": 3.029054064315815, "learning_rate": 3.4264235636489544e-05, "loss": 0.2434, "step": 2251 }, { "epoch": 3.5802861685214626, "grad_norm": 1.8838927540230304, "learning_rate": 3.426900224031074e-05, "loss": 0.1651, "step": 2252 }, { "epoch": 3.5818759936406996, "grad_norm": 4.596458434587626, "learning_rate": 3.427377078386716e-05, "loss": 0.228, "step": 2253 }, { "epoch": 3.5834658187599366, "grad_norm": 3.8577389660577066, "learning_rate": 3.4278541265544835e-05, "loss": 0.2014, "step": 2254 }, { "epoch": 3.585055643879173, "grad_norm": 6.338771202530857, "learning_rate": 3.4283313683729115e-05, "loss": 0.4152, "step": 2255 }, { "epoch": 3.58664546899841, "grad_norm": 1.841274035894927, "learning_rate": 3.4288088036804715e-05, "loss": 0.1563, "step": 2256 }, { "epoch": 3.588235294117647, "grad_norm": 5.6116912356050035, "learning_rate": 3.429286432315568e-05, "loss": 0.181, "step": 2257 }, { "epoch": 3.5898251192368837, "grad_norm": 4.214938153688392, "learning_rate": 3.429764254116542e-05, "loss": 0.2211, "step": 2258 }, { "epoch": 3.5914149443561207, "grad_norm": 2.6001930171720353, "learning_rate": 3.430242268921669e-05, "loss": 0.1626, "step": 2259 }, { "epoch": 3.5930047694753577, "grad_norm": 5.918990854283879, "learning_rate": 3.430720476569156e-05, "loss": 0.4128, "step": 2260 }, { "epoch": 3.5945945945945947, "grad_norm": 5.576818058686819, "learning_rate": 3.431198876897148e-05, "loss": 0.1494, "step": 2261 }, { "epoch": 3.5961844197138317, "grad_norm": 5.907947288151928, "learning_rate": 3.4316774697437244e-05, "loss": 0.1731, "step": 2262 }, { "epoch": 3.5977742448330683, "grad_norm": 2.327019084078932, "learning_rate": 3.4321562549468995e-05, "loss": 0.2101, "step": 2263 }, { "epoch": 3.5993640699523053, "grad_norm": 2.7985626720709877, "learning_rate": 3.43263523234462e-05, "loss": 0.1317, "step": 2264 }, { "epoch": 3.6009538950715423, "grad_norm": 4.072661424485802, "learning_rate": 3.433114401774769e-05, "loss": 0.1615, "step": 2265 }, { "epoch": 3.602543720190779, "grad_norm": 2.8231238512874657, "learning_rate": 3.4335937630751675e-05, "loss": 0.1673, "step": 2266 }, { "epoch": 3.604133545310016, "grad_norm": 3.3355505102611733, "learning_rate": 3.434073316083567e-05, "loss": 0.2443, "step": 2267 }, { "epoch": 3.605723370429253, "grad_norm": 3.562202793634764, "learning_rate": 3.4345530606376576e-05, "loss": 0.1669, "step": 2268 }, { "epoch": 3.60731319554849, "grad_norm": 1.7682861462217057, "learning_rate": 3.435032996575062e-05, "loss": 0.1754, "step": 2269 }, { "epoch": 3.6089030206677264, "grad_norm": 2.447206838622675, "learning_rate": 3.43551312373334e-05, "loss": 0.1885, "step": 2270 }, { "epoch": 3.6104928457869634, "grad_norm": 2.86177788923834, "learning_rate": 3.435993441949985e-05, "loss": 0.1619, "step": 2271 }, { "epoch": 3.6120826709062004, "grad_norm": 2.686715163840309, "learning_rate": 3.4364739510624285e-05, "loss": 0.1933, "step": 2272 }, { "epoch": 3.613672496025437, "grad_norm": 37.41382241954652, "learning_rate": 3.436954650908034e-05, "loss": 13.8121, "step": 2273 }, { "epoch": 3.615262321144674, "grad_norm": 1.9255833882516404, "learning_rate": 3.4374355413241026e-05, "loss": 0.1871, "step": 2274 }, { "epoch": 3.616852146263911, "grad_norm": 3.5383493640689485, "learning_rate": 3.437916622147869e-05, "loss": 0.1361, "step": 2275 }, { "epoch": 3.618441971383148, "grad_norm": 2.1782744586869316, "learning_rate": 3.4383978932165066e-05, "loss": 0.1045, "step": 2276 }, { "epoch": 3.620031796502385, "grad_norm": 2.9120566940800257, "learning_rate": 3.438879354367123e-05, "loss": 0.1437, "step": 2277 }, { "epoch": 3.6216216216216215, "grad_norm": 2.2983772759256205, "learning_rate": 3.4393610054367585e-05, "loss": 0.202, "step": 2278 }, { "epoch": 3.6232114467408585, "grad_norm": 1.8067785838437427, "learning_rate": 3.439842846262394e-05, "loss": 0.1745, "step": 2279 }, { "epoch": 3.6248012718600955, "grad_norm": 3.4078253803416394, "learning_rate": 3.4403248766809414e-05, "loss": 0.1872, "step": 2280 }, { "epoch": 3.626391096979332, "grad_norm": 2.5226845007078063, "learning_rate": 3.440807096529253e-05, "loss": 0.159, "step": 2281 }, { "epoch": 3.627980922098569, "grad_norm": 2.5203270088636724, "learning_rate": 3.441289505644114e-05, "loss": 0.2339, "step": 2282 }, { "epoch": 3.629570747217806, "grad_norm": 2.626330713440023, "learning_rate": 3.441772103862248e-05, "loss": 0.2438, "step": 2283 }, { "epoch": 3.631160572337043, "grad_norm": 2.0502242323045494, "learning_rate": 3.4422548910203095e-05, "loss": 0.1861, "step": 2284 }, { "epoch": 3.63275039745628, "grad_norm": 2.8756284595594077, "learning_rate": 3.442737866954896e-05, "loss": 0.2557, "step": 2285 }, { "epoch": 3.6343402225755166, "grad_norm": 1.7745984119176768, "learning_rate": 3.443221031502536e-05, "loss": 0.2023, "step": 2286 }, { "epoch": 3.6359300476947536, "grad_norm": 2.554434937357879, "learning_rate": 3.443704384499695e-05, "loss": 0.1187, "step": 2287 }, { "epoch": 3.6375198728139906, "grad_norm": 2.620617746737545, "learning_rate": 3.444187925782777e-05, "loss": 0.1723, "step": 2288 }, { "epoch": 3.639109697933227, "grad_norm": 2.922435096343134, "learning_rate": 3.444671655188121e-05, "loss": 0.2132, "step": 2289 }, { "epoch": 3.640699523052464, "grad_norm": 2.694029556211865, "learning_rate": 3.445155572552001e-05, "loss": 0.1747, "step": 2290 }, { "epoch": 3.642289348171701, "grad_norm": 4.540787233347911, "learning_rate": 3.445639677710628e-05, "loss": 0.235, "step": 2291 }, { "epoch": 3.643879173290938, "grad_norm": 4.964047278314403, "learning_rate": 3.446123970500152e-05, "loss": 0.3568, "step": 2292 }, { "epoch": 3.645468998410175, "grad_norm": 4.481992082432244, "learning_rate": 3.446608450756656e-05, "loss": 0.1436, "step": 2293 }, { "epoch": 3.6470588235294117, "grad_norm": 3.22670473643323, "learning_rate": 3.4470931183161605e-05, "loss": 0.1988, "step": 2294 }, { "epoch": 3.6486486486486487, "grad_norm": 3.671710213258525, "learning_rate": 3.4475779730146245e-05, "loss": 0.1582, "step": 2295 }, { "epoch": 3.6502384737678857, "grad_norm": 4.2184587667245035, "learning_rate": 3.448063014687942e-05, "loss": 0.1759, "step": 2296 }, { "epoch": 3.6518282988871222, "grad_norm": 3.106020188299759, "learning_rate": 3.448548243171943e-05, "loss": 0.1351, "step": 2297 }, { "epoch": 3.6534181240063592, "grad_norm": 2.66723789554461, "learning_rate": 3.449033658302396e-05, "loss": 0.1831, "step": 2298 }, { "epoch": 3.6550079491255962, "grad_norm": 1.9433657046836006, "learning_rate": 3.449519259915005e-05, "loss": 0.1679, "step": 2299 }, { "epoch": 3.6565977742448332, "grad_norm": 2.4516316541226852, "learning_rate": 3.450005047845411e-05, "loss": 0.1844, "step": 2300 }, { "epoch": 3.65818759936407, "grad_norm": 5.049219009301103, "learning_rate": 3.4504910219291934e-05, "loss": 0.183, "step": 2301 }, { "epoch": 3.659777424483307, "grad_norm": 5.49481646203679, "learning_rate": 3.450977182001869e-05, "loss": 0.1974, "step": 2302 }, { "epoch": 3.661367249602544, "grad_norm": 1.3940032561960058, "learning_rate": 3.451463527898887e-05, "loss": 0.1403, "step": 2303 }, { "epoch": 3.6629570747217803, "grad_norm": 4.562428681671952, "learning_rate": 3.451950059455638e-05, "loss": 0.1869, "step": 2304 }, { "epoch": 3.6645468998410173, "grad_norm": 4.936971297269469, "learning_rate": 3.4524367765074494e-05, "loss": 0.212, "step": 2305 }, { "epoch": 3.6661367249602543, "grad_norm": 2.6060978318861237, "learning_rate": 3.452923678889585e-05, "loss": 0.1422, "step": 2306 }, { "epoch": 3.6677265500794913, "grad_norm": 3.155356818348308, "learning_rate": 3.4534107664372465e-05, "loss": 0.239, "step": 2307 }, { "epoch": 3.6693163751987283, "grad_norm": 11.813544448137588, "learning_rate": 3.4538980389855704e-05, "loss": 0.6999, "step": 2308 }, { "epoch": 3.670906200317965, "grad_norm": 3.6389087904011648, "learning_rate": 3.454385496369635e-05, "loss": 0.1703, "step": 2309 }, { "epoch": 3.672496025437202, "grad_norm": 2.8944301972882336, "learning_rate": 3.454873138424452e-05, "loss": 0.1967, "step": 2310 }, { "epoch": 3.674085850556439, "grad_norm": 2.0656070003349427, "learning_rate": 3.455360964984973e-05, "loss": 0.1834, "step": 2311 }, { "epoch": 3.6756756756756754, "grad_norm": 2.477057663255453, "learning_rate": 3.455848975886086e-05, "loss": 0.1726, "step": 2312 }, { "epoch": 3.6772655007949124, "grad_norm": 42.26123804523394, "learning_rate": 3.456337170962617e-05, "loss": 12.6692, "step": 2313 }, { "epoch": 3.6788553259141494, "grad_norm": 2.4892692291877654, "learning_rate": 3.45682555004933e-05, "loss": 0.1478, "step": 2314 }, { "epoch": 3.6804451510333864, "grad_norm": 1.5452380338837028, "learning_rate": 3.457314112980925e-05, "loss": 0.145, "step": 2315 }, { "epoch": 3.6820349761526234, "grad_norm": 3.28085964751207, "learning_rate": 3.457802859592043e-05, "loss": 0.2478, "step": 2316 }, { "epoch": 3.68362480127186, "grad_norm": 1.8619416725266886, "learning_rate": 3.4582917897172606e-05, "loss": 0.1933, "step": 2317 }, { "epoch": 3.685214626391097, "grad_norm": 3.0318040634766428, "learning_rate": 3.4587809031910915e-05, "loss": 0.1959, "step": 2318 }, { "epoch": 3.686804451510334, "grad_norm": 6.527467021135294, "learning_rate": 3.459270199847989e-05, "loss": 0.2098, "step": 2319 }, { "epoch": 3.6883942766295705, "grad_norm": 3.1986261677412853, "learning_rate": 3.459759679522345e-05, "loss": 0.149, "step": 2320 }, { "epoch": 3.6899841017488075, "grad_norm": 1.0413873174843853, "learning_rate": 3.460249342048487e-05, "loss": 0.1505, "step": 2321 }, { "epoch": 3.6915739268680445, "grad_norm": 2.435611173089463, "learning_rate": 3.460739187260682e-05, "loss": 0.1677, "step": 2322 }, { "epoch": 3.6931637519872815, "grad_norm": 4.413936484957647, "learning_rate": 3.461229214993136e-05, "loss": 0.2017, "step": 2323 }, { "epoch": 3.6947535771065185, "grad_norm": 2.7072352061062515, "learning_rate": 3.461719425079993e-05, "loss": 0.2308, "step": 2324 }, { "epoch": 3.696343402225755, "grad_norm": 2.3537338098702585, "learning_rate": 3.462209817355333e-05, "loss": 0.1907, "step": 2325 }, { "epoch": 3.697933227344992, "grad_norm": 2.066459671956203, "learning_rate": 3.462700391653176e-05, "loss": 0.2215, "step": 2326 }, { "epoch": 3.699523052464229, "grad_norm": 2.0519213228534143, "learning_rate": 3.463191147807482e-05, "loss": 0.1468, "step": 2327 }, { "epoch": 3.7011128775834656, "grad_norm": 2.452532896603811, "learning_rate": 3.463682085652146e-05, "loss": 0.1486, "step": 2328 }, { "epoch": 3.7027027027027026, "grad_norm": 1.8992498150981318, "learning_rate": 3.464173205021004e-05, "loss": 0.1275, "step": 2329 }, { "epoch": 3.7042925278219396, "grad_norm": 2.4716633414403417, "learning_rate": 3.464664505747829e-05, "loss": 0.2015, "step": 2330 }, { "epoch": 3.7058823529411766, "grad_norm": 3.3707798231168296, "learning_rate": 3.465155987666335e-05, "loss": 0.1621, "step": 2331 }, { "epoch": 3.7074721780604136, "grad_norm": 2.2108990507123565, "learning_rate": 3.465647650610173e-05, "loss": 0.2192, "step": 2332 }, { "epoch": 3.70906200317965, "grad_norm": 56.21615648325044, "learning_rate": 3.4661394944129334e-05, "loss": 1.647, "step": 2333 }, { "epoch": 3.710651828298887, "grad_norm": 3.8275669427765107, "learning_rate": 3.466631518908143e-05, "loss": 0.2487, "step": 2334 }, { "epoch": 3.7122416534181237, "grad_norm": 6.050164351756658, "learning_rate": 3.4671237239292705e-05, "loss": 0.2014, "step": 2335 }, { "epoch": 3.7138314785373607, "grad_norm": 3.109243734928668, "learning_rate": 3.4676161093097213e-05, "loss": 0.2037, "step": 2336 }, { "epoch": 3.7154213036565977, "grad_norm": 2.8354605919419926, "learning_rate": 3.4681086748828426e-05, "loss": 0.2031, "step": 2337 }, { "epoch": 3.7170111287758347, "grad_norm": 2.4843078172652246, "learning_rate": 3.468601420481917e-05, "loss": 0.2555, "step": 2338 }, { "epoch": 3.7186009538950717, "grad_norm": 2.786951390503813, "learning_rate": 3.469094345940169e-05, "loss": 0.2061, "step": 2339 }, { "epoch": 3.7201907790143083, "grad_norm": 1.6697304009388219, "learning_rate": 3.469587451090761e-05, "loss": 0.2119, "step": 2340 }, { "epoch": 3.7217806041335453, "grad_norm": 3.5796718797231724, "learning_rate": 3.470080735766795e-05, "loss": 0.234, "step": 2341 }, { "epoch": 3.7233704292527823, "grad_norm": 2.0991303793579568, "learning_rate": 3.470574199801312e-05, "loss": 0.1727, "step": 2342 }, { "epoch": 3.724960254372019, "grad_norm": 1.6365875398144263, "learning_rate": 3.471067843027291e-05, "loss": 0.2046, "step": 2343 }, { "epoch": 3.726550079491256, "grad_norm": 1.68264238342068, "learning_rate": 3.471561665277653e-05, "loss": 0.1954, "step": 2344 }, { "epoch": 3.728139904610493, "grad_norm": 3.3875991538796857, "learning_rate": 3.472055666385256e-05, "loss": 0.18, "step": 2345 }, { "epoch": 3.72972972972973, "grad_norm": 1.3583845518927988, "learning_rate": 3.4725498461829006e-05, "loss": 0.1499, "step": 2346 }, { "epoch": 3.731319554848967, "grad_norm": 19.37809957544923, "learning_rate": 3.473044204503322e-05, "loss": 7.4579, "step": 2347 }, { "epoch": 3.7329093799682034, "grad_norm": 0.7892535760689335, "learning_rate": 3.4735387411792e-05, "loss": 0.1631, "step": 2348 }, { "epoch": 3.7344992050874404, "grad_norm": 2.4268763169751413, "learning_rate": 3.474033456043152e-05, "loss": 0.1397, "step": 2349 }, { "epoch": 3.7360890302066774, "grad_norm": 1.5980243894926773, "learning_rate": 3.474528348927732e-05, "loss": 0.162, "step": 2350 }, { "epoch": 3.737678855325914, "grad_norm": 3.6777773768793787, "learning_rate": 3.47502341966544e-05, "loss": 0.251, "step": 2351 }, { "epoch": 3.739268680445151, "grad_norm": 1.3088722355668778, "learning_rate": 3.475518668088711e-05, "loss": 0.1674, "step": 2352 }, { "epoch": 3.740858505564388, "grad_norm": 2.5353819230558408, "learning_rate": 3.4760140940299205e-05, "loss": 0.1111, "step": 2353 }, { "epoch": 3.742448330683625, "grad_norm": 1.8682585379053294, "learning_rate": 3.476509697321387e-05, "loss": 0.1761, "step": 2354 }, { "epoch": 3.744038155802862, "grad_norm": 2.6145514988595107, "learning_rate": 3.477005477795365e-05, "loss": 0.1363, "step": 2355 }, { "epoch": 3.7456279809220985, "grad_norm": 2.0331056246104007, "learning_rate": 3.4775014352840515e-05, "loss": 0.2026, "step": 2356 }, { "epoch": 3.7472178060413355, "grad_norm": 1.6606349979753297, "learning_rate": 3.477997569619583e-05, "loss": 0.1588, "step": 2357 }, { "epoch": 3.7488076311605725, "grad_norm": 1.6941984181243817, "learning_rate": 3.478493880634034e-05, "loss": 0.1302, "step": 2358 }, { "epoch": 3.750397456279809, "grad_norm": 2.694086276526322, "learning_rate": 3.478990368159424e-05, "loss": 0.2274, "step": 2359 }, { "epoch": 3.751987281399046, "grad_norm": 1.429161015228244, "learning_rate": 3.479487032027708e-05, "loss": 0.1837, "step": 2360 }, { "epoch": 3.753577106518283, "grad_norm": 2.3091023582664514, "learning_rate": 3.4799838720707845e-05, "loss": 0.166, "step": 2361 }, { "epoch": 3.75516693163752, "grad_norm": 1.738031921692634, "learning_rate": 3.4804808881204904e-05, "loss": 0.1375, "step": 2362 }, { "epoch": 3.756756756756757, "grad_norm": 1.0666717618053219, "learning_rate": 3.480978080008605e-05, "loss": 0.1521, "step": 2363 }, { "epoch": 3.7583465818759936, "grad_norm": 1.446082465316099, "learning_rate": 3.481475447566845e-05, "loss": 0.1797, "step": 2364 }, { "epoch": 3.7599364069952306, "grad_norm": 2.393680174347454, "learning_rate": 3.48197299062687e-05, "loss": 0.2275, "step": 2365 }, { "epoch": 3.7615262321144676, "grad_norm": 2.750729858337788, "learning_rate": 3.4824707090202807e-05, "loss": 0.1902, "step": 2366 }, { "epoch": 3.763116057233704, "grad_norm": 2.09719966026055, "learning_rate": 3.482968602578616e-05, "loss": 0.1771, "step": 2367 }, { "epoch": 3.764705882352941, "grad_norm": 0.8901560440594336, "learning_rate": 3.483466671133358e-05, "loss": 0.1709, "step": 2368 }, { "epoch": 3.766295707472178, "grad_norm": 2.832412908842704, "learning_rate": 3.483964914515929e-05, "loss": 0.1774, "step": 2369 }, { "epoch": 3.767885532591415, "grad_norm": 3.072405062063487, "learning_rate": 3.4844633325576905e-05, "loss": 0.1632, "step": 2370 }, { "epoch": 3.7694753577106517, "grad_norm": 1.3775188252973474, "learning_rate": 3.484961925089946e-05, "loss": 0.1798, "step": 2371 }, { "epoch": 3.7710651828298887, "grad_norm": 2.6656984173681115, "learning_rate": 3.485460691943941e-05, "loss": 0.2401, "step": 2372 }, { "epoch": 3.7726550079491257, "grad_norm": 2.8410475481407405, "learning_rate": 3.485959632950859e-05, "loss": 0.1532, "step": 2373 }, { "epoch": 3.7742448330683622, "grad_norm": 1.2800953904056138, "learning_rate": 3.48645874794183e-05, "loss": 0.1876, "step": 2374 }, { "epoch": 3.7758346581875992, "grad_norm": 3.651196357860561, "learning_rate": 3.486958036747919e-05, "loss": 0.1661, "step": 2375 }, { "epoch": 3.7774244833068362, "grad_norm": 1.967634903761419, "learning_rate": 3.487457499200135e-05, "loss": 0.1366, "step": 2376 }, { "epoch": 3.779014308426073, "grad_norm": 57.894450147851586, "learning_rate": 3.487957135129429e-05, "loss": 23.0071, "step": 2377 }, { "epoch": 3.78060413354531, "grad_norm": 1.9366433387358164, "learning_rate": 3.488456944366691e-05, "loss": 0.156, "step": 2378 }, { "epoch": 3.7821939586645468, "grad_norm": 1.9819929713986302, "learning_rate": 3.488956926742755e-05, "loss": 0.1209, "step": 2379 }, { "epoch": 3.7837837837837838, "grad_norm": 2.581085864964594, "learning_rate": 3.489457082088394e-05, "loss": 0.1926, "step": 2380 }, { "epoch": 3.7853736089030208, "grad_norm": 4.493207814518764, "learning_rate": 3.489957410234325e-05, "loss": 0.2476, "step": 2381 }, { "epoch": 3.7869634340222573, "grad_norm": 1.114143972082843, "learning_rate": 3.4904579110112034e-05, "loss": 0.1929, "step": 2382 }, { "epoch": 3.7885532591414943, "grad_norm": 2.1586913375810783, "learning_rate": 3.490958584249629e-05, "loss": 0.1605, "step": 2383 }, { "epoch": 3.7901430842607313, "grad_norm": 3.111469634313678, "learning_rate": 3.491459429780141e-05, "loss": 0.1804, "step": 2384 }, { "epoch": 3.7917329093799683, "grad_norm": 2.1194095132951283, "learning_rate": 3.4919604474332224e-05, "loss": 0.2098, "step": 2385 }, { "epoch": 3.7933227344992053, "grad_norm": 1.9405234973877377, "learning_rate": 3.492461637039296e-05, "loss": 0.164, "step": 2386 }, { "epoch": 3.794912559618442, "grad_norm": 2.5648742357407484, "learning_rate": 3.4929629984287275e-05, "loss": 0.2008, "step": 2387 }, { "epoch": 3.796502384737679, "grad_norm": 4.4408873963457545, "learning_rate": 3.4934645314318245e-05, "loss": 0.1842, "step": 2388 }, { "epoch": 3.798092209856916, "grad_norm": 2.48413781072719, "learning_rate": 3.493966235878836e-05, "loss": 0.1586, "step": 2389 }, { "epoch": 3.7996820349761524, "grad_norm": 2.386148581839202, "learning_rate": 3.4944681115999535e-05, "loss": 0.2059, "step": 2390 }, { "epoch": 3.8012718600953894, "grad_norm": 1.5817612876378282, "learning_rate": 3.49497015842531e-05, "loss": 0.1519, "step": 2391 }, { "epoch": 3.8028616852146264, "grad_norm": 1.9317169464537525, "learning_rate": 3.4954723761849814e-05, "loss": 0.1376, "step": 2392 }, { "epoch": 3.8044515103338634, "grad_norm": 3.1973302093237743, "learning_rate": 3.495974764708983e-05, "loss": 0.201, "step": 2393 }, { "epoch": 3.8060413354531004, "grad_norm": 3.0405341932530234, "learning_rate": 3.4964773238272774e-05, "loss": 0.1956, "step": 2394 }, { "epoch": 3.807631160572337, "grad_norm": 1.7686012740430275, "learning_rate": 3.4969800533697644e-05, "loss": 0.1453, "step": 2395 }, { "epoch": 3.809220985691574, "grad_norm": 5.717205893084337, "learning_rate": 3.4974829531662905e-05, "loss": 0.2696, "step": 2396 }, { "epoch": 3.810810810810811, "grad_norm": 1.8058259312810452, "learning_rate": 3.49798602304664e-05, "loss": 0.1395, "step": 2397 }, { "epoch": 3.8124006359300475, "grad_norm": 2.441482058096391, "learning_rate": 3.498489262840543e-05, "loss": 0.1405, "step": 2398 }, { "epoch": 3.8139904610492845, "grad_norm": 2.9756814243122442, "learning_rate": 3.498992672377671e-05, "loss": 0.1775, "step": 2399 }, { "epoch": 3.8155802861685215, "grad_norm": 1.756883062229637, "learning_rate": 3.499496251487637e-05, "loss": 0.1553, "step": 2400 }, { "epoch": 3.8171701112877585, "grad_norm": 1.656680457274463, "learning_rate": 3.5000000000000004e-05, "loss": 0.1677, "step": 2401 }, { "epoch": 3.818759936406995, "grad_norm": 12.710179507984915, "learning_rate": 3.500503917744258e-05, "loss": 0.4823, "step": 2402 }, { "epoch": 3.820349761526232, "grad_norm": 2.649124846644406, "learning_rate": 3.5010080045498535e-05, "loss": 0.2023, "step": 2403 }, { "epoch": 3.821939586645469, "grad_norm": 3.3603041894553916, "learning_rate": 3.50151226024617e-05, "loss": 0.2355, "step": 2404 }, { "epoch": 3.8235294117647056, "grad_norm": 1.8062759740131256, "learning_rate": 3.502016684662536e-05, "loss": 0.1368, "step": 2405 }, { "epoch": 3.8251192368839426, "grad_norm": 51.03220242009722, "learning_rate": 3.5025212776282234e-05, "loss": 20.0733, "step": 2406 }, { "epoch": 3.8267090620031796, "grad_norm": 3.760107257941648, "learning_rate": 3.5030260389724446e-05, "loss": 0.2466, "step": 2407 }, { "epoch": 3.8282988871224166, "grad_norm": 2.063777922408905, "learning_rate": 3.503530968524356e-05, "loss": 0.163, "step": 2408 }, { "epoch": 3.8298887122416536, "grad_norm": 2.7472765168197246, "learning_rate": 3.504036066113058e-05, "loss": 0.1665, "step": 2409 }, { "epoch": 3.83147853736089, "grad_norm": 14.259954166086972, "learning_rate": 3.504541331567592e-05, "loss": 0.2014, "step": 2410 }, { "epoch": 3.833068362480127, "grad_norm": 1.9234807647109433, "learning_rate": 3.505046764716946e-05, "loss": 0.1582, "step": 2411 }, { "epoch": 3.834658187599364, "grad_norm": 3.423325038728435, "learning_rate": 3.505552365390048e-05, "loss": 0.1668, "step": 2412 }, { "epoch": 3.8362480127186007, "grad_norm": 4.090884866253771, "learning_rate": 3.50605813341577e-05, "loss": 0.2612, "step": 2413 }, { "epoch": 3.8378378378378377, "grad_norm": 2.86284835758412, "learning_rate": 3.506564068622927e-05, "loss": 0.2451, "step": 2414 }, { "epoch": 3.8394276629570747, "grad_norm": 6.745436587247749, "learning_rate": 3.507070170840281e-05, "loss": 0.2984, "step": 2415 }, { "epoch": 3.8410174880763117, "grad_norm": 3.0543069602422723, "learning_rate": 3.5075764398965334e-05, "loss": 0.2088, "step": 2416 }, { "epoch": 3.8426073131955487, "grad_norm": 1.9401446800764404, "learning_rate": 3.5080828756203295e-05, "loss": 0.1326, "step": 2417 }, { "epoch": 3.8441971383147853, "grad_norm": 3.3092215033239722, "learning_rate": 3.50858947784026e-05, "loss": 0.1811, "step": 2418 }, { "epoch": 3.8457869634340223, "grad_norm": 5.095733416555802, "learning_rate": 3.5090962463848594e-05, "loss": 0.2737, "step": 2419 }, { "epoch": 3.8473767885532593, "grad_norm": 2.851306681658006, "learning_rate": 3.509603181082603e-05, "loss": 0.2068, "step": 2420 }, { "epoch": 3.848966613672496, "grad_norm": 4.305893085543297, "learning_rate": 3.510110281761913e-05, "loss": 0.2362, "step": 2421 }, { "epoch": 3.850556438791733, "grad_norm": 3.9469952522170533, "learning_rate": 3.510617548251154e-05, "loss": 0.183, "step": 2422 }, { "epoch": 3.85214626391097, "grad_norm": 2.2271723987204792, "learning_rate": 3.511124980378634e-05, "loss": 0.2052, "step": 2423 }, { "epoch": 3.853736089030207, "grad_norm": 2.6914982073820446, "learning_rate": 3.5116325779726066e-05, "loss": 0.2303, "step": 2424 }, { "epoch": 3.855325914149444, "grad_norm": 2.2197683212908053, "learning_rate": 3.512140340861268e-05, "loss": 0.2001, "step": 2425 }, { "epoch": 3.8569157392686804, "grad_norm": 2.9430444680745884, "learning_rate": 3.512648268872758e-05, "loss": 0.2329, "step": 2426 }, { "epoch": 3.8585055643879174, "grad_norm": 3.514802912130992, "learning_rate": 3.513156361835162e-05, "loss": 0.209, "step": 2427 }, { "epoch": 3.8600953895071544, "grad_norm": 3.6130586054933898, "learning_rate": 3.51366461957651e-05, "loss": 0.1501, "step": 2428 }, { "epoch": 3.861685214626391, "grad_norm": 2.7654406475294158, "learning_rate": 3.514173041924773e-05, "loss": 0.2438, "step": 2429 }, { "epoch": 3.863275039745628, "grad_norm": 2.608737201712087, "learning_rate": 3.514681628707871e-05, "loss": 0.173, "step": 2430 }, { "epoch": 3.864864864864865, "grad_norm": 2.3006024229888844, "learning_rate": 3.515190379753663e-05, "loss": 0.182, "step": 2431 }, { "epoch": 3.866454689984102, "grad_norm": 3.484628928285874, "learning_rate": 3.5156992948899576e-05, "loss": 0.1638, "step": 2432 }, { "epoch": 3.868044515103339, "grad_norm": 1.9459528299051319, "learning_rate": 3.516208373944504e-05, "loss": 0.1814, "step": 2433 }, { "epoch": 3.8696343402225755, "grad_norm": 1.839963229647296, "learning_rate": 3.5167176167449976e-05, "loss": 0.1166, "step": 2434 }, { "epoch": 3.8712241653418125, "grad_norm": 3.504119171750761, "learning_rate": 3.5172270231190785e-05, "loss": 0.1875, "step": 2435 }, { "epoch": 3.872813990461049, "grad_norm": 2.662756725082136, "learning_rate": 3.5177365928943314e-05, "loss": 0.1704, "step": 2436 }, { "epoch": 3.874403815580286, "grad_norm": 2.248953447929612, "learning_rate": 3.5182463258982854e-05, "loss": 0.1584, "step": 2437 }, { "epoch": 3.875993640699523, "grad_norm": 3.379535349683668, "learning_rate": 3.518756221958412e-05, "loss": 0.165, "step": 2438 }, { "epoch": 3.87758346581876, "grad_norm": 40.1916363439513, "learning_rate": 3.5192662809021336e-05, "loss": 14.5069, "step": 2439 }, { "epoch": 3.879173290937997, "grad_norm": 2.15921144500605, "learning_rate": 3.519776502556812e-05, "loss": 0.1903, "step": 2440 }, { "epoch": 3.8807631160572336, "grad_norm": 3.3737463743479674, "learning_rate": 3.5202868867497535e-05, "loss": 0.1614, "step": 2441 }, { "epoch": 3.8823529411764706, "grad_norm": 2.6087143641687014, "learning_rate": 3.520797433308215e-05, "loss": 0.1999, "step": 2442 }, { "epoch": 3.8839427662957076, "grad_norm": 2.282769125786468, "learning_rate": 3.521308142059393e-05, "loss": 0.1909, "step": 2443 }, { "epoch": 3.885532591414944, "grad_norm": 2.4181488796289083, "learning_rate": 3.521819012830432e-05, "loss": 0.2073, "step": 2444 }, { "epoch": 3.887122416534181, "grad_norm": 3.525508575349125, "learning_rate": 3.522330045448421e-05, "loss": 0.1132, "step": 2445 }, { "epoch": 3.888712241653418, "grad_norm": 2.245257200779694, "learning_rate": 3.5228412397403914e-05, "loss": 0.1297, "step": 2446 }, { "epoch": 3.890302066772655, "grad_norm": 3.960690661743815, "learning_rate": 3.5233525955333254e-05, "loss": 0.1479, "step": 2447 }, { "epoch": 3.891891891891892, "grad_norm": 2.607574382611782, "learning_rate": 3.523864112654147e-05, "loss": 0.1997, "step": 2448 }, { "epoch": 3.8934817170111287, "grad_norm": 1.3603443321277813, "learning_rate": 3.524375790929725e-05, "loss": 0.1728, "step": 2449 }, { "epoch": 3.8950715421303657, "grad_norm": 2.398928494582779, "learning_rate": 3.5248876301868754e-05, "loss": 0.1807, "step": 2450 }, { "epoch": 3.8966613672496027, "grad_norm": 2.072211532077674, "learning_rate": 3.52539963025236e-05, "loss": 0.2199, "step": 2451 }, { "epoch": 3.898251192368839, "grad_norm": 23.75057433890675, "learning_rate": 3.525911790952884e-05, "loss": 4.3345, "step": 2452 }, { "epoch": 3.899841017488076, "grad_norm": 4.151208819237776, "learning_rate": 3.5264241121151e-05, "loss": 0.1527, "step": 2453 }, { "epoch": 3.901430842607313, "grad_norm": 2.52612688840614, "learning_rate": 3.526936593565606e-05, "loss": 0.1967, "step": 2454 }, { "epoch": 3.90302066772655, "grad_norm": 3.0246190616549344, "learning_rate": 3.527449235130946e-05, "loss": 0.184, "step": 2455 }, { "epoch": 3.904610492845787, "grad_norm": 4.271881579103825, "learning_rate": 3.5279620366376087e-05, "loss": 0.1454, "step": 2456 }, { "epoch": 3.9062003179650238, "grad_norm": 3.5670224221087174, "learning_rate": 3.52847499791203e-05, "loss": 0.1479, "step": 2457 }, { "epoch": 3.9077901430842608, "grad_norm": 2.806537248601152, "learning_rate": 3.5289881187805904e-05, "loss": 0.165, "step": 2458 }, { "epoch": 3.9093799682034978, "grad_norm": 2.2702142539930974, "learning_rate": 3.5295013990696175e-05, "loss": 0.1798, "step": 2459 }, { "epoch": 3.9109697933227343, "grad_norm": 2.1196379847661917, "learning_rate": 3.5300148386053835e-05, "loss": 0.1762, "step": 2460 }, { "epoch": 3.9125596184419713, "grad_norm": 3.0816845470778316, "learning_rate": 3.53052843721411e-05, "loss": 0.1546, "step": 2461 }, { "epoch": 3.9141494435612083, "grad_norm": 3.1863201638618643, "learning_rate": 3.5310421947219595e-05, "loss": 0.1619, "step": 2462 }, { "epoch": 3.9157392686804453, "grad_norm": 3.1392011699340094, "learning_rate": 3.5315561109550455e-05, "loss": 0.2251, "step": 2463 }, { "epoch": 3.9173290937996823, "grad_norm": 2.6968733323538197, "learning_rate": 3.532070185739427e-05, "loss": 0.2102, "step": 2464 }, { "epoch": 3.918918918918919, "grad_norm": 2.531282954476646, "learning_rate": 3.5325844189011066e-05, "loss": 0.1847, "step": 2465 }, { "epoch": 3.920508744038156, "grad_norm": 3.1937133073499737, "learning_rate": 3.5330988102660344e-05, "loss": 0.1467, "step": 2466 }, { "epoch": 3.9220985691573924, "grad_norm": 1.9510489840910583, "learning_rate": 3.533613359660109e-05, "loss": 0.1676, "step": 2467 }, { "epoch": 3.9236883942766294, "grad_norm": 4.015000107746218, "learning_rate": 3.5341280669091734e-05, "loss": 0.1471, "step": 2468 }, { "epoch": 3.9252782193958664, "grad_norm": 2.393296239677172, "learning_rate": 3.534642931839018e-05, "loss": 0.1383, "step": 2469 }, { "epoch": 3.9268680445151034, "grad_norm": 3.7303861039752433, "learning_rate": 3.535157954275381e-05, "loss": 0.2361, "step": 2470 }, { "epoch": 3.9284578696343404, "grad_norm": 3.0680192836164806, "learning_rate": 3.535673134043943e-05, "loss": 0.1495, "step": 2471 }, { "epoch": 3.930047694753577, "grad_norm": 4.098729265569174, "learning_rate": 3.536188470970337e-05, "loss": 0.2483, "step": 2472 }, { "epoch": 3.931637519872814, "grad_norm": 3.07416289221877, "learning_rate": 3.536703964880138e-05, "loss": 0.2466, "step": 2473 }, { "epoch": 3.933227344992051, "grad_norm": 1.3144657546088647, "learning_rate": 3.537219615598872e-05, "loss": 0.1792, "step": 2474 }, { "epoch": 3.9348171701112875, "grad_norm": 2.7890898563836126, "learning_rate": 3.537735422952009e-05, "loss": 0.1722, "step": 2475 }, { "epoch": 3.9364069952305245, "grad_norm": 3.101523806384929, "learning_rate": 3.538251386764966e-05, "loss": 0.1842, "step": 2476 }, { "epoch": 3.9379968203497615, "grad_norm": 3.1351566854918356, "learning_rate": 3.5387675068631094e-05, "loss": 0.1326, "step": 2477 }, { "epoch": 3.9395866454689985, "grad_norm": 2.5568469699974385, "learning_rate": 3.5392837830717506e-05, "loss": 0.2199, "step": 2478 }, { "epoch": 3.9411764705882355, "grad_norm": 3.152698562235238, "learning_rate": 3.539800215216148e-05, "loss": 0.2167, "step": 2479 }, { "epoch": 3.942766295707472, "grad_norm": 2.3233547599794155, "learning_rate": 3.54031680312151e-05, "loss": 0.1961, "step": 2480 }, { "epoch": 3.944356120826709, "grad_norm": 4.776217524395933, "learning_rate": 3.540833546612989e-05, "loss": 0.1808, "step": 2481 }, { "epoch": 3.945945945945946, "grad_norm": 2.7665568807803105, "learning_rate": 3.5413504455156854e-05, "loss": 0.2102, "step": 2482 }, { "epoch": 3.9475357710651826, "grad_norm": 2.2106151605749798, "learning_rate": 3.541867499654649e-05, "loss": 0.1709, "step": 2483 }, { "epoch": 3.9491255961844196, "grad_norm": 2.1384481747260944, "learning_rate": 3.542384708854874e-05, "loss": 0.1429, "step": 2484 }, { "epoch": 3.9507154213036566, "grad_norm": 4.0496993514586235, "learning_rate": 3.542902072941306e-05, "loss": 0.2919, "step": 2485 }, { "epoch": 3.9523052464228936, "grad_norm": 2.609633301174429, "learning_rate": 3.543419591738835e-05, "loss": 0.2099, "step": 2486 }, { "epoch": 3.9538950715421306, "grad_norm": 1.5510065122093692, "learning_rate": 3.543937265072299e-05, "loss": 0.1418, "step": 2487 }, { "epoch": 3.955484896661367, "grad_norm": 3.036603358895255, "learning_rate": 3.544455092766485e-05, "loss": 0.182, "step": 2488 }, { "epoch": 3.957074721780604, "grad_norm": 1.989447320850289, "learning_rate": 3.5449730746461265e-05, "loss": 0.1752, "step": 2489 }, { "epoch": 3.958664546899841, "grad_norm": 3.0522344712938927, "learning_rate": 3.545491210535906e-05, "loss": 0.2401, "step": 2490 }, { "epoch": 3.9602543720190777, "grad_norm": 2.5157826879854817, "learning_rate": 3.5460095002604534e-05, "loss": 0.2001, "step": 2491 }, { "epoch": 3.9618441971383147, "grad_norm": 2.6362953045053654, "learning_rate": 3.546527943644345e-05, "loss": 0.2016, "step": 2492 }, { "epoch": 3.9634340222575517, "grad_norm": 2.7176922938212016, "learning_rate": 3.5470465405121096e-05, "loss": 0.1552, "step": 2493 }, { "epoch": 3.9650238473767887, "grad_norm": 1.352395017307062, "learning_rate": 3.547565290688218e-05, "loss": 0.185, "step": 2494 }, { "epoch": 3.9666136724960257, "grad_norm": 2.9172143704607008, "learning_rate": 3.5480841939970927e-05, "loss": 0.1893, "step": 2495 }, { "epoch": 3.9682034976152623, "grad_norm": 3.114537081421992, "learning_rate": 3.548603250263104e-05, "loss": 0.1486, "step": 2496 }, { "epoch": 3.9697933227344993, "grad_norm": 2.4737064533545254, "learning_rate": 3.549122459310569e-05, "loss": 0.1496, "step": 2497 }, { "epoch": 3.9713831478537363, "grad_norm": 1.6321133872125866, "learning_rate": 3.549641820963757e-05, "loss": 0.201, "step": 2498 }, { "epoch": 3.972972972972973, "grad_norm": 1.8528968206770604, "learning_rate": 3.55016133504688e-05, "loss": 0.1783, "step": 2499 }, { "epoch": 3.97456279809221, "grad_norm": 1.920757501079606, "learning_rate": 3.550681001384104e-05, "loss": 0.1023, "step": 2500 }, { "epoch": 3.976152623211447, "grad_norm": 1.5605398462428834, "learning_rate": 3.5512008197995385e-05, "loss": 0.1576, "step": 2501 }, { "epoch": 3.977742448330684, "grad_norm": 1.4653499470889104, "learning_rate": 3.551720790117244e-05, "loss": 0.1717, "step": 2502 }, { "epoch": 3.9793322734499204, "grad_norm": 3.149158580056034, "learning_rate": 3.552240912161231e-05, "loss": 0.2491, "step": 2503 }, { "epoch": 3.9809220985691574, "grad_norm": 1.8675693978738612, "learning_rate": 3.552761185755455e-05, "loss": 0.1301, "step": 2504 }, { "epoch": 3.9825119236883944, "grad_norm": 1.269456174649296, "learning_rate": 3.553281610723823e-05, "loss": 0.137, "step": 2505 }, { "epoch": 3.984101748807631, "grad_norm": 1.2907197159785364, "learning_rate": 3.553802186890191e-05, "loss": 0.1679, "step": 2506 }, { "epoch": 3.985691573926868, "grad_norm": 4.382660389061243, "learning_rate": 3.5543229140783626e-05, "loss": 0.121, "step": 2507 }, { "epoch": 3.987281399046105, "grad_norm": 3.9124623768642253, "learning_rate": 3.554843792112089e-05, "loss": 0.1956, "step": 2508 }, { "epoch": 3.988871224165342, "grad_norm": 3.784450163471156, "learning_rate": 3.5553648208150726e-05, "loss": 0.1748, "step": 2509 }, { "epoch": 3.990461049284579, "grad_norm": 3.260134734577807, "learning_rate": 3.555886000010965e-05, "loss": 0.1567, "step": 2510 }, { "epoch": 3.9920508744038155, "grad_norm": 2.377956461241035, "learning_rate": 3.5564073295233646e-05, "loss": 0.1498, "step": 2511 }, { "epoch": 3.9936406995230525, "grad_norm": 4.67320743724486, "learning_rate": 3.5569288091758204e-05, "loss": 0.2018, "step": 2512 }, { "epoch": 3.9952305246422894, "grad_norm": 3.3038487149362137, "learning_rate": 3.557450438791831e-05, "loss": 0.1526, "step": 2513 }, { "epoch": 3.996820349761526, "grad_norm": 2.4694900236450534, "learning_rate": 3.557972218194844e-05, "loss": 0.1414, "step": 2514 }, { "epoch": 3.998410174880763, "grad_norm": 3.528485949937308, "learning_rate": 3.558494147208255e-05, "loss": 0.1758, "step": 2515 }, { "epoch": 4.0, "grad_norm": 2.917378654821198, "learning_rate": 3.55901622565541e-05, "loss": 0.1948, "step": 2516 }, { "epoch": 4.001589825119237, "grad_norm": 3.5125347488278544, "learning_rate": 3.5595384533596046e-05, "loss": 0.1839, "step": 2517 }, { "epoch": 4.003179650238474, "grad_norm": 2.865393217103114, "learning_rate": 3.5600608301440846e-05, "loss": 0.196, "step": 2518 }, { "epoch": 4.004769475357711, "grad_norm": 2.091703164723003, "learning_rate": 3.560583355832044e-05, "loss": 0.1191, "step": 2519 }, { "epoch": 4.006359300476947, "grad_norm": 2.859144838286252, "learning_rate": 3.561106030246625e-05, "loss": 0.1672, "step": 2520 }, { "epoch": 4.007949125596184, "grad_norm": 5.409872828541667, "learning_rate": 3.561628853210923e-05, "loss": 0.2442, "step": 2521 }, { "epoch": 4.009538950715421, "grad_norm": 2.715933085530968, "learning_rate": 3.5621518245479805e-05, "loss": 0.1996, "step": 2522 }, { "epoch": 4.011128775834658, "grad_norm": 2.2312917411922806, "learning_rate": 3.5626749440807916e-05, "loss": 0.1718, "step": 2523 }, { "epoch": 4.012718600953895, "grad_norm": 1.6730027999192127, "learning_rate": 3.563198211632298e-05, "loss": 0.1261, "step": 2524 }, { "epoch": 4.014308426073132, "grad_norm": 3.730518253755824, "learning_rate": 3.5637216270253934e-05, "loss": 0.2291, "step": 2525 }, { "epoch": 4.015898251192369, "grad_norm": 2.4137992285459267, "learning_rate": 3.564245190082921e-05, "loss": 0.1388, "step": 2526 }, { "epoch": 4.017488076311606, "grad_norm": 3.1883868886845823, "learning_rate": 3.564768900627672e-05, "loss": 0.2168, "step": 2527 }, { "epoch": 4.019077901430842, "grad_norm": 1.937624566264173, "learning_rate": 3.565292758482392e-05, "loss": 0.2081, "step": 2528 }, { "epoch": 4.020667726550079, "grad_norm": 2.146416869766526, "learning_rate": 3.565816763469772e-05, "loss": 0.2658, "step": 2529 }, { "epoch": 4.022257551669316, "grad_norm": 2.8357864381839346, "learning_rate": 3.5663409154124556e-05, "loss": 0.1801, "step": 2530 }, { "epoch": 4.023847376788553, "grad_norm": 2.9829409511608813, "learning_rate": 3.5668652141330376e-05, "loss": 0.1805, "step": 2531 }, { "epoch": 4.02543720190779, "grad_norm": 2.3801137221671045, "learning_rate": 3.567389659454059e-05, "loss": 0.1914, "step": 2532 }, { "epoch": 4.027027027027027, "grad_norm": 1.5228563979544558, "learning_rate": 3.567914251198018e-05, "loss": 0.1264, "step": 2533 }, { "epoch": 4.028616852146264, "grad_norm": 1.9259917632666435, "learning_rate": 3.568438989187356e-05, "loss": 0.1804, "step": 2534 }, { "epoch": 4.030206677265501, "grad_norm": 12.676223011378253, "learning_rate": 3.5689638732444706e-05, "loss": 3.5049, "step": 2535 }, { "epoch": 4.031796502384737, "grad_norm": 2.8182668902691823, "learning_rate": 3.569488903191705e-05, "loss": 0.148, "step": 2536 }, { "epoch": 4.033386327503974, "grad_norm": 2.7453346182150247, "learning_rate": 3.570014078851357e-05, "loss": 0.1654, "step": 2537 }, { "epoch": 4.034976152623211, "grad_norm": 2.148787479534688, "learning_rate": 3.570539400045674e-05, "loss": 0.1551, "step": 2538 }, { "epoch": 4.036565977742448, "grad_norm": 2.745298960248963, "learning_rate": 3.571064866596854e-05, "loss": 0.1961, "step": 2539 }, { "epoch": 4.038155802861685, "grad_norm": 2.2169181664290076, "learning_rate": 3.571590478327045e-05, "loss": 0.2213, "step": 2540 }, { "epoch": 4.039745627980922, "grad_norm": 3.435458225096928, "learning_rate": 3.572116235058346e-05, "loss": 0.2095, "step": 2541 }, { "epoch": 4.041335453100159, "grad_norm": 32.04824081369827, "learning_rate": 3.572642136612808e-05, "loss": 8.5681, "step": 2542 }, { "epoch": 4.042925278219396, "grad_norm": 2.4283899406764275, "learning_rate": 3.573168182812432e-05, "loss": 0.1958, "step": 2543 }, { "epoch": 4.044515103338632, "grad_norm": 2.177852217748721, "learning_rate": 3.573694373479171e-05, "loss": 0.1666, "step": 2544 }, { "epoch": 4.046104928457869, "grad_norm": 2.960537239514045, "learning_rate": 3.574220708434928e-05, "loss": 0.1965, "step": 2545 }, { "epoch": 4.047694753577106, "grad_norm": 2.325024155037367, "learning_rate": 3.574747187501557e-05, "loss": 0.1652, "step": 2546 }, { "epoch": 4.049284578696343, "grad_norm": 2.196849574016487, "learning_rate": 3.575273810500866e-05, "loss": 0.1584, "step": 2547 }, { "epoch": 4.05087440381558, "grad_norm": 158.66865002808538, "learning_rate": 3.57580057725461e-05, "loss": 16.2171, "step": 2548 }, { "epoch": 4.052464228934817, "grad_norm": 2.0229446352077405, "learning_rate": 3.576327487584499e-05, "loss": 0.1986, "step": 2549 }, { "epoch": 4.054054054054054, "grad_norm": 3.504939202939681, "learning_rate": 3.576854541312192e-05, "loss": 0.1752, "step": 2550 }, { "epoch": 4.0556438791732905, "grad_norm": 2.703221637896262, "learning_rate": 3.577381738259301e-05, "loss": 0.1841, "step": 2551 }, { "epoch": 4.0572337042925275, "grad_norm": 9.235755329496932, "learning_rate": 3.577909078247388e-05, "loss": 1.0857, "step": 2552 }, { "epoch": 4.0588235294117645, "grad_norm": 3.083185840362384, "learning_rate": 3.5784365610979685e-05, "loss": 0.1658, "step": 2553 }, { "epoch": 4.0604133545310015, "grad_norm": 3.044260922276935, "learning_rate": 3.578964186632509e-05, "loss": 0.1748, "step": 2554 }, { "epoch": 4.0620031796502385, "grad_norm": 2.5188173064263477, "learning_rate": 3.5794919546724264e-05, "loss": 0.2696, "step": 2555 }, { "epoch": 4.0635930047694755, "grad_norm": 2.799516264975812, "learning_rate": 3.5800198650390906e-05, "loss": 0.1868, "step": 2556 }, { "epoch": 4.0651828298887125, "grad_norm": 2.8983494403645684, "learning_rate": 3.580547917553823e-05, "loss": 0.1993, "step": 2557 }, { "epoch": 4.0667726550079495, "grad_norm": 3.4830796317011052, "learning_rate": 3.5810761120378967e-05, "loss": 0.2489, "step": 2558 }, { "epoch": 4.068362480127186, "grad_norm": 2.4298914203302737, "learning_rate": 3.5816044483125385e-05, "loss": 0.1786, "step": 2559 }, { "epoch": 4.069952305246423, "grad_norm": 2.7659351926985205, "learning_rate": 3.582132926198923e-05, "loss": 0.1875, "step": 2560 }, { "epoch": 4.07154213036566, "grad_norm": 3.1983703432363644, "learning_rate": 3.5826615455181826e-05, "loss": 0.1711, "step": 2561 }, { "epoch": 4.073131955484897, "grad_norm": 3.868628043510097, "learning_rate": 3.583190306091396e-05, "loss": 0.1592, "step": 2562 }, { "epoch": 4.074721780604134, "grad_norm": 2.632769843890157, "learning_rate": 3.583719207739599e-05, "loss": 0.1719, "step": 2563 }, { "epoch": 4.076311605723371, "grad_norm": 2.426780028206715, "learning_rate": 3.584248250283777e-05, "loss": 0.1788, "step": 2564 }, { "epoch": 4.077901430842608, "grad_norm": 2.3334647190219706, "learning_rate": 3.584777433544867e-05, "loss": 0.1999, "step": 2565 }, { "epoch": 4.079491255961845, "grad_norm": 2.681549399032922, "learning_rate": 3.5853067573437606e-05, "loss": 0.1617, "step": 2566 }, { "epoch": 4.081081081081081, "grad_norm": 3.8034447868384347, "learning_rate": 3.585836221501302e-05, "loss": 0.1353, "step": 2567 }, { "epoch": 4.082670906200318, "grad_norm": 1.8680499685002874, "learning_rate": 3.586365825838285e-05, "loss": 0.1611, "step": 2568 }, { "epoch": 4.084260731319555, "grad_norm": 2.1274135878496065, "learning_rate": 3.586895570175458e-05, "loss": 0.1933, "step": 2569 }, { "epoch": 4.085850556438792, "grad_norm": 1.7237852665809665, "learning_rate": 3.587425454333523e-05, "loss": 0.1104, "step": 2570 }, { "epoch": 4.087440381558029, "grad_norm": 1.9171173271630355, "learning_rate": 3.5879554781331314e-05, "loss": 0.1657, "step": 2571 }, { "epoch": 4.089030206677266, "grad_norm": 2.698412939090953, "learning_rate": 3.588485641394891e-05, "loss": 0.1956, "step": 2572 }, { "epoch": 4.090620031796503, "grad_norm": 2.1542359336191095, "learning_rate": 3.58901594393936e-05, "loss": 0.1636, "step": 2573 }, { "epoch": 4.09220985691574, "grad_norm": 1.6697561519569541, "learning_rate": 3.589546385587051e-05, "loss": 0.205, "step": 2574 }, { "epoch": 4.093799682034976, "grad_norm": 1.5583387036635463, "learning_rate": 3.5900769661584274e-05, "loss": 0.206, "step": 2575 }, { "epoch": 4.095389507154213, "grad_norm": 3.214191276768939, "learning_rate": 3.5906076854739076e-05, "loss": 0.1108, "step": 2576 }, { "epoch": 4.09697933227345, "grad_norm": 1.7274702957364836, "learning_rate": 3.5911385433538624e-05, "loss": 0.1686, "step": 2577 }, { "epoch": 4.098569157392687, "grad_norm": 1.8914262922436411, "learning_rate": 3.591669539618616e-05, "loss": 0.2483, "step": 2578 }, { "epoch": 4.100158982511924, "grad_norm": 3.484272830076163, "learning_rate": 3.592200674088444e-05, "loss": 0.2068, "step": 2579 }, { "epoch": 4.101748807631161, "grad_norm": 2.834870843716858, "learning_rate": 3.5927319465835774e-05, "loss": 0.2352, "step": 2580 }, { "epoch": 4.103338632750398, "grad_norm": 2.731273231540694, "learning_rate": 3.5932633569242e-05, "loss": 0.183, "step": 2581 }, { "epoch": 4.104928457869635, "grad_norm": 1.6257336209148003, "learning_rate": 3.593794904930448e-05, "loss": 0.1697, "step": 2582 }, { "epoch": 4.106518282988871, "grad_norm": 2.566487916977648, "learning_rate": 3.5943265904224134e-05, "loss": 0.2068, "step": 2583 }, { "epoch": 4.108108108108108, "grad_norm": 2.720067528068507, "learning_rate": 3.594858413220137e-05, "loss": 0.2471, "step": 2584 }, { "epoch": 4.109697933227345, "grad_norm": 1.7671619872633417, "learning_rate": 3.595390373143619e-05, "loss": 0.1392, "step": 2585 }, { "epoch": 4.111287758346582, "grad_norm": 1.7685577260004848, "learning_rate": 3.5959224700128085e-05, "loss": 0.2124, "step": 2586 }, { "epoch": 4.112877583465819, "grad_norm": 1.870853995145257, "learning_rate": 3.59645470364761e-05, "loss": 0.1739, "step": 2587 }, { "epoch": 4.114467408585056, "grad_norm": 1.7021865766966053, "learning_rate": 3.596987073867882e-05, "loss": 0.2072, "step": 2588 }, { "epoch": 4.116057233704293, "grad_norm": 5.616823650089961, "learning_rate": 3.5975195804934373e-05, "loss": 7.6576, "step": 2589 }, { "epoch": 4.117647058823529, "grad_norm": 2.3711637678633855, "learning_rate": 3.59805222334404e-05, "loss": 0.1426, "step": 2590 }, { "epoch": 4.119236883942766, "grad_norm": 0.943145945860843, "learning_rate": 3.5985850022394115e-05, "loss": 0.1638, "step": 2591 }, { "epoch": 4.120826709062003, "grad_norm": 1.5680379217940754, "learning_rate": 3.599117916999224e-05, "loss": 0.1453, "step": 2592 }, { "epoch": 4.12241653418124, "grad_norm": 2.301409541776319, "learning_rate": 3.5996509674431056e-05, "loss": 0.1689, "step": 2593 }, { "epoch": 4.124006359300477, "grad_norm": 1.2457821087274152, "learning_rate": 3.600184153390638e-05, "loss": 0.1799, "step": 2594 }, { "epoch": 4.125596184419714, "grad_norm": 2.014737641798629, "learning_rate": 3.600717474661358e-05, "loss": 0.2377, "step": 2595 }, { "epoch": 4.127186009538951, "grad_norm": 2.2681098277587974, "learning_rate": 3.601250931074754e-05, "loss": 0.1899, "step": 2596 }, { "epoch": 4.128775834658188, "grad_norm": 2.8617757235505654, "learning_rate": 3.601784522450272e-05, "loss": 0.1634, "step": 2597 }, { "epoch": 4.130365659777424, "grad_norm": 1.430311352518948, "learning_rate": 3.602318248607309e-05, "loss": 0.1641, "step": 2598 }, { "epoch": 4.131955484896661, "grad_norm": 3.601325069866256, "learning_rate": 3.60285210936522e-05, "loss": 0.2021, "step": 2599 }, { "epoch": 4.133545310015898, "grad_norm": 1.8600422595400594, "learning_rate": 3.60338610454331e-05, "loss": 0.2222, "step": 2600 }, { "epoch": 4.135135135135135, "grad_norm": 2.8717644323128377, "learning_rate": 3.603920233960844e-05, "loss": 0.1746, "step": 2601 }, { "epoch": 4.136724960254372, "grad_norm": 2.1934706411692635, "learning_rate": 3.6044544974370355e-05, "loss": 0.1195, "step": 2602 }, { "epoch": 4.138314785373609, "grad_norm": 1.2528174713899627, "learning_rate": 3.604988894791057e-05, "loss": 0.1527, "step": 2603 }, { "epoch": 4.139904610492846, "grad_norm": 1.4102457846683318, "learning_rate": 3.6055234258420346e-05, "loss": 0.1601, "step": 2604 }, { "epoch": 4.141494435612083, "grad_norm": 2.055286138422291, "learning_rate": 3.606058090409049e-05, "loss": 0.1792, "step": 2605 }, { "epoch": 4.143084260731319, "grad_norm": 2.0721522562202876, "learning_rate": 3.606592888311135e-05, "loss": 0.2006, "step": 2606 }, { "epoch": 4.144674085850556, "grad_norm": 1.2467439429735536, "learning_rate": 3.607127819367283e-05, "loss": 0.1412, "step": 2607 }, { "epoch": 4.146263910969793, "grad_norm": 51.62272940003203, "learning_rate": 3.607662883396439e-05, "loss": 14.3185, "step": 2608 }, { "epoch": 4.14785373608903, "grad_norm": 3.791169706538436, "learning_rate": 3.6081980802175014e-05, "loss": 0.373, "step": 2609 }, { "epoch": 4.149443561208267, "grad_norm": 2.5846253450907457, "learning_rate": 3.608733409649328e-05, "loss": 0.14, "step": 2610 }, { "epoch": 4.151033386327504, "grad_norm": 50.32377859297938, "learning_rate": 3.609268871510727e-05, "loss": 16.4595, "step": 2611 }, { "epoch": 4.152623211446741, "grad_norm": 3.322793804234763, "learning_rate": 3.6098044656204636e-05, "loss": 0.2678, "step": 2612 }, { "epoch": 4.154213036565977, "grad_norm": 2.6230785533858896, "learning_rate": 3.6103401917972614e-05, "loss": 0.1414, "step": 2613 }, { "epoch": 4.155802861685214, "grad_norm": 3.6459821814226485, "learning_rate": 3.610876049859794e-05, "loss": 0.183, "step": 2614 }, { "epoch": 4.157392686804451, "grad_norm": 2.158306729984067, "learning_rate": 3.611412039626694e-05, "loss": 0.1466, "step": 2615 }, { "epoch": 4.158982511923688, "grad_norm": 3.214937128982499, "learning_rate": 3.6119481609165476e-05, "loss": 0.1645, "step": 2616 }, { "epoch": 4.160572337042925, "grad_norm": 2.3220342397726514, "learning_rate": 3.612484413547897e-05, "loss": 0.1995, "step": 2617 }, { "epoch": 4.162162162162162, "grad_norm": 2.608819600018179, "learning_rate": 3.6130207973392415e-05, "loss": 0.1891, "step": 2618 }, { "epoch": 4.163751987281399, "grad_norm": 1.75453889544742, "learning_rate": 3.613557312109033e-05, "loss": 0.1855, "step": 2619 }, { "epoch": 4.165341812400636, "grad_norm": 2.255692700689292, "learning_rate": 3.6140939576756814e-05, "loss": 0.1509, "step": 2620 }, { "epoch": 4.166931637519872, "grad_norm": 3.7064843449878966, "learning_rate": 3.614630733857552e-05, "loss": 0.1668, "step": 2621 }, { "epoch": 4.168521462639109, "grad_norm": 2.4847019646961312, "learning_rate": 3.6151676404729645e-05, "loss": 0.1556, "step": 2622 }, { "epoch": 4.170111287758346, "grad_norm": 3.1156487284386216, "learning_rate": 3.6157046773401964e-05, "loss": 0.1454, "step": 2623 }, { "epoch": 4.171701112877583, "grad_norm": 6.444378461489699, "learning_rate": 3.61624184427748e-05, "loss": 0.2125, "step": 2624 }, { "epoch": 4.17329093799682, "grad_norm": 2.90917773299034, "learning_rate": 3.616779141103003e-05, "loss": 0.1813, "step": 2625 }, { "epoch": 4.174880763116057, "grad_norm": 2.6017592717961695, "learning_rate": 3.61731656763491e-05, "loss": 0.1544, "step": 2626 }, { "epoch": 4.176470588235294, "grad_norm": 3.4949250718379647, "learning_rate": 3.617854123691303e-05, "loss": 0.2069, "step": 2627 }, { "epoch": 4.178060413354531, "grad_norm": 4.00988653636346, "learning_rate": 3.618391809090238e-05, "loss": 0.2029, "step": 2628 }, { "epoch": 4.1796502384737675, "grad_norm": 3.0203370585041087, "learning_rate": 3.6189296236497255e-05, "loss": 0.1417, "step": 2629 }, { "epoch": 4.1812400635930045, "grad_norm": 3.6484348364358326, "learning_rate": 3.6194675671877395e-05, "loss": 0.2191, "step": 2630 }, { "epoch": 4.1828298887122415, "grad_norm": 2.109931995336289, "learning_rate": 3.620005639522201e-05, "loss": 0.104, "step": 2631 }, { "epoch": 4.1844197138314785, "grad_norm": 5.322513605535563, "learning_rate": 3.620543840470995e-05, "loss": 0.215, "step": 2632 }, { "epoch": 4.1860095389507155, "grad_norm": 2.822585307981762, "learning_rate": 3.621082169851959e-05, "loss": 0.1551, "step": 2633 }, { "epoch": 4.1875993640699525, "grad_norm": 3.1168746712290836, "learning_rate": 3.621620627482888e-05, "loss": 0.2244, "step": 2634 }, { "epoch": 4.1891891891891895, "grad_norm": 4.012009398444918, "learning_rate": 3.622159213181533e-05, "loss": 0.1913, "step": 2635 }, { "epoch": 4.1907790143084265, "grad_norm": 5.589943858404057, "learning_rate": 3.6226979267656035e-05, "loss": 0.2269, "step": 2636 }, { "epoch": 4.192368839427663, "grad_norm": 1.7220056346790287, "learning_rate": 3.6232367680527634e-05, "loss": 0.1426, "step": 2637 }, { "epoch": 4.1939586645469, "grad_norm": 3.9570101958872157, "learning_rate": 3.623775736860635e-05, "loss": 0.2488, "step": 2638 }, { "epoch": 4.195548489666137, "grad_norm": 3.826450792136232, "learning_rate": 3.624314833006796e-05, "loss": 0.1643, "step": 2639 }, { "epoch": 4.197138314785374, "grad_norm": 32.55621151259951, "learning_rate": 3.624854056308783e-05, "loss": 9.861, "step": 2640 }, { "epoch": 4.198728139904611, "grad_norm": 2.9895635358558863, "learning_rate": 3.625393406584088e-05, "loss": 0.1996, "step": 2641 }, { "epoch": 4.200317965023848, "grad_norm": 2.9508203173429206, "learning_rate": 3.625932883650161e-05, "loss": 0.1982, "step": 2642 }, { "epoch": 4.201907790143085, "grad_norm": 4.839071719036717, "learning_rate": 3.6264724873244074e-05, "loss": 0.1628, "step": 2643 }, { "epoch": 4.203497615262322, "grad_norm": 4.174836149787471, "learning_rate": 3.627012217424191e-05, "loss": 0.1691, "step": 2644 }, { "epoch": 4.205087440381558, "grad_norm": 4.920053909346024, "learning_rate": 3.627552073766834e-05, "loss": 0.1803, "step": 2645 }, { "epoch": 4.206677265500795, "grad_norm": 3.3732409467923117, "learning_rate": 3.628092056169614e-05, "loss": 0.2197, "step": 2646 }, { "epoch": 4.208267090620032, "grad_norm": 5.29426581225197, "learning_rate": 3.628632164449765e-05, "loss": 0.2207, "step": 2647 }, { "epoch": 4.209856915739269, "grad_norm": 5.354406853759848, "learning_rate": 3.6291723984244824e-05, "loss": 0.2292, "step": 2648 }, { "epoch": 4.211446740858506, "grad_norm": 160.4010414373529, "learning_rate": 3.629712757910915e-05, "loss": 8.6029, "step": 2649 }, { "epoch": 4.213036565977743, "grad_norm": 6.746407517547819, "learning_rate": 3.630253242726171e-05, "loss": 0.2074, "step": 2650 }, { "epoch": 4.21462639109698, "grad_norm": 5.0885689546621995, "learning_rate": 3.630793852687316e-05, "loss": 0.1917, "step": 2651 }, { "epoch": 4.216216216216216, "grad_norm": 2.357998478924187, "learning_rate": 3.631334587611373e-05, "loss": 0.1045, "step": 2652 }, { "epoch": 4.217806041335453, "grad_norm": 4.71153141149594, "learning_rate": 3.631875447315322e-05, "loss": 0.2956, "step": 2653 }, { "epoch": 4.21939586645469, "grad_norm": 8.098104413236705, "learning_rate": 3.632416431616103e-05, "loss": 0.1945, "step": 2654 }, { "epoch": 4.220985691573927, "grad_norm": 2.637818313434111, "learning_rate": 3.632957540330612e-05, "loss": 0.1046, "step": 2655 }, { "epoch": 4.222575516693164, "grad_norm": 5.925312865469627, "learning_rate": 3.633498773275703e-05, "loss": 0.1738, "step": 2656 }, { "epoch": 4.224165341812401, "grad_norm": 9.967813560771157, "learning_rate": 3.634040130268188e-05, "loss": 0.2563, "step": 2657 }, { "epoch": 4.225755166931638, "grad_norm": 3.848052597739168, "learning_rate": 3.634581611124838e-05, "loss": 0.133, "step": 2658 }, { "epoch": 4.227344992050875, "grad_norm": 8.360872285952548, "learning_rate": 3.6351232156623805e-05, "loss": 0.246, "step": 2659 }, { "epoch": 4.228934817170111, "grad_norm": 5.909109090532057, "learning_rate": 3.6356649436975025e-05, "loss": 0.1652, "step": 2660 }, { "epoch": 4.230524642289348, "grad_norm": 4.32512618012791, "learning_rate": 3.636206795046848e-05, "loss": 0.1628, "step": 2661 }, { "epoch": 4.232114467408585, "grad_norm": 6.356703735794279, "learning_rate": 3.636748769527022e-05, "loss": 0.2258, "step": 2662 }, { "epoch": 4.233704292527822, "grad_norm": 10.909012764021467, "learning_rate": 3.6372908669545833e-05, "loss": 0.1897, "step": 2663 }, { "epoch": 4.235294117647059, "grad_norm": 5.471527227553544, "learning_rate": 3.637833087146053e-05, "loss": 0.3065, "step": 2664 }, { "epoch": 4.236883942766296, "grad_norm": 4.102357036129911, "learning_rate": 3.638375429917908e-05, "loss": 0.1887, "step": 2665 }, { "epoch": 4.238473767885533, "grad_norm": 8.825678254514678, "learning_rate": 3.638917895086586e-05, "loss": 0.2471, "step": 2666 }, { "epoch": 4.24006359300477, "grad_norm": 5.85351207258296, "learning_rate": 3.639460482468482e-05, "loss": 0.2109, "step": 2667 }, { "epoch": 4.241653418124006, "grad_norm": 4.0660631468204995, "learning_rate": 3.640003191879948e-05, "loss": 0.2062, "step": 2668 }, { "epoch": 4.243243243243243, "grad_norm": 6.037543091691723, "learning_rate": 3.6405460231373005e-05, "loss": 0.2501, "step": 2669 }, { "epoch": 4.24483306836248, "grad_norm": 2.8105689972528216, "learning_rate": 3.641088976056807e-05, "loss": 0.1706, "step": 2670 }, { "epoch": 4.246422893481717, "grad_norm": 17.36328034885686, "learning_rate": 3.6416320504547e-05, "loss": 0.3135, "step": 2671 }, { "epoch": 4.248012718600954, "grad_norm": 4.186353610505322, "learning_rate": 3.6421752461471674e-05, "loss": 0.2532, "step": 2672 }, { "epoch": 4.249602543720191, "grad_norm": 9.842024940376808, "learning_rate": 3.642718562950356e-05, "loss": 0.1777, "step": 2673 }, { "epoch": 4.251192368839428, "grad_norm": 6.128437705109032, "learning_rate": 3.643262000680375e-05, "loss": 0.2072, "step": 2674 }, { "epoch": 4.252782193958664, "grad_norm": 6.517263165754503, "learning_rate": 3.6438055591532895e-05, "loss": 0.2246, "step": 2675 }, { "epoch": 4.254372019077901, "grad_norm": 5.468729619601952, "learning_rate": 3.644349238185124e-05, "loss": 0.1952, "step": 2676 }, { "epoch": 4.255961844197138, "grad_norm": 8.302234882987564, "learning_rate": 3.644893037591863e-05, "loss": 0.202, "step": 2677 }, { "epoch": 4.257551669316375, "grad_norm": 6.283138724901664, "learning_rate": 3.645436957189451e-05, "loss": 0.2065, "step": 2678 }, { "epoch": 4.259141494435612, "grad_norm": 21.442714400362348, "learning_rate": 3.6459809967937906e-05, "loss": 5.8897, "step": 2679 }, { "epoch": 4.260731319554849, "grad_norm": 4.054543492929826, "learning_rate": 3.646525156220743e-05, "loss": 0.1905, "step": 2680 }, { "epoch": 4.262321144674086, "grad_norm": 6.20388183572734, "learning_rate": 3.6470694352861315e-05, "loss": 0.1921, "step": 2681 }, { "epoch": 4.263910969793323, "grad_norm": 7.191170983352282, "learning_rate": 3.6476138338057367e-05, "loss": 0.2056, "step": 2682 }, { "epoch": 4.26550079491256, "grad_norm": 2.1414628660681325, "learning_rate": 3.648158351595298e-05, "loss": 0.1841, "step": 2683 }, { "epoch": 4.267090620031796, "grad_norm": 7.005617010890684, "learning_rate": 3.648702988470518e-05, "loss": 0.2057, "step": 2684 }, { "epoch": 4.268680445151033, "grad_norm": 3.737852647160204, "learning_rate": 3.6492477442470564e-05, "loss": 0.1602, "step": 2685 }, { "epoch": 4.27027027027027, "grad_norm": 4.184546399706757, "learning_rate": 3.649792618740533e-05, "loss": 0.2856, "step": 2686 }, { "epoch": 4.271860095389507, "grad_norm": 4.463398609951982, "learning_rate": 3.6503376117665265e-05, "loss": 0.1866, "step": 2687 }, { "epoch": 4.273449920508744, "grad_norm": 6.075873238989334, "learning_rate": 3.6508827231405775e-05, "loss": 0.2206, "step": 2688 }, { "epoch": 4.275039745627981, "grad_norm": 3.638851809792017, "learning_rate": 3.651427952678185e-05, "loss": 0.2493, "step": 2689 }, { "epoch": 4.276629570747218, "grad_norm": 3.960162781741858, "learning_rate": 3.651973300194809e-05, "loss": 0.2272, "step": 2690 }, { "epoch": 4.278219395866454, "grad_norm": 5.503408794151038, "learning_rate": 3.652518765505869e-05, "loss": 0.179, "step": 2691 }, { "epoch": 4.279809220985691, "grad_norm": 2.8986482810655003, "learning_rate": 3.653064348426745e-05, "loss": 0.1642, "step": 2692 }, { "epoch": 4.281399046104928, "grad_norm": 3.995806164588751, "learning_rate": 3.6536100487727755e-05, "loss": 0.158, "step": 2693 }, { "epoch": 4.282988871224165, "grad_norm": 2.8716230600332904, "learning_rate": 3.654155866359263e-05, "loss": 0.1757, "step": 2694 }, { "epoch": 4.284578696343402, "grad_norm": 5.298558594128385, "learning_rate": 3.654701801001466e-05, "loss": 0.2072, "step": 2695 }, { "epoch": 4.286168521462639, "grad_norm": 2.278107412142198, "learning_rate": 3.655247852514606e-05, "loss": 0.141, "step": 2696 }, { "epoch": 4.287758346581876, "grad_norm": 3.1097924898701117, "learning_rate": 3.655794020713865e-05, "loss": 0.1991, "step": 2697 }, { "epoch": 4.289348171701113, "grad_norm": 3.2331113837860266, "learning_rate": 3.656340305414384e-05, "loss": 0.1322, "step": 2698 }, { "epoch": 4.290937996820349, "grad_norm": 3.740215850735703, "learning_rate": 3.656886706431267e-05, "loss": 0.135, "step": 2699 }, { "epoch": 4.292527821939586, "grad_norm": 3.204813306660602, "learning_rate": 3.657433223579574e-05, "loss": 0.164, "step": 2700 }, { "epoch": 4.294117647058823, "grad_norm": 1.7242666735296988, "learning_rate": 3.6579798566743314e-05, "loss": 0.1341, "step": 2701 }, { "epoch": 4.29570747217806, "grad_norm": 3.196599253357943, "learning_rate": 3.658526605530523e-05, "loss": 0.1468, "step": 2702 }, { "epoch": 4.297297297297297, "grad_norm": 2.9927088179256174, "learning_rate": 3.659073469963094e-05, "loss": 0.1536, "step": 2703 }, { "epoch": 4.298887122416534, "grad_norm": 2.2110937165329148, "learning_rate": 3.65962044978695e-05, "loss": 0.1645, "step": 2704 }, { "epoch": 4.300476947535771, "grad_norm": 3.6079806151496188, "learning_rate": 3.660167544816959e-05, "loss": 0.1787, "step": 2705 }, { "epoch": 4.302066772655008, "grad_norm": 3.585262841904655, "learning_rate": 3.660714754867949e-05, "loss": 0.2007, "step": 2706 }, { "epoch": 4.3036565977742445, "grad_norm": 2.569430965336825, "learning_rate": 3.6612620797547084e-05, "loss": 0.1616, "step": 2707 }, { "epoch": 4.3052464228934815, "grad_norm": 2.9172509601080097, "learning_rate": 3.661809519291989e-05, "loss": 0.1645, "step": 2708 }, { "epoch": 4.3068362480127185, "grad_norm": 2.5688462954812046, "learning_rate": 3.6623570732945006e-05, "loss": 0.1278, "step": 2709 }, { "epoch": 4.3084260731319555, "grad_norm": 2.8645330308010792, "learning_rate": 3.662904741576918e-05, "loss": 0.1306, "step": 2710 }, { "epoch": 4.3100158982511925, "grad_norm": 2.944714214475977, "learning_rate": 3.663452523953874e-05, "loss": 0.1686, "step": 2711 }, { "epoch": 4.3116057233704295, "grad_norm": 1.9873329297346898, "learning_rate": 3.664000420239964e-05, "loss": 0.1678, "step": 2712 }, { "epoch": 4.3131955484896665, "grad_norm": 4.9327898010633655, "learning_rate": 3.664548430249745e-05, "loss": 0.1922, "step": 2713 }, { "epoch": 4.314785373608903, "grad_norm": 2.8359832019966627, "learning_rate": 3.665096553797736e-05, "loss": 0.1741, "step": 2714 }, { "epoch": 4.31637519872814, "grad_norm": 2.949733494563496, "learning_rate": 3.665644790698417e-05, "loss": 0.1659, "step": 2715 }, { "epoch": 4.317965023847377, "grad_norm": 4.045401247445941, "learning_rate": 3.6661931407662294e-05, "loss": 0.1806, "step": 2716 }, { "epoch": 4.319554848966614, "grad_norm": 3.3181079553035544, "learning_rate": 3.6667416038155756e-05, "loss": 0.1936, "step": 2717 }, { "epoch": 4.321144674085851, "grad_norm": 3.5146240760721836, "learning_rate": 3.6672901796608224e-05, "loss": 0.1889, "step": 2718 }, { "epoch": 4.322734499205088, "grad_norm": 2.543682724162422, "learning_rate": 3.6678388681162976e-05, "loss": 0.2109, "step": 2719 }, { "epoch": 4.324324324324325, "grad_norm": 6.397625058600096, "learning_rate": 3.668387668996286e-05, "loss": 0.3312, "step": 2720 }, { "epoch": 4.325914149443562, "grad_norm": 5.485836836138616, "learning_rate": 3.6689365821150425e-05, "loss": 0.1988, "step": 2721 }, { "epoch": 4.327503974562799, "grad_norm": 3.7908427510107887, "learning_rate": 3.669485607286777e-05, "loss": 0.1191, "step": 2722 }, { "epoch": 4.329093799682035, "grad_norm": 3.5461132810895775, "learning_rate": 3.670034744325666e-05, "loss": 0.1553, "step": 2723 }, { "epoch": 4.330683624801272, "grad_norm": 5.055192122477079, "learning_rate": 3.6705839930458466e-05, "loss": 0.2001, "step": 2724 }, { "epoch": 4.332273449920509, "grad_norm": 2.2907995961639154, "learning_rate": 3.671133353261417e-05, "loss": 0.1922, "step": 2725 }, { "epoch": 4.333863275039746, "grad_norm": 6.0071808266270335, "learning_rate": 3.6716828247864396e-05, "loss": 0.1524, "step": 2726 }, { "epoch": 4.335453100158983, "grad_norm": 2.919174510256984, "learning_rate": 3.672232407434937e-05, "loss": 0.1751, "step": 2727 }, { "epoch": 4.33704292527822, "grad_norm": 3.9701786776207757, "learning_rate": 3.6727821010208965e-05, "loss": 0.144, "step": 2728 }, { "epoch": 4.338632750397457, "grad_norm": 2.267128019843562, "learning_rate": 3.673331905358266e-05, "loss": 0.1352, "step": 2729 }, { "epoch": 4.340222575516693, "grad_norm": 2.434928793100397, "learning_rate": 3.673881820260957e-05, "loss": 0.1599, "step": 2730 }, { "epoch": 4.34181240063593, "grad_norm": 1.8483118056371717, "learning_rate": 3.674431845542843e-05, "loss": 0.1975, "step": 2731 }, { "epoch": 4.343402225755167, "grad_norm": 3.828776479036021, "learning_rate": 3.674981981017761e-05, "loss": 0.155, "step": 2732 }, { "epoch": 4.344992050874404, "grad_norm": 3.973162968564649, "learning_rate": 3.67553222649951e-05, "loss": 0.21, "step": 2733 }, { "epoch": 4.346581875993641, "grad_norm": 2.6259682888789557, "learning_rate": 3.67608258180185e-05, "loss": 0.1385, "step": 2734 }, { "epoch": 4.348171701112878, "grad_norm": 5.199045865872974, "learning_rate": 3.6766330467385085e-05, "loss": 0.1842, "step": 2735 }, { "epoch": 4.349761526232115, "grad_norm": 3.639265218194823, "learning_rate": 3.677183621123171e-05, "loss": 0.2359, "step": 2736 }, { "epoch": 4.351351351351352, "grad_norm": 6.6463089266357755, "learning_rate": 3.677734304769489e-05, "loss": 0.1993, "step": 2737 }, { "epoch": 4.352941176470588, "grad_norm": 5.388148038750875, "learning_rate": 3.678285097491075e-05, "loss": 0.2136, "step": 2738 }, { "epoch": 4.354531001589825, "grad_norm": 3.1318104063775785, "learning_rate": 3.678835999101507e-05, "loss": 0.1725, "step": 2739 }, { "epoch": 4.356120826709062, "grad_norm": 4.1035092355690574, "learning_rate": 3.679387009414324e-05, "loss": 0.1257, "step": 2740 }, { "epoch": 4.357710651828299, "grad_norm": 4.471408504940488, "learning_rate": 3.679938128243029e-05, "loss": 0.1449, "step": 2741 }, { "epoch": 4.359300476947536, "grad_norm": 3.726520642595853, "learning_rate": 3.6804893554010876e-05, "loss": 0.2134, "step": 2742 }, { "epoch": 4.360890302066773, "grad_norm": 2.5915664696735523, "learning_rate": 3.6810406907019304e-05, "loss": 0.1458, "step": 2743 }, { "epoch": 4.36248012718601, "grad_norm": 2.990889940844389, "learning_rate": 3.68159213395895e-05, "loss": 0.2314, "step": 2744 }, { "epoch": 4.364069952305247, "grad_norm": 3.150654551382951, "learning_rate": 3.682143684985503e-05, "loss": 0.1999, "step": 2745 }, { "epoch": 4.365659777424483, "grad_norm": 1.708517664827149, "learning_rate": 3.682695343594908e-05, "loss": 0.1889, "step": 2746 }, { "epoch": 4.36724960254372, "grad_norm": 2.176965273218404, "learning_rate": 3.683247109600451e-05, "loss": 0.1647, "step": 2747 }, { "epoch": 4.368839427662957, "grad_norm": 4.126780458342471, "learning_rate": 3.683798982815377e-05, "loss": 0.1729, "step": 2748 }, { "epoch": 4.370429252782194, "grad_norm": 4.087307192787987, "learning_rate": 3.684350963052898e-05, "loss": 0.2913, "step": 2749 }, { "epoch": 4.372019077901431, "grad_norm": 3.439228142764807, "learning_rate": 3.684903050126188e-05, "loss": 0.1435, "step": 2750 }, { "epoch": 4.373608903020668, "grad_norm": 3.430060799474536, "learning_rate": 3.6854552438483864e-05, "loss": 0.1444, "step": 2751 }, { "epoch": 4.375198728139905, "grad_norm": 46.58806938344657, "learning_rate": 3.686007544032595e-05, "loss": 9.7998, "step": 2752 }, { "epoch": 4.376788553259141, "grad_norm": 3.701340429498749, "learning_rate": 3.6865599504918805e-05, "loss": 0.2045, "step": 2753 }, { "epoch": 4.378378378378378, "grad_norm": 3.0624153999447405, "learning_rate": 3.687112463039274e-05, "loss": 0.1951, "step": 2754 }, { "epoch": 4.379968203497615, "grad_norm": 27.651019430636584, "learning_rate": 3.6876650814877675e-05, "loss": 8.646, "step": 2755 }, { "epoch": 4.381558028616852, "grad_norm": 2.1058910279558205, "learning_rate": 3.688217805650323e-05, "loss": 0.1525, "step": 2756 }, { "epoch": 4.383147853736089, "grad_norm": 2.7941184698924144, "learning_rate": 3.68877063533986e-05, "loss": 0.187, "step": 2757 }, { "epoch": 4.384737678855326, "grad_norm": 3.4307143044270028, "learning_rate": 3.689323570369268e-05, "loss": 0.1714, "step": 2758 }, { "epoch": 4.386327503974563, "grad_norm": 3.31524555588055, "learning_rate": 3.6898766105513986e-05, "loss": 0.2197, "step": 2759 }, { "epoch": 4.3879173290938, "grad_norm": 3.4165375880700664, "learning_rate": 3.690429755699067e-05, "loss": 0.1773, "step": 2760 }, { "epoch": 4.389507154213036, "grad_norm": 4.402656186439025, "learning_rate": 3.690983005625053e-05, "loss": 0.2251, "step": 2761 }, { "epoch": 4.391096979332273, "grad_norm": 3.791981740612013, "learning_rate": 3.691536360142102e-05, "loss": 0.2226, "step": 2762 }, { "epoch": 4.39268680445151, "grad_norm": 3.6231411394281934, "learning_rate": 3.6920898190629246e-05, "loss": 0.1894, "step": 2763 }, { "epoch": 4.394276629570747, "grad_norm": 4.993100673181136, "learning_rate": 3.6926433822001934e-05, "loss": 0.1881, "step": 2764 }, { "epoch": 4.395866454689984, "grad_norm": 5.380414237960948, "learning_rate": 3.6931970493665475e-05, "loss": 0.1548, "step": 2765 }, { "epoch": 4.397456279809221, "grad_norm": 3.990439257680693, "learning_rate": 3.693750820374592e-05, "loss": 0.1522, "step": 2766 }, { "epoch": 4.399046104928458, "grad_norm": 4.704991687147685, "learning_rate": 3.694304695036894e-05, "loss": 0.2506, "step": 2767 }, { "epoch": 4.400635930047695, "grad_norm": 5.55982771451882, "learning_rate": 3.6948586731659886e-05, "loss": 0.2184, "step": 2768 }, { "epoch": 4.402225755166931, "grad_norm": 6.272620307366908, "learning_rate": 3.695412754574372e-05, "loss": 0.2335, "step": 2769 }, { "epoch": 4.403815580286168, "grad_norm": 4.701942122458604, "learning_rate": 3.69596693907451e-05, "loss": 0.1914, "step": 2770 }, { "epoch": 4.405405405405405, "grad_norm": 1.384871393641088, "learning_rate": 3.6965212264788296e-05, "loss": 0.2565, "step": 2771 }, { "epoch": 4.406995230524642, "grad_norm": 8.70404537833915, "learning_rate": 3.697075616599725e-05, "loss": 0.1757, "step": 2772 }, { "epoch": 4.408585055643879, "grad_norm": 3.6852798624386507, "learning_rate": 3.6976301092495554e-05, "loss": 0.1431, "step": 2773 }, { "epoch": 4.410174880763116, "grad_norm": 6.505368143514953, "learning_rate": 3.6981847042406455e-05, "loss": 0.1617, "step": 2774 }, { "epoch": 4.411764705882353, "grad_norm": 10.031257241859679, "learning_rate": 3.698739401385284e-05, "loss": 0.2817, "step": 2775 }, { "epoch": 4.413354531001589, "grad_norm": 2.593628496426279, "learning_rate": 3.699294200495727e-05, "loss": 0.1241, "step": 2776 }, { "epoch": 4.414944356120826, "grad_norm": 5.087044479908797, "learning_rate": 3.699849101384195e-05, "loss": 0.2164, "step": 2777 }, { "epoch": 4.416534181240063, "grad_norm": 4.567404460760233, "learning_rate": 3.7004041038628726e-05, "loss": 0.1874, "step": 2778 }, { "epoch": 4.4181240063593, "grad_norm": 6.364426986989506, "learning_rate": 3.700959207743914e-05, "loss": 0.1939, "step": 2779 }, { "epoch": 4.419713831478537, "grad_norm": 3.3366653148307, "learning_rate": 3.701514412839434e-05, "loss": 0.1527, "step": 2780 }, { "epoch": 4.421303656597774, "grad_norm": 6.040785096110123, "learning_rate": 3.7020697189615184e-05, "loss": 0.1967, "step": 2781 }, { "epoch": 4.422893481717011, "grad_norm": 4.354372018678005, "learning_rate": 3.702625125922214e-05, "loss": 0.2341, "step": 2782 }, { "epoch": 4.424483306836248, "grad_norm": 5.301465836162514, "learning_rate": 3.703180633533537e-05, "loss": 0.2158, "step": 2783 }, { "epoch": 4.426073131955485, "grad_norm": 3.2896212507038634, "learning_rate": 3.703736241607468e-05, "loss": 0.1952, "step": 2784 }, { "epoch": 4.4276629570747215, "grad_norm": 4.629143110218315, "learning_rate": 3.704291949955953e-05, "loss": 0.2097, "step": 2785 }, { "epoch": 4.4292527821939585, "grad_norm": 4.197135425512117, "learning_rate": 3.704847758390907e-05, "loss": 0.1437, "step": 2786 }, { "epoch": 4.4308426073131955, "grad_norm": 2.828531380039823, "learning_rate": 3.705403666724205e-05, "loss": 0.1629, "step": 2787 }, { "epoch": 4.4324324324324325, "grad_norm": 6.023618988066048, "learning_rate": 3.705959674767696e-05, "loss": 0.2207, "step": 2788 }, { "epoch": 4.4340222575516695, "grad_norm": 3.6184250424935787, "learning_rate": 3.7065157823331896e-05, "loss": 0.2019, "step": 2789 }, { "epoch": 4.4356120826709065, "grad_norm": 3.6868921122778198, "learning_rate": 3.707071989232464e-05, "loss": 0.162, "step": 2790 }, { "epoch": 4.4372019077901435, "grad_norm": 4.39189914767115, "learning_rate": 3.707628295277263e-05, "loss": 0.197, "step": 2791 }, { "epoch": 4.43879173290938, "grad_norm": 4.538233626622601, "learning_rate": 3.708184700279298e-05, "loss": 0.1944, "step": 2792 }, { "epoch": 4.440381558028617, "grad_norm": 3.956791389701744, "learning_rate": 3.708741204050245e-05, "loss": 0.1836, "step": 2793 }, { "epoch": 4.441971383147854, "grad_norm": 5.844487852894146, "learning_rate": 3.7092978064017475e-05, "loss": 0.3074, "step": 2794 }, { "epoch": 4.443561208267091, "grad_norm": 5.362990828820045, "learning_rate": 3.709854507145417e-05, "loss": 0.1716, "step": 2795 }, { "epoch": 4.4451510333863276, "grad_norm": 4.022579602980249, "learning_rate": 3.710411306092829e-05, "loss": 0.1305, "step": 2796 }, { "epoch": 4.4467408585055646, "grad_norm": 5.683804537535434, "learning_rate": 3.710968203055528e-05, "loss": 0.1918, "step": 2797 }, { "epoch": 4.4483306836248016, "grad_norm": 4.883417975304302, "learning_rate": 3.711525197845026e-05, "loss": 0.1449, "step": 2798 }, { "epoch": 4.4499205087440385, "grad_norm": 3.309917234397699, "learning_rate": 3.712082290272797e-05, "loss": 0.1763, "step": 2799 }, { "epoch": 4.451510333863275, "grad_norm": 5.185115182954079, "learning_rate": 3.712639480150288e-05, "loss": 0.1708, "step": 2800 }, { "epoch": 4.453100158982512, "grad_norm": 4.12508687447716, "learning_rate": 3.71319676728891e-05, "loss": 0.2181, "step": 2801 }, { "epoch": 4.454689984101749, "grad_norm": 34.9179672194163, "learning_rate": 3.713754151500041e-05, "loss": 2.5219, "step": 2802 }, { "epoch": 4.456279809220986, "grad_norm": 6.049362525678668, "learning_rate": 3.714311632595027e-05, "loss": 0.1907, "step": 2803 }, { "epoch": 4.457869634340223, "grad_norm": 2.9145074223688723, "learning_rate": 3.71486921038518e-05, "loss": 0.1805, "step": 2804 }, { "epoch": 4.45945945945946, "grad_norm": 3.8931006091671225, "learning_rate": 3.715426884681781e-05, "loss": 0.2147, "step": 2805 }, { "epoch": 4.461049284578697, "grad_norm": 6.611337180132665, "learning_rate": 3.7159846552960776e-05, "loss": 0.3192, "step": 2806 }, { "epoch": 4.462639109697934, "grad_norm": 3.629436247169493, "learning_rate": 3.716542522039284e-05, "loss": 0.2102, "step": 2807 }, { "epoch": 4.46422893481717, "grad_norm": 2.258745493852795, "learning_rate": 3.7171004847225825e-05, "loss": 0.1742, "step": 2808 }, { "epoch": 4.465818759936407, "grad_norm": 6.062549778453651, "learning_rate": 3.717658543157124e-05, "loss": 0.1641, "step": 2809 }, { "epoch": 4.467408585055644, "grad_norm": 4.219589608455295, "learning_rate": 3.718216697154024e-05, "loss": 0.2141, "step": 2810 }, { "epoch": 4.468998410174881, "grad_norm": 7.534390965453846, "learning_rate": 3.718774946524369e-05, "loss": 0.2214, "step": 2811 }, { "epoch": 4.470588235294118, "grad_norm": 2.829483610733524, "learning_rate": 3.719333291079212e-05, "loss": 0.1485, "step": 2812 }, { "epoch": 4.472178060413355, "grad_norm": 3.698981875189589, "learning_rate": 3.719891730629573e-05, "loss": 0.1725, "step": 2813 }, { "epoch": 4.473767885532592, "grad_norm": 4.505116913402184, "learning_rate": 3.7204502649864404e-05, "loss": 0.1973, "step": 2814 }, { "epoch": 4.475357710651828, "grad_norm": 3.265320633529519, "learning_rate": 3.721008893960771e-05, "loss": 0.2059, "step": 2815 }, { "epoch": 4.476947535771065, "grad_norm": 4.5241444493374585, "learning_rate": 3.721567617363488e-05, "loss": 0.2121, "step": 2816 }, { "epoch": 4.478537360890302, "grad_norm": 3.6111145012584074, "learning_rate": 3.7221264350054855e-05, "loss": 0.1342, "step": 2817 }, { "epoch": 4.480127186009539, "grad_norm": 3.4718237823325335, "learning_rate": 3.722685346697622e-05, "loss": 0.1569, "step": 2818 }, { "epoch": 4.481717011128776, "grad_norm": 27.295358827841486, "learning_rate": 3.723244352250729e-05, "loss": 35.825, "step": 2819 }, { "epoch": 4.483306836248013, "grad_norm": 9.07619442854584, "learning_rate": 3.723803451475599e-05, "loss": 0.2175, "step": 2820 }, { "epoch": 4.48489666136725, "grad_norm": 5.768878262228427, "learning_rate": 3.724362644183001e-05, "loss": 0.2082, "step": 2821 }, { "epoch": 4.486486486486487, "grad_norm": 4.031848957655296, "learning_rate": 3.7249219301836675e-05, "loss": 0.1783, "step": 2822 }, { "epoch": 4.488076311605723, "grad_norm": 7.9223708612477015, "learning_rate": 3.7254813092882994e-05, "loss": 0.215, "step": 2823 }, { "epoch": 4.48966613672496, "grad_norm": 3.6757952282275834, "learning_rate": 3.726040781307567e-05, "loss": 0.1762, "step": 2824 }, { "epoch": 4.491255961844197, "grad_norm": 3.0646252721992266, "learning_rate": 3.726600346052112e-05, "loss": 0.1736, "step": 2825 }, { "epoch": 4.492845786963434, "grad_norm": 3.997769383392965, "learning_rate": 3.727160003332539e-05, "loss": 0.1778, "step": 2826 }, { "epoch": 4.494435612082671, "grad_norm": 6.709997982591083, "learning_rate": 3.727719752959426e-05, "loss": 0.2102, "step": 2827 }, { "epoch": 4.496025437201908, "grad_norm": 6.75511785140146, "learning_rate": 3.7282795947433166e-05, "loss": 0.3789, "step": 2828 }, { "epoch": 4.497615262321145, "grad_norm": 11.963881764755069, "learning_rate": 3.7288395284947264e-05, "loss": 0.22, "step": 2829 }, { "epoch": 4.499205087440382, "grad_norm": 6.021469160749505, "learning_rate": 3.7293995540241366e-05, "loss": 0.1727, "step": 2830 }, { "epoch": 4.500794912559618, "grad_norm": 5.126158245069067, "learning_rate": 3.729959671141999e-05, "loss": 0.1794, "step": 2831 }, { "epoch": 4.502384737678855, "grad_norm": 9.785680090767471, "learning_rate": 3.7305198796587356e-05, "loss": 0.2051, "step": 2832 }, { "epoch": 4.503974562798092, "grad_norm": 4.6361494902371465, "learning_rate": 3.731080179384735e-05, "loss": 0.1835, "step": 2833 }, { "epoch": 4.505564387917329, "grad_norm": 3.2293456119664095, "learning_rate": 3.731640570130355e-05, "loss": 0.2385, "step": 2834 }, { "epoch": 4.507154213036566, "grad_norm": 13.642501089659985, "learning_rate": 3.7322010517059255e-05, "loss": 0.3587, "step": 2835 }, { "epoch": 4.508744038155803, "grad_norm": 30.60348472399762, "learning_rate": 3.7327616239217434e-05, "loss": 11.3504, "step": 2836 }, { "epoch": 4.51033386327504, "grad_norm": 3.618786746035887, "learning_rate": 3.733322286588075e-05, "loss": 0.3155, "step": 2837 }, { "epoch": 4.511923688394276, "grad_norm": 5.416537160564723, "learning_rate": 3.7338830395151554e-05, "loss": 0.1898, "step": 2838 }, { "epoch": 4.513513513513513, "grad_norm": 2.5996520449174816, "learning_rate": 3.734443882513192e-05, "loss": 0.1011, "step": 2839 }, { "epoch": 4.51510333863275, "grad_norm": 4.71495042240284, "learning_rate": 3.735004815392357e-05, "loss": 0.2611, "step": 2840 }, { "epoch": 4.516693163751987, "grad_norm": 66.40559706763618, "learning_rate": 3.735565837962798e-05, "loss": 20.057, "step": 2841 }, { "epoch": 4.518282988871224, "grad_norm": 3.152306397931738, "learning_rate": 3.736126950034628e-05, "loss": 0.2576, "step": 2842 }, { "epoch": 4.519872813990461, "grad_norm": 4.989368054441349, "learning_rate": 3.736688151417929e-05, "loss": 0.2456, "step": 2843 }, { "epoch": 4.521462639109698, "grad_norm": 8.500775974951233, "learning_rate": 3.737249441922757e-05, "loss": 0.1583, "step": 2844 }, { "epoch": 4.523052464228935, "grad_norm": 4.401315475819665, "learning_rate": 3.7378108213591354e-05, "loss": 0.1797, "step": 2845 }, { "epoch": 4.524642289348172, "grad_norm": 5.871625193044864, "learning_rate": 3.738372289537057e-05, "loss": 0.4668, "step": 2846 }, { "epoch": 4.526232114467408, "grad_norm": 4.675678678415982, "learning_rate": 3.738933846266484e-05, "loss": 0.236, "step": 2847 }, { "epoch": 4.527821939586645, "grad_norm": 3.364556648408934, "learning_rate": 3.739495491357352e-05, "loss": 0.2292, "step": 2848 }, { "epoch": 4.529411764705882, "grad_norm": 3.5825149594659447, "learning_rate": 3.740057224619563e-05, "loss": 0.2048, "step": 2849 }, { "epoch": 4.531001589825119, "grad_norm": 4.709106247739551, "learning_rate": 3.7406190458629906e-05, "loss": 0.1675, "step": 2850 }, { "epoch": 4.532591414944356, "grad_norm": 4.826751422511153, "learning_rate": 3.741180954897479e-05, "loss": 0.1672, "step": 2851 }, { "epoch": 4.534181240063593, "grad_norm": 2.8660126792764844, "learning_rate": 3.741742951532843e-05, "loss": 0.1557, "step": 2852 }, { "epoch": 4.53577106518283, "grad_norm": 3.967471914387211, "learning_rate": 3.742305035578866e-05, "loss": 0.1639, "step": 2853 }, { "epoch": 4.537360890302066, "grad_norm": 5.303662444613524, "learning_rate": 3.7428672068453035e-05, "loss": 0.1763, "step": 2854 }, { "epoch": 4.538950715421303, "grad_norm": 4.027149475718436, "learning_rate": 3.743429465141881e-05, "loss": 0.1796, "step": 2855 }, { "epoch": 4.54054054054054, "grad_norm": 2.3258477806024564, "learning_rate": 3.7439918102782945e-05, "loss": 0.1723, "step": 2856 }, { "epoch": 4.542130365659777, "grad_norm": 4.73420352541784, "learning_rate": 3.74455424206421e-05, "loss": 0.1749, "step": 2857 }, { "epoch": 4.543720190779014, "grad_norm": 5.815176253798402, "learning_rate": 3.7451167603092644e-05, "loss": 0.217, "step": 2858 }, { "epoch": 4.545310015898251, "grad_norm": 4.855606317556176, "learning_rate": 3.745679364823066e-05, "loss": 0.1705, "step": 2859 }, { "epoch": 4.546899841017488, "grad_norm": 4.394428255071976, "learning_rate": 3.746242055415195e-05, "loss": 0.137, "step": 2860 }, { "epoch": 4.548489666136725, "grad_norm": 3.051067466298107, "learning_rate": 3.746804831895198e-05, "loss": 0.1688, "step": 2861 }, { "epoch": 4.550079491255962, "grad_norm": 6.478224658482319, "learning_rate": 3.747367694072599e-05, "loss": 0.2346, "step": 2862 }, { "epoch": 4.5516693163751984, "grad_norm": 2.847955289662053, "learning_rate": 3.747930641756886e-05, "loss": 0.1798, "step": 2863 }, { "epoch": 4.5532591414944354, "grad_norm": 5.705028785605883, "learning_rate": 3.748493674757525e-05, "loss": 0.1844, "step": 2864 }, { "epoch": 4.5548489666136724, "grad_norm": 6.481835535104653, "learning_rate": 3.749056792883948e-05, "loss": 0.1902, "step": 2865 }, { "epoch": 4.556438791732909, "grad_norm": 2.3467054867298853, "learning_rate": 3.749619995945559e-05, "loss": 0.1369, "step": 2866 }, { "epoch": 4.558028616852146, "grad_norm": 5.061642652221836, "learning_rate": 3.750183283751736e-05, "loss": 0.2625, "step": 2867 }, { "epoch": 4.559618441971383, "grad_norm": 9.072954499200883, "learning_rate": 3.750746656111825e-05, "loss": 1.0663, "step": 2868 }, { "epoch": 4.56120826709062, "grad_norm": 2.986979905087573, "learning_rate": 3.751310112835145e-05, "loss": 0.1639, "step": 2869 }, { "epoch": 4.5627980922098565, "grad_norm": 5.439717361733004, "learning_rate": 3.751873653730988e-05, "loss": 0.1607, "step": 2870 }, { "epoch": 4.5643879173290935, "grad_norm": 5.297552435329612, "learning_rate": 3.752437278608615e-05, "loss": 0.1647, "step": 2871 }, { "epoch": 4.5659777424483305, "grad_norm": 3.0939035910055406, "learning_rate": 3.753000987277257e-05, "loss": 0.1621, "step": 2872 }, { "epoch": 4.5675675675675675, "grad_norm": 4.1145267409351, "learning_rate": 3.7535647795461226e-05, "loss": 0.1696, "step": 2873 }, { "epoch": 4.5691573926868045, "grad_norm": 5.511622115063244, "learning_rate": 3.7541286552243866e-05, "loss": 0.1514, "step": 2874 }, { "epoch": 4.5707472178060415, "grad_norm": 4.201066705365509, "learning_rate": 3.7546926141211975e-05, "loss": 0.1881, "step": 2875 }, { "epoch": 4.5723370429252785, "grad_norm": 2.7474518348050236, "learning_rate": 3.755256656045676e-05, "loss": 0.2011, "step": 2876 }, { "epoch": 4.573926868044515, "grad_norm": 4.672119909022862, "learning_rate": 3.755820780806915e-05, "loss": 0.1287, "step": 2877 }, { "epoch": 4.575516693163752, "grad_norm": 6.427234917577963, "learning_rate": 3.756384988213978e-05, "loss": 0.2484, "step": 2878 }, { "epoch": 4.577106518282989, "grad_norm": 2.6518638705510775, "learning_rate": 3.756949278075901e-05, "loss": 0.1753, "step": 2879 }, { "epoch": 4.578696343402226, "grad_norm": 4.102056281935237, "learning_rate": 3.757513650201692e-05, "loss": 0.1431, "step": 2880 }, { "epoch": 4.580286168521463, "grad_norm": 4.50911895348498, "learning_rate": 3.758078104400333e-05, "loss": 0.2021, "step": 2881 }, { "epoch": 4.5818759936407, "grad_norm": 8.520109212665893, "learning_rate": 3.7586426404807746e-05, "loss": 0.3287, "step": 2882 }, { "epoch": 4.583465818759937, "grad_norm": 2.621910342230516, "learning_rate": 3.759207258251944e-05, "loss": 0.2072, "step": 2883 }, { "epoch": 4.585055643879174, "grad_norm": 5.268388231332253, "learning_rate": 3.759771957522736e-05, "loss": 0.1596, "step": 2884 }, { "epoch": 4.586645468998411, "grad_norm": 2.3716181188864454, "learning_rate": 3.760336738102023e-05, "loss": 0.1807, "step": 2885 }, { "epoch": 4.588235294117647, "grad_norm": 3.1786401685425427, "learning_rate": 3.7609015997986456e-05, "loss": 0.2016, "step": 2886 }, { "epoch": 4.589825119236884, "grad_norm": 1.7059595072140057, "learning_rate": 3.7614665424214193e-05, "loss": 0.1337, "step": 2887 }, { "epoch": 4.591414944356121, "grad_norm": 59.26652420688663, "learning_rate": 3.76203156577913e-05, "loss": 16.4848, "step": 2888 }, { "epoch": 4.593004769475358, "grad_norm": 3.3265049300937215, "learning_rate": 3.76259666968054e-05, "loss": 0.163, "step": 2889 }, { "epoch": 4.594594594594595, "grad_norm": 2.11070844646554, "learning_rate": 3.7631618539343814e-05, "loss": 0.8586, "step": 2890 }, { "epoch": 4.596184419713832, "grad_norm": 2.6287192942695747, "learning_rate": 3.763727118349359e-05, "loss": 0.1547, "step": 2891 }, { "epoch": 4.597774244833069, "grad_norm": 3.924075880786414, "learning_rate": 3.764292462734152e-05, "loss": 0.156, "step": 2892 }, { "epoch": 4.599364069952305, "grad_norm": 2.733295188724249, "learning_rate": 3.76485788689741e-05, "loss": 0.1621, "step": 2893 }, { "epoch": 4.600953895071542, "grad_norm": 3.081777402410622, "learning_rate": 3.76542339064776e-05, "loss": 0.2026, "step": 2894 }, { "epoch": 4.602543720190779, "grad_norm": 4.99250817482992, "learning_rate": 3.765988973793798e-05, "loss": 0.1557, "step": 2895 }, { "epoch": 4.604133545310016, "grad_norm": 2.2807979132046046, "learning_rate": 3.7665546361440945e-05, "loss": 0.1269, "step": 2896 }, { "epoch": 4.605723370429253, "grad_norm": 8.405147682434572, "learning_rate": 3.767120377507194e-05, "loss": 0.3099, "step": 2897 }, { "epoch": 4.60731319554849, "grad_norm": 3.0530003363190525, "learning_rate": 3.767686197691613e-05, "loss": 0.1992, "step": 2898 }, { "epoch": 4.608903020667727, "grad_norm": 2.172573133556041, "learning_rate": 3.7682520965058435e-05, "loss": 0.1318, "step": 2899 }, { "epoch": 4.610492845786963, "grad_norm": 2.6943062677208225, "learning_rate": 3.768818073758346e-05, "loss": 0.1428, "step": 2900 }, { "epoch": 4.6120826709062, "grad_norm": 2.9187779138871144, "learning_rate": 3.76938412925756e-05, "loss": 0.1658, "step": 2901 }, { "epoch": 4.613672496025437, "grad_norm": 2.826619002470318, "learning_rate": 3.7699502628118955e-05, "loss": 0.1699, "step": 2902 }, { "epoch": 4.615262321144674, "grad_norm": 3.3533843818535285, "learning_rate": 3.770516474229738e-05, "loss": 0.1767, "step": 2903 }, { "epoch": 4.616852146263911, "grad_norm": 2.4722416502378812, "learning_rate": 3.771082763319443e-05, "loss": 0.1409, "step": 2904 }, { "epoch": 4.618441971383148, "grad_norm": 2.0670050910687046, "learning_rate": 3.7716491298893444e-05, "loss": 0.1596, "step": 2905 }, { "epoch": 4.620031796502385, "grad_norm": 3.016923117600807, "learning_rate": 3.772215573747746e-05, "loss": 0.1449, "step": 2906 }, { "epoch": 4.621621621621622, "grad_norm": 1.6640428099149303, "learning_rate": 3.772782094702929e-05, "loss": 0.163, "step": 2907 }, { "epoch": 4.623211446740859, "grad_norm": 2.8608169340897245, "learning_rate": 3.7733486925631454e-05, "loss": 0.1495, "step": 2908 }, { "epoch": 4.624801271860095, "grad_norm": 3.0543808279729068, "learning_rate": 3.773915367136621e-05, "loss": 0.1467, "step": 2909 }, { "epoch": 4.626391096979332, "grad_norm": 2.622726718147098, "learning_rate": 3.77448211823156e-05, "loss": 0.1707, "step": 2910 }, { "epoch": 4.627980922098569, "grad_norm": 5.576265127092346, "learning_rate": 3.775048945656135e-05, "loss": 0.173, "step": 2911 }, { "epoch": 4.629570747217806, "grad_norm": 2.7600958364610366, "learning_rate": 3.775615849218497e-05, "loss": 0.1624, "step": 2912 }, { "epoch": 4.631160572337043, "grad_norm": 5.445072715945005, "learning_rate": 3.7761828287267685e-05, "loss": 0.1925, "step": 2913 }, { "epoch": 4.63275039745628, "grad_norm": 41.24983257642017, "learning_rate": 3.776749883989049e-05, "loss": 11.0079, "step": 2914 }, { "epoch": 4.634340222575517, "grad_norm": 3.100296713856521, "learning_rate": 3.777317014813409e-05, "loss": 0.1441, "step": 2915 }, { "epoch": 4.635930047694753, "grad_norm": 3.574802053971851, "learning_rate": 3.777884221007897e-05, "loss": 0.1924, "step": 2916 }, { "epoch": 4.63751987281399, "grad_norm": 4.391112410158943, "learning_rate": 3.7784515023805324e-05, "loss": 0.1516, "step": 2917 }, { "epoch": 4.639109697933227, "grad_norm": 3.779933020332043, "learning_rate": 3.7790188587393134e-05, "loss": 0.23, "step": 2918 }, { "epoch": 4.640699523052464, "grad_norm": 2.641751129125442, "learning_rate": 3.7795862898922075e-05, "loss": 0.1355, "step": 2919 }, { "epoch": 4.642289348171701, "grad_norm": 6.048211643722015, "learning_rate": 3.7801537956471625e-05, "loss": 0.1961, "step": 2920 }, { "epoch": 4.643879173290938, "grad_norm": 2.9238567038790135, "learning_rate": 3.780721375812097e-05, "loss": 0.131, "step": 2921 }, { "epoch": 4.645468998410175, "grad_norm": 2.8797445111279165, "learning_rate": 3.781289030194905e-05, "loss": 0.1714, "step": 2922 }, { "epoch": 4.647058823529412, "grad_norm": 4.238923219208015, "learning_rate": 3.781856758603458e-05, "loss": 0.2889, "step": 2923 }, { "epoch": 4.648648648648649, "grad_norm": 27.17517441629114, "learning_rate": 3.782424560845598e-05, "loss": 4.5325, "step": 2924 }, { "epoch": 4.650238473767885, "grad_norm": 3.6137526219204528, "learning_rate": 3.782992436729147e-05, "loss": 0.1658, "step": 2925 }, { "epoch": 4.651828298887122, "grad_norm": 4.800934225703837, "learning_rate": 3.783560386061897e-05, "loss": 0.1376, "step": 2926 }, { "epoch": 4.653418124006359, "grad_norm": 4.0535697033233, "learning_rate": 3.78412840865162e-05, "loss": 0.2865, "step": 2927 }, { "epoch": 4.655007949125596, "grad_norm": 4.682326791935431, "learning_rate": 3.7846965043060597e-05, "loss": 0.1728, "step": 2928 }, { "epoch": 4.656597774244833, "grad_norm": 4.371263340441853, "learning_rate": 3.7852646728329374e-05, "loss": 0.2363, "step": 2929 }, { "epoch": 4.65818759936407, "grad_norm": 4.9388028511102355, "learning_rate": 3.785832914039947e-05, "loss": 0.1924, "step": 2930 }, { "epoch": 4.659777424483307, "grad_norm": 3.0128601695484742, "learning_rate": 3.78640122773476e-05, "loss": 0.1643, "step": 2931 }, { "epoch": 4.661367249602543, "grad_norm": 5.771423900752951, "learning_rate": 3.786969613725024e-05, "loss": 1.023, "step": 2932 }, { "epoch": 4.66295707472178, "grad_norm": 5.48536869888573, "learning_rate": 3.7875380718183595e-05, "loss": 0.2292, "step": 2933 }, { "epoch": 4.664546899841017, "grad_norm": 6.098775172864054, "learning_rate": 3.788106601822364e-05, "loss": 0.2111, "step": 2934 }, { "epoch": 4.666136724960254, "grad_norm": 6.0951413559209255, "learning_rate": 3.788675203544611e-05, "loss": 0.1543, "step": 2935 }, { "epoch": 4.667726550079491, "grad_norm": 3.744145421572132, "learning_rate": 3.789243876792651e-05, "loss": 0.1609, "step": 2936 }, { "epoch": 4.669316375198728, "grad_norm": 8.05013352203148, "learning_rate": 3.7898126213740064e-05, "loss": 0.1755, "step": 2937 }, { "epoch": 4.670906200317965, "grad_norm": 8.15763257841782, "learning_rate": 3.7903814370961785e-05, "loss": 0.1515, "step": 2938 }, { "epoch": 4.672496025437201, "grad_norm": 4.545912947789513, "learning_rate": 3.7909503237666435e-05, "loss": 0.1434, "step": 2939 }, { "epoch": 4.674085850556438, "grad_norm": 4.598716644531128, "learning_rate": 3.791519281192855e-05, "loss": 0.2647, "step": 2940 }, { "epoch": 4.675675675675675, "grad_norm": 9.081038264677316, "learning_rate": 3.792088309182241e-05, "loss": 0.2429, "step": 2941 }, { "epoch": 4.677265500794912, "grad_norm": 7.360321809767228, "learning_rate": 3.7926574075422056e-05, "loss": 0.1485, "step": 2942 }, { "epoch": 4.678855325914149, "grad_norm": 5.47606383062219, "learning_rate": 3.7932265760801294e-05, "loss": 0.1815, "step": 2943 }, { "epoch": 4.680445151033386, "grad_norm": 6.55417122302052, "learning_rate": 3.7937958146033705e-05, "loss": 0.1385, "step": 2944 }, { "epoch": 4.682034976152623, "grad_norm": 5.404671642146511, "learning_rate": 3.7943651229192614e-05, "loss": 0.215, "step": 2945 }, { "epoch": 4.68362480127186, "grad_norm": 5.834554209116783, "learning_rate": 3.7949345008351124e-05, "loss": 0.1667, "step": 2946 }, { "epoch": 4.685214626391097, "grad_norm": 4.115579261924594, "learning_rate": 3.79550394815821e-05, "loss": 0.1263, "step": 2947 }, { "epoch": 4.6868044515103335, "grad_norm": 3.5753700051751562, "learning_rate": 3.796073464695816e-05, "loss": 0.1486, "step": 2948 }, { "epoch": 4.6883942766295705, "grad_norm": 4.554149204213057, "learning_rate": 3.79664305025517e-05, "loss": 0.1439, "step": 2949 }, { "epoch": 4.6899841017488075, "grad_norm": 3.773053705629598, "learning_rate": 3.7972127046434884e-05, "loss": 0.178, "step": 2950 }, { "epoch": 4.6915739268680445, "grad_norm": 3.6737401041770643, "learning_rate": 3.797782427667962e-05, "loss": 0.1618, "step": 2951 }, { "epoch": 4.6931637519872815, "grad_norm": 5.608265374853665, "learning_rate": 3.798352219135763e-05, "loss": 0.1478, "step": 2952 }, { "epoch": 4.6947535771065185, "grad_norm": 7.989345613173626, "learning_rate": 3.798922078854035e-05, "loss": 0.4584, "step": 2953 }, { "epoch": 4.6963434022257555, "grad_norm": 3.351765565071844, "learning_rate": 3.7994920066299036e-05, "loss": 0.1753, "step": 2954 }, { "epoch": 4.697933227344992, "grad_norm": 5.728752981867214, "learning_rate": 3.800062002270467e-05, "loss": 0.2282, "step": 2955 }, { "epoch": 4.699523052464229, "grad_norm": 5.2001883236729345, "learning_rate": 3.800632065582803e-05, "loss": 0.1726, "step": 2956 }, { "epoch": 4.701112877583466, "grad_norm": 3.663551163561718, "learning_rate": 3.801202196373966e-05, "loss": 0.1584, "step": 2957 }, { "epoch": 4.702702702702703, "grad_norm": 2.479764929918765, "learning_rate": 3.801772394450986e-05, "loss": 0.1554, "step": 2958 }, { "epoch": 4.70429252782194, "grad_norm": 6.328522080071939, "learning_rate": 3.802342659620874e-05, "loss": 0.1424, "step": 2959 }, { "epoch": 4.705882352941177, "grad_norm": 4.903041959057388, "learning_rate": 3.802912991690614e-05, "loss": 0.1771, "step": 2960 }, { "epoch": 4.707472178060414, "grad_norm": 4.0683432747591315, "learning_rate": 3.80348339046717e-05, "loss": 0.1667, "step": 2961 }, { "epoch": 4.709062003179651, "grad_norm": 2.5599177866461242, "learning_rate": 3.8040538557574826e-05, "loss": 0.1643, "step": 2962 }, { "epoch": 4.710651828298887, "grad_norm": 7.731778265898322, "learning_rate": 3.8046243873684696e-05, "loss": 0.1248, "step": 2963 }, { "epoch": 4.712241653418124, "grad_norm": 3.6230250918108515, "learning_rate": 3.8051949851070274e-05, "loss": 0.1382, "step": 2964 }, { "epoch": 4.713831478537361, "grad_norm": 4.210084136828786, "learning_rate": 3.8057656487800284e-05, "loss": 0.1531, "step": 2965 }, { "epoch": 4.715421303656598, "grad_norm": 3.712019543233206, "learning_rate": 3.806336378194324e-05, "loss": 0.1677, "step": 2966 }, { "epoch": 4.717011128775835, "grad_norm": 3.1880198978075134, "learning_rate": 3.8069071731567434e-05, "loss": 0.1601, "step": 2967 }, { "epoch": 4.718600953895072, "grad_norm": 4.291405206248979, "learning_rate": 3.807478033474093e-05, "loss": 0.196, "step": 2968 }, { "epoch": 4.720190779014309, "grad_norm": 2.244007352051777, "learning_rate": 3.808048958953157e-05, "loss": 0.1587, "step": 2969 }, { "epoch": 4.721780604133546, "grad_norm": 4.877429121242834, "learning_rate": 3.808619949400697e-05, "loss": 0.2168, "step": 2970 }, { "epoch": 4.723370429252782, "grad_norm": 5.472412880007288, "learning_rate": 3.8091910046234556e-05, "loss": 0.1441, "step": 2971 }, { "epoch": 4.724960254372019, "grad_norm": 4.6464547110154895, "learning_rate": 3.809762124428149e-05, "loss": 0.1817, "step": 2972 }, { "epoch": 4.726550079491256, "grad_norm": 5.574834948434259, "learning_rate": 3.810333308621475e-05, "loss": 0.2534, "step": 2973 }, { "epoch": 4.728139904610493, "grad_norm": 4.698425855483335, "learning_rate": 3.810904557010109e-05, "loss": 0.2308, "step": 2974 }, { "epoch": 4.72972972972973, "grad_norm": 4.018454706624181, "learning_rate": 3.811475869400703e-05, "loss": 0.2559, "step": 2975 }, { "epoch": 4.731319554848967, "grad_norm": 4.725735347081218, "learning_rate": 3.8120472455998885e-05, "loss": 0.1606, "step": 2976 }, { "epoch": 4.732909379968204, "grad_norm": 2.6501809646362466, "learning_rate": 3.8126186854142755e-05, "loss": 0.1541, "step": 2977 }, { "epoch": 4.73449920508744, "grad_norm": 2.9955998923272373, "learning_rate": 3.813190188650452e-05, "loss": 0.1715, "step": 2978 }, { "epoch": 4.736089030206677, "grad_norm": 4.183801742787861, "learning_rate": 3.813761755114987e-05, "loss": 0.1852, "step": 2979 }, { "epoch": 4.737678855325914, "grad_norm": 2.65072119049328, "learning_rate": 3.814333384614423e-05, "loss": 0.1775, "step": 2980 }, { "epoch": 4.739268680445151, "grad_norm": 1.735938869945464, "learning_rate": 3.814905076955286e-05, "loss": 0.2176, "step": 2981 }, { "epoch": 4.740858505564388, "grad_norm": 3.3746454204775973, "learning_rate": 3.815476831944077e-05, "loss": 0.1666, "step": 2982 }, { "epoch": 4.742448330683625, "grad_norm": 3.2335479627086863, "learning_rate": 3.81604864938728e-05, "loss": 0.1894, "step": 2983 }, { "epoch": 4.744038155802862, "grad_norm": 2.1535029919800794, "learning_rate": 3.816620529091354e-05, "loss": 0.1272, "step": 2984 }, { "epoch": 4.745627980922099, "grad_norm": 1.6824078861062293, "learning_rate": 3.817192470862739e-05, "loss": 0.1408, "step": 2985 }, { "epoch": 4.747217806041336, "grad_norm": 4.2899638019314095, "learning_rate": 3.8177644745078524e-05, "loss": 0.177, "step": 2986 }, { "epoch": 4.748807631160572, "grad_norm": 3.92971756672654, "learning_rate": 3.8183365398330933e-05, "loss": 0.204, "step": 2987 }, { "epoch": 4.750397456279809, "grad_norm": 2.9381704887726063, "learning_rate": 3.8189086666448374e-05, "loss": 0.1961, "step": 2988 }, { "epoch": 4.751987281399046, "grad_norm": 4.826389452928204, "learning_rate": 3.81948085474944e-05, "loss": 0.4684, "step": 2989 }, { "epoch": 4.753577106518283, "grad_norm": 3.1035230630691846, "learning_rate": 3.820053103953237e-05, "loss": 0.1969, "step": 2990 }, { "epoch": 4.75516693163752, "grad_norm": 3.276407800940697, "learning_rate": 3.820625414062543e-05, "loss": 0.242, "step": 2991 }, { "epoch": 4.756756756756757, "grad_norm": 3.7809944988819013, "learning_rate": 3.821197784883651e-05, "loss": 0.2356, "step": 2992 }, { "epoch": 4.758346581875994, "grad_norm": 4.5483131732096185, "learning_rate": 3.8217702162228335e-05, "loss": 0.2662, "step": 2993 }, { "epoch": 4.75993640699523, "grad_norm": 2.868839161665655, "learning_rate": 3.822342707886345e-05, "loss": 0.1543, "step": 2994 }, { "epoch": 4.761526232114467, "grad_norm": 2.7140135035763313, "learning_rate": 3.8229152596804167e-05, "loss": 0.2482, "step": 2995 }, { "epoch": 4.763116057233704, "grad_norm": 3.612947774766078, "learning_rate": 3.823487871411261e-05, "loss": 0.2006, "step": 2996 }, { "epoch": 4.764705882352941, "grad_norm": 2.517607104243198, "learning_rate": 3.8240605428850696e-05, "loss": 0.1862, "step": 2997 }, { "epoch": 4.766295707472178, "grad_norm": 1.9601154761340402, "learning_rate": 3.824633273908013e-05, "loss": 0.13, "step": 2998 }, { "epoch": 4.767885532591415, "grad_norm": 1.8342826909814802, "learning_rate": 3.8252060642862435e-05, "loss": 0.1335, "step": 2999 }, { "epoch": 4.769475357710652, "grad_norm": 2.7459191557330933, "learning_rate": 3.825778913825892e-05, "loss": 0.157, "step": 3000 }, { "epoch": 4.771065182829888, "grad_norm": 2.495855523347949, "learning_rate": 3.82635182233307e-05, "loss": 0.1464, "step": 3001 }, { "epoch": 4.772655007949125, "grad_norm": 2.686792704109436, "learning_rate": 3.826924789613868e-05, "loss": 0.1768, "step": 3002 }, { "epoch": 4.774244833068362, "grad_norm": 2.5362524970960316, "learning_rate": 3.827497815474358e-05, "loss": 0.1864, "step": 3003 }, { "epoch": 4.775834658187599, "grad_norm": 4.548583178132155, "learning_rate": 3.828070899720591e-05, "loss": 0.1931, "step": 3004 }, { "epoch": 4.777424483306836, "grad_norm": 4.182438926166377, "learning_rate": 3.828644042158598e-05, "loss": 0.1638, "step": 3005 }, { "epoch": 4.779014308426073, "grad_norm": 2.9211668882192607, "learning_rate": 3.829217242594393e-05, "loss": 0.1864, "step": 3006 }, { "epoch": 4.78060413354531, "grad_norm": 6.190585740945075, "learning_rate": 3.8297905008339675e-05, "loss": 0.216, "step": 3007 }, { "epoch": 4.782193958664547, "grad_norm": 3.1959291420240206, "learning_rate": 3.830363816683294e-05, "loss": 0.1372, "step": 3008 }, { "epoch": 4.783783783783784, "grad_norm": 3.5564398692603665, "learning_rate": 3.8309371899483264e-05, "loss": 0.2979, "step": 3009 }, { "epoch": 4.78537360890302, "grad_norm": 3.744152937116926, "learning_rate": 3.8315106204349976e-05, "loss": 0.175, "step": 3010 }, { "epoch": 4.786963434022257, "grad_norm": 2.40855839657775, "learning_rate": 3.832084107949223e-05, "loss": 0.1902, "step": 3011 }, { "epoch": 4.788553259141494, "grad_norm": 4.4067276422411945, "learning_rate": 3.8326576522968985e-05, "loss": 0.307, "step": 3012 }, { "epoch": 4.790143084260731, "grad_norm": 23.065858315303657, "learning_rate": 3.8332312532838976e-05, "loss": 5.4769, "step": 3013 }, { "epoch": 4.791732909379968, "grad_norm": 3.239619711022394, "learning_rate": 3.83380491071608e-05, "loss": 0.1714, "step": 3014 }, { "epoch": 4.793322734499205, "grad_norm": 5.120412285290346, "learning_rate": 3.834378624399282e-05, "loss": 0.2041, "step": 3015 }, { "epoch": 4.794912559618442, "grad_norm": 4.462799013875103, "learning_rate": 3.834952394139322e-05, "loss": 0.1908, "step": 3016 }, { "epoch": 4.796502384737678, "grad_norm": 3.750797849260666, "learning_rate": 3.835526219742001e-05, "loss": 0.1813, "step": 3017 }, { "epoch": 4.798092209856915, "grad_norm": 4.260022621199357, "learning_rate": 3.836100101013099e-05, "loss": 0.1859, "step": 3018 }, { "epoch": 4.799682034976152, "grad_norm": 4.6083764971251195, "learning_rate": 3.836674037758378e-05, "loss": 0.1775, "step": 3019 }, { "epoch": 4.801271860095389, "grad_norm": 1.884814548062428, "learning_rate": 3.837248029783581e-05, "loss": 0.1228, "step": 3020 }, { "epoch": 4.802861685214626, "grad_norm": 5.362004424208794, "learning_rate": 3.837822076894432e-05, "loss": 0.2078, "step": 3021 }, { "epoch": 4.804451510333863, "grad_norm": 3.1578508127971303, "learning_rate": 3.8383961788966396e-05, "loss": 0.2041, "step": 3022 }, { "epoch": 4.8060413354531, "grad_norm": 3.217119174595127, "learning_rate": 3.838970335595887e-05, "loss": 0.1548, "step": 3023 }, { "epoch": 4.807631160572337, "grad_norm": 3.907332624767132, "learning_rate": 3.839544546797845e-05, "loss": 0.1814, "step": 3024 }, { "epoch": 4.809220985691574, "grad_norm": 2.7174728962230845, "learning_rate": 3.8401188123081654e-05, "loss": 0.1604, "step": 3025 }, { "epoch": 4.8108108108108105, "grad_norm": 2.8664883378746167, "learning_rate": 3.840693131932477e-05, "loss": 0.1298, "step": 3026 }, { "epoch": 4.8124006359300475, "grad_norm": 2.877993003538523, "learning_rate": 3.8412675054763964e-05, "loss": 0.1999, "step": 3027 }, { "epoch": 4.8139904610492845, "grad_norm": 3.4419441334672145, "learning_rate": 3.841841932745517e-05, "loss": 0.1571, "step": 3028 }, { "epoch": 4.8155802861685215, "grad_norm": 4.6139097584681945, "learning_rate": 3.842416413545416e-05, "loss": 0.2102, "step": 3029 }, { "epoch": 4.8171701112877585, "grad_norm": 3.3559044315417834, "learning_rate": 3.842990947681653e-05, "loss": 0.1908, "step": 3030 }, { "epoch": 4.8187599364069955, "grad_norm": 5.3714477350554475, "learning_rate": 3.8435655349597696e-05, "loss": 0.1878, "step": 3031 }, { "epoch": 4.8203497615262325, "grad_norm": 3.7923853565264705, "learning_rate": 3.8441401751852875e-05, "loss": 0.1941, "step": 3032 }, { "epoch": 4.821939586645469, "grad_norm": 2.7935698674882072, "learning_rate": 3.8447148681637124e-05, "loss": 0.1627, "step": 3033 }, { "epoch": 4.823529411764706, "grad_norm": 3.376541887833986, "learning_rate": 3.845289613700532e-05, "loss": 0.1856, "step": 3034 }, { "epoch": 4.825119236883943, "grad_norm": 4.390988742496754, "learning_rate": 3.845864411601216e-05, "loss": 0.1289, "step": 3035 }, { "epoch": 4.82670906200318, "grad_norm": 3.8045898092249075, "learning_rate": 3.846439261671214e-05, "loss": 0.1819, "step": 3036 }, { "epoch": 4.828298887122417, "grad_norm": 2.8590396080800957, "learning_rate": 3.8470141637159625e-05, "loss": 0.1422, "step": 3037 }, { "epoch": 4.829888712241654, "grad_norm": 25.327219583187013, "learning_rate": 3.8475891175408764e-05, "loss": 7.7389, "step": 3038 }, { "epoch": 4.831478537360891, "grad_norm": 5.687063206963253, "learning_rate": 3.848164122951355e-05, "loss": 0.1876, "step": 3039 }, { "epoch": 4.833068362480127, "grad_norm": 4.339890585481548, "learning_rate": 3.8487391797527804e-05, "loss": 0.1577, "step": 3040 }, { "epoch": 4.834658187599364, "grad_norm": 1.402866518035886, "learning_rate": 3.8493142877505175e-05, "loss": 0.1785, "step": 3041 }, { "epoch": 4.836248012718601, "grad_norm": 57.02033533889198, "learning_rate": 3.849889446749911e-05, "loss": 13.9439, "step": 3042 }, { "epoch": 4.837837837837838, "grad_norm": 3.0936122114608797, "learning_rate": 3.8504646565562906e-05, "loss": 0.1369, "step": 3043 }, { "epoch": 4.839427662957075, "grad_norm": 2.945826960200759, "learning_rate": 3.8510399169749706e-05, "loss": 0.1823, "step": 3044 }, { "epoch": 4.841017488076312, "grad_norm": 2.511771939012823, "learning_rate": 3.851615227811244e-05, "loss": 0.1468, "step": 3045 }, { "epoch": 4.842607313195549, "grad_norm": 3.748724025241797, "learning_rate": 3.8521905888703893e-05, "loss": 0.1774, "step": 3046 }, { "epoch": 4.844197138314786, "grad_norm": 5.397717320548106, "learning_rate": 3.852765999957669e-05, "loss": 0.1985, "step": 3047 }, { "epoch": 4.845786963434023, "grad_norm": 4.060201377235402, "learning_rate": 3.8533414608783265e-05, "loss": 0.2052, "step": 3048 }, { "epoch": 4.847376788553259, "grad_norm": 4.528780043025934, "learning_rate": 3.8539169714375886e-05, "loss": 0.1826, "step": 3049 }, { "epoch": 4.848966613672496, "grad_norm": 3.4507830680495912, "learning_rate": 3.854492531440666e-05, "loss": 0.1449, "step": 3050 }, { "epoch": 4.850556438791733, "grad_norm": 3.4985331505427797, "learning_rate": 3.8550681406927535e-05, "loss": 0.1801, "step": 3051 }, { "epoch": 4.85214626391097, "grad_norm": 6.716313893832901, "learning_rate": 3.8556437989990266e-05, "loss": 0.1426, "step": 3052 }, { "epoch": 4.853736089030207, "grad_norm": 3.933207677646741, "learning_rate": 3.856219506164647e-05, "loss": 0.2232, "step": 3053 }, { "epoch": 4.855325914149444, "grad_norm": 8.372877894603345, "learning_rate": 3.856795261994759e-05, "loss": 0.4122, "step": 3054 }, { "epoch": 4.856915739268681, "grad_norm": 8.017840555513521, "learning_rate": 3.8573710662944885e-05, "loss": 0.2029, "step": 3055 }, { "epoch": 4.858505564387917, "grad_norm": 5.879004983279431, "learning_rate": 3.857946918868948e-05, "loss": 0.1584, "step": 3056 }, { "epoch": 4.860095389507154, "grad_norm": 5.366191530963387, "learning_rate": 3.8585228195232313e-05, "loss": 0.1512, "step": 3057 }, { "epoch": 4.861685214626391, "grad_norm": 5.7604413637058025, "learning_rate": 3.859098768062417e-05, "loss": 0.1569, "step": 3058 }, { "epoch": 4.863275039745628, "grad_norm": 6.526736811339729, "learning_rate": 3.8596747642915684e-05, "loss": 0.1842, "step": 3059 }, { "epoch": 4.864864864864865, "grad_norm": 5.68490123402592, "learning_rate": 3.860250808015731e-05, "loss": 0.2087, "step": 3060 }, { "epoch": 4.866454689984102, "grad_norm": 7.932916365313232, "learning_rate": 3.8608268990399345e-05, "loss": 0.1992, "step": 3061 }, { "epoch": 4.868044515103339, "grad_norm": 5.723877038081454, "learning_rate": 3.861403037169193e-05, "loss": 0.1621, "step": 3062 }, { "epoch": 4.869634340222575, "grad_norm": 7.5001573825791485, "learning_rate": 3.8619792222085056e-05, "loss": 0.1822, "step": 3063 }, { "epoch": 4.871224165341812, "grad_norm": 11.666769204610338, "learning_rate": 3.862555453962854e-05, "loss": 0.222, "step": 3064 }, { "epoch": 4.872813990461049, "grad_norm": 4.057896982215662, "learning_rate": 3.8631317322372036e-05, "loss": 0.0982, "step": 3065 }, { "epoch": 4.874403815580286, "grad_norm": 5.237446732327668, "learning_rate": 3.863708056836505e-05, "loss": 0.1509, "step": 3066 }, { "epoch": 4.875993640699523, "grad_norm": 8.848062985536522, "learning_rate": 3.8642844275656955e-05, "loss": 0.2099, "step": 3067 }, { "epoch": 4.87758346581876, "grad_norm": 5.59100556265263, "learning_rate": 3.8648608442296925e-05, "loss": 0.1747, "step": 3068 }, { "epoch": 4.879173290937997, "grad_norm": 4.335339671618627, "learning_rate": 3.8654373066334e-05, "loss": 0.2023, "step": 3069 }, { "epoch": 4.880763116057234, "grad_norm": 6.1660375755880885, "learning_rate": 3.866013814581708e-05, "loss": 0.1384, "step": 3070 }, { "epoch": 4.882352941176471, "grad_norm": 4.315836463738177, "learning_rate": 3.866590367879488e-05, "loss": 0.2015, "step": 3071 }, { "epoch": 4.883942766295707, "grad_norm": 51.837744991612645, "learning_rate": 3.8671669663315966e-05, "loss": 9.2805, "step": 3072 }, { "epoch": 4.885532591414944, "grad_norm": 6.860661464763248, "learning_rate": 3.867743609742878e-05, "loss": 0.2658, "step": 3073 }, { "epoch": 4.887122416534181, "grad_norm": 3.640286980568797, "learning_rate": 3.868320297918158e-05, "loss": 0.186, "step": 3074 }, { "epoch": 4.888712241653418, "grad_norm": 8.674872158566979, "learning_rate": 3.868897030662249e-05, "loss": 0.2201, "step": 3075 }, { "epoch": 4.890302066772655, "grad_norm": 8.636316592995279, "learning_rate": 3.869473807779948e-05, "loss": 0.241, "step": 3076 }, { "epoch": 4.891891891891892, "grad_norm": 7.652251627907137, "learning_rate": 3.870050629076037e-05, "loss": 0.2084, "step": 3077 }, { "epoch": 4.893481717011129, "grad_norm": 7.607224868679097, "learning_rate": 3.8706274943552834e-05, "loss": 0.2531, "step": 3078 }, { "epoch": 4.895071542130365, "grad_norm": 4.213324358795593, "learning_rate": 3.871204403422437e-05, "loss": 0.1292, "step": 3079 }, { "epoch": 4.896661367249602, "grad_norm": 6.245712388579463, "learning_rate": 3.871781356082237e-05, "loss": 0.2433, "step": 3080 }, { "epoch": 4.898251192368839, "grad_norm": 5.9606445508352985, "learning_rate": 3.872358352139405e-05, "loss": 0.2576, "step": 3081 }, { "epoch": 4.899841017488076, "grad_norm": 3.911129096129277, "learning_rate": 3.87293539139865e-05, "loss": 0.1654, "step": 3082 }, { "epoch": 4.901430842607313, "grad_norm": 6.771520275163338, "learning_rate": 3.873512473664663e-05, "loss": 0.2137, "step": 3083 }, { "epoch": 4.90302066772655, "grad_norm": 5.304142581286025, "learning_rate": 3.874089598742123e-05, "loss": 0.1802, "step": 3084 }, { "epoch": 4.904610492845787, "grad_norm": 4.040043617014427, "learning_rate": 3.874666766435696e-05, "loss": 0.2137, "step": 3085 }, { "epoch": 4.906200317965024, "grad_norm": 7.562522845513537, "learning_rate": 3.87524397655003e-05, "loss": 0.2367, "step": 3086 }, { "epoch": 4.907790143084261, "grad_norm": 4.213430829014083, "learning_rate": 3.87582122888976e-05, "loss": 0.1901, "step": 3087 }, { "epoch": 4.909379968203497, "grad_norm": 6.056482399804958, "learning_rate": 3.8763985232595074e-05, "loss": 0.2467, "step": 3088 }, { "epoch": 4.910969793322734, "grad_norm": 6.4159264473951625, "learning_rate": 3.8769758594638794e-05, "loss": 0.1628, "step": 3089 }, { "epoch": 4.912559618441971, "grad_norm": 5.777009509157185, "learning_rate": 3.877553237307468e-05, "loss": 0.2864, "step": 3090 }, { "epoch": 4.914149443561208, "grad_norm": 7.990333782273365, "learning_rate": 3.8781306565948524e-05, "loss": 0.1866, "step": 3091 }, { "epoch": 4.915739268680445, "grad_norm": 6.5651236602109755, "learning_rate": 3.878708117130597e-05, "loss": 0.2128, "step": 3092 }, { "epoch": 4.917329093799682, "grad_norm": 9.95734657709305, "learning_rate": 3.879285618719252e-05, "loss": 0.1815, "step": 3093 }, { "epoch": 4.918918918918919, "grad_norm": 5.1939003965343735, "learning_rate": 3.879863161165353e-05, "loss": 0.1626, "step": 3094 }, { "epoch": 4.920508744038155, "grad_norm": 5.723299283432753, "learning_rate": 3.880440744273425e-05, "loss": 0.1781, "step": 3095 }, { "epoch": 4.922098569157392, "grad_norm": 7.218476478290796, "learning_rate": 3.881018367847975e-05, "loss": 0.213, "step": 3096 }, { "epoch": 4.923688394276629, "grad_norm": 5.426793080187448, "learning_rate": 3.881596031693499e-05, "loss": 0.1935, "step": 3097 }, { "epoch": 4.925278219395866, "grad_norm": 3.739091091665215, "learning_rate": 3.88217373561448e-05, "loss": 0.2606, "step": 3098 }, { "epoch": 4.926868044515103, "grad_norm": 10.127601911827314, "learning_rate": 3.882751479415384e-05, "loss": 0.1638, "step": 3099 }, { "epoch": 4.92845786963434, "grad_norm": 3.848298022275133, "learning_rate": 3.883329262900667e-05, "loss": 0.1869, "step": 3100 }, { "epoch": 4.930047694753577, "grad_norm": 5.4332112614738985, "learning_rate": 3.88390708587477e-05, "loss": 0.1583, "step": 3101 }, { "epoch": 4.9316375198728135, "grad_norm": 8.498489618072727, "learning_rate": 3.88448494814212e-05, "loss": 0.1447, "step": 3102 }, { "epoch": 4.9332273449920505, "grad_norm": 4.13153271477355, "learning_rate": 3.885062849507133e-05, "loss": 0.1734, "step": 3103 }, { "epoch": 4.9348171701112875, "grad_norm": 4.24989692186795, "learning_rate": 3.88564078977421e-05, "loss": 0.1457, "step": 3104 }, { "epoch": 4.9364069952305245, "grad_norm": 8.100504599419704, "learning_rate": 3.8862187687477385e-05, "loss": 0.1868, "step": 3105 }, { "epoch": 4.9379968203497615, "grad_norm": 5.8665941092678215, "learning_rate": 3.8867967862320934e-05, "loss": 0.142, "step": 3106 }, { "epoch": 4.9395866454689985, "grad_norm": 5.4589548097785965, "learning_rate": 3.8873748420316374e-05, "loss": 0.2053, "step": 3107 }, { "epoch": 4.9411764705882355, "grad_norm": 3.3942593491296678, "learning_rate": 3.887952935950719e-05, "loss": 0.1408, "step": 3108 }, { "epoch": 4.9427662957074725, "grad_norm": 5.12486787459897, "learning_rate": 3.888531067793675e-05, "loss": 0.2025, "step": 3109 }, { "epoch": 4.9443561208267095, "grad_norm": 5.2525956519749135, "learning_rate": 3.889109237364828e-05, "loss": 0.1913, "step": 3110 }, { "epoch": 4.945945945945946, "grad_norm": 2.864038316996026, "learning_rate": 3.889687444468488e-05, "loss": 0.1404, "step": 3111 }, { "epoch": 4.947535771065183, "grad_norm": 6.797000131628201, "learning_rate": 3.890265688908955e-05, "loss": 0.2165, "step": 3112 }, { "epoch": 4.94912559618442, "grad_norm": 3.466099705023118, "learning_rate": 3.8908439704905117e-05, "loss": 0.2067, "step": 3113 }, { "epoch": 4.950715421303657, "grad_norm": 5.277411737435799, "learning_rate": 3.891422289017433e-05, "loss": 0.1917, "step": 3114 }, { "epoch": 4.952305246422894, "grad_norm": 3.8337225622576536, "learning_rate": 3.8920006442939776e-05, "loss": 0.189, "step": 3115 }, { "epoch": 4.953895071542131, "grad_norm": 2.5796117173994944, "learning_rate": 3.892579036124393e-05, "loss": 0.1863, "step": 3116 }, { "epoch": 4.955484896661368, "grad_norm": 2.940643750393657, "learning_rate": 3.893157464312915e-05, "loss": 0.2351, "step": 3117 }, { "epoch": 4.957074721780604, "grad_norm": 3.8168939822379033, "learning_rate": 3.893735928663767e-05, "loss": 0.1579, "step": 3118 }, { "epoch": 4.958664546899841, "grad_norm": 3.7655867256903237, "learning_rate": 3.894314428981159e-05, "loss": 0.1185, "step": 3119 }, { "epoch": 4.960254372019078, "grad_norm": 3.879014527226025, "learning_rate": 3.89489296506929e-05, "loss": 0.1563, "step": 3120 }, { "epoch": 4.961844197138315, "grad_norm": 3.8334049876077083, "learning_rate": 3.8954715367323464e-05, "loss": 0.1989, "step": 3121 }, { "epoch": 4.963434022257552, "grad_norm": 95.99177881105098, "learning_rate": 3.896050143774503e-05, "loss": 7.1704, "step": 3122 }, { "epoch": 4.965023847376789, "grad_norm": 4.616689773430598, "learning_rate": 3.896628785999922e-05, "loss": 0.1759, "step": 3123 }, { "epoch": 4.966613672496026, "grad_norm": 3.1736841179069137, "learning_rate": 3.897207463212753e-05, "loss": 0.192, "step": 3124 }, { "epoch": 4.968203497615263, "grad_norm": 5.365203475002759, "learning_rate": 3.897786175217137e-05, "loss": 0.1588, "step": 3125 }, { "epoch": 4.9697933227345, "grad_norm": 5.527596614855162, "learning_rate": 3.898364921817199e-05, "loss": 0.2012, "step": 3126 }, { "epoch": 4.971383147853736, "grad_norm": 3.327038316209095, "learning_rate": 3.898943702817054e-05, "loss": 0.2234, "step": 3127 }, { "epoch": 4.972972972972973, "grad_norm": 6.278150492275312, "learning_rate": 3.899522518020807e-05, "loss": 0.2227, "step": 3128 }, { "epoch": 4.97456279809221, "grad_norm": 4.4099050207338735, "learning_rate": 3.900101367232549e-05, "loss": 0.2306, "step": 3129 }, { "epoch": 4.976152623211447, "grad_norm": 32.832611074731645, "learning_rate": 3.900680250256361e-05, "loss": 3.6093, "step": 3130 }, { "epoch": 4.977742448330684, "grad_norm": 4.3220987737851155, "learning_rate": 3.9012591668963124e-05, "loss": 0.1678, "step": 3131 }, { "epoch": 4.979332273449921, "grad_norm": 3.7522681148538624, "learning_rate": 3.90183811695646e-05, "loss": 0.1821, "step": 3132 }, { "epoch": 4.980922098569158, "grad_norm": 3.364546562516134, "learning_rate": 3.9024171002408507e-05, "loss": 0.2195, "step": 3133 }, { "epoch": 4.982511923688394, "grad_norm": 3.0099732441028775, "learning_rate": 3.902996116553519e-05, "loss": 0.1391, "step": 3134 }, { "epoch": 4.984101748807631, "grad_norm": 3.6622506682196216, "learning_rate": 3.9035751656984906e-05, "loss": 0.1804, "step": 3135 }, { "epoch": 4.985691573926868, "grad_norm": 4.461055451995872, "learning_rate": 3.904154247479776e-05, "loss": 0.2082, "step": 3136 }, { "epoch": 4.987281399046105, "grad_norm": 5.065649779144699, "learning_rate": 3.904733361701378e-05, "loss": 0.1792, "step": 3137 }, { "epoch": 4.988871224165342, "grad_norm": 4.688758338761301, "learning_rate": 3.9053125081672884e-05, "loss": 0.2159, "step": 3138 }, { "epoch": 4.990461049284579, "grad_norm": 36.94888357646705, "learning_rate": 3.905891686681486e-05, "loss": 0.7229, "step": 3139 }, { "epoch": 4.992050874403816, "grad_norm": 3.4979988817246075, "learning_rate": 3.9064708970479394e-05, "loss": 0.1593, "step": 3140 }, { "epoch": 4.993640699523052, "grad_norm": 3.836601798619348, "learning_rate": 3.907050139070608e-05, "loss": 0.1366, "step": 3141 }, { "epoch": 4.995230524642289, "grad_norm": 7.737929992375876, "learning_rate": 3.907629412553438e-05, "loss": 0.2417, "step": 3142 }, { "epoch": 4.996820349761526, "grad_norm": 4.245040692056753, "learning_rate": 3.908208717300368e-05, "loss": 0.1997, "step": 3143 }, { "epoch": 4.998410174880763, "grad_norm": 4.493782036073353, "learning_rate": 3.908788053115324e-05, "loss": 0.2497, "step": 3144 }, { "epoch": 5.0, "grad_norm": 3.813783244571172, "learning_rate": 3.9093674198022205e-05, "loss": 0.2005, "step": 3145 }, { "epoch": 5.001589825119237, "grad_norm": 3.1417976810739967, "learning_rate": 3.909946817164963e-05, "loss": 0.1914, "step": 3146 }, { "epoch": 5.003179650238474, "grad_norm": 3.9449831084003137, "learning_rate": 3.9105262450074476e-05, "loss": 0.174, "step": 3147 }, { "epoch": 5.004769475357711, "grad_norm": 6.406942478942772, "learning_rate": 3.9111057031335585e-05, "loss": 0.2943, "step": 3148 }, { "epoch": 5.006359300476947, "grad_norm": 6.309121861947579, "learning_rate": 3.91168519134717e-05, "loss": 0.1483, "step": 3149 }, { "epoch": 5.007949125596184, "grad_norm": 6.913258123726462, "learning_rate": 3.912264709452147e-05, "loss": 0.1531, "step": 3150 }, { "epoch": 5.009538950715421, "grad_norm": 5.1484398513077565, "learning_rate": 3.912844257252342e-05, "loss": 0.1614, "step": 3151 }, { "epoch": 5.011128775834658, "grad_norm": 5.152002639415075, "learning_rate": 3.913423834551601e-05, "loss": 0.2535, "step": 3152 }, { "epoch": 5.012718600953895, "grad_norm": 6.753549498925175, "learning_rate": 3.914003441153756e-05, "loss": 0.2111, "step": 3153 }, { "epoch": 5.014308426073132, "grad_norm": 6.631188047880723, "learning_rate": 3.914583076862632e-05, "loss": 0.2029, "step": 3154 }, { "epoch": 5.015898251192369, "grad_norm": 4.47973446001777, "learning_rate": 3.915162741482045e-05, "loss": 0.1836, "step": 3155 }, { "epoch": 5.017488076311606, "grad_norm": 4.821564486769781, "learning_rate": 3.915742434815797e-05, "loss": 0.1872, "step": 3156 }, { "epoch": 5.019077901430842, "grad_norm": 4.236725563156107, "learning_rate": 3.916322156667684e-05, "loss": 0.2052, "step": 3157 }, { "epoch": 5.020667726550079, "grad_norm": 6.193891378671041, "learning_rate": 3.9169019068414915e-05, "loss": 0.1834, "step": 3158 }, { "epoch": 5.022257551669316, "grad_norm": 4.206869567992898, "learning_rate": 3.9174816851409946e-05, "loss": 0.151, "step": 3159 }, { "epoch": 5.023847376788553, "grad_norm": 4.201639270549849, "learning_rate": 3.918061491369959e-05, "loss": 0.1732, "step": 3160 }, { "epoch": 5.02543720190779, "grad_norm": 9.223174267708767, "learning_rate": 3.9186413253321415e-05, "loss": 0.1554, "step": 3161 }, { "epoch": 5.027027027027027, "grad_norm": 3.951693125178095, "learning_rate": 3.91922118683129e-05, "loss": 0.1459, "step": 3162 }, { "epoch": 5.028616852146264, "grad_norm": 5.464513987640649, "learning_rate": 3.919801075671141e-05, "loss": 0.155, "step": 3163 }, { "epoch": 5.030206677265501, "grad_norm": 4.296348378630646, "learning_rate": 3.9203809916554244e-05, "loss": 0.1718, "step": 3164 }, { "epoch": 5.031796502384737, "grad_norm": 6.216783349367615, "learning_rate": 3.920960934587859e-05, "loss": 0.1661, "step": 3165 }, { "epoch": 5.033386327503974, "grad_norm": 5.746216944526852, "learning_rate": 3.921540904272155e-05, "loss": 0.1482, "step": 3166 }, { "epoch": 5.034976152623211, "grad_norm": 6.646989962893031, "learning_rate": 3.922120900512014e-05, "loss": 0.2222, "step": 3167 }, { "epoch": 5.036565977742448, "grad_norm": 4.274581344410608, "learning_rate": 3.9227009231111287e-05, "loss": 0.1962, "step": 3168 }, { "epoch": 5.038155802861685, "grad_norm": 5.526619859866819, "learning_rate": 3.9232809718731816e-05, "loss": 0.1538, "step": 3169 }, { "epoch": 5.039745627980922, "grad_norm": 3.466969258873424, "learning_rate": 3.9238610466018474e-05, "loss": 0.188, "step": 3170 }, { "epoch": 5.041335453100159, "grad_norm": 4.678882367267456, "learning_rate": 3.924441147100792e-05, "loss": 0.1577, "step": 3171 }, { "epoch": 5.042925278219396, "grad_norm": 6.234370958066066, "learning_rate": 3.9250212731736725e-05, "loss": 0.1545, "step": 3172 }, { "epoch": 5.044515103338632, "grad_norm": 6.22621475238784, "learning_rate": 3.9256014246241365e-05, "loss": 0.2901, "step": 3173 }, { "epoch": 5.046104928457869, "grad_norm": 2.930700640998104, "learning_rate": 3.9261816012558254e-05, "loss": 0.2099, "step": 3174 }, { "epoch": 5.047694753577106, "grad_norm": 5.456786983899228, "learning_rate": 3.9267618028723686e-05, "loss": 0.1296, "step": 3175 }, { "epoch": 5.049284578696343, "grad_norm": 3.3555529633921, "learning_rate": 3.927342029277389e-05, "loss": 0.2012, "step": 3176 }, { "epoch": 5.05087440381558, "grad_norm": 3.851414442594951, "learning_rate": 3.9279222802745025e-05, "loss": 0.1742, "step": 3177 }, { "epoch": 5.052464228934817, "grad_norm": 8.304943650481555, "learning_rate": 3.928502555667314e-05, "loss": 0.184, "step": 3178 }, { "epoch": 5.054054054054054, "grad_norm": 2.458337897465749, "learning_rate": 3.9290828552594215e-05, "loss": 0.215, "step": 3179 }, { "epoch": 5.0556438791732905, "grad_norm": 8.75973173521465, "learning_rate": 3.929663178854415e-05, "loss": 0.2166, "step": 3180 }, { "epoch": 5.0572337042925275, "grad_norm": 8.031836035969858, "learning_rate": 3.9302435262558754e-05, "loss": 0.1975, "step": 3181 }, { "epoch": 5.0588235294117645, "grad_norm": 6.718383278025328, "learning_rate": 3.930823897267376e-05, "loss": 0.171, "step": 3182 }, { "epoch": 5.0604133545310015, "grad_norm": 7.544222715371571, "learning_rate": 3.931404291692482e-05, "loss": 0.1957, "step": 3183 }, { "epoch": 5.0620031796502385, "grad_norm": 4.207459218002939, "learning_rate": 3.931984709334752e-05, "loss": 0.1771, "step": 3184 }, { "epoch": 5.0635930047694755, "grad_norm": 5.2824820028364545, "learning_rate": 3.9325651499977346e-05, "loss": 0.1335, "step": 3185 }, { "epoch": 5.0651828298887125, "grad_norm": 4.981247974779617, "learning_rate": 3.933145613484973e-05, "loss": 0.1212, "step": 3186 }, { "epoch": 5.0667726550079495, "grad_norm": 6.804385146422567, "learning_rate": 3.9337260996e-05, "loss": 0.2551, "step": 3187 }, { "epoch": 5.068362480127186, "grad_norm": 5.870305467893209, "learning_rate": 3.934306608146343e-05, "loss": 0.182, "step": 3188 }, { "epoch": 5.069952305246423, "grad_norm": 3.783745988016876, "learning_rate": 3.934887138927519e-05, "loss": 0.1281, "step": 3189 }, { "epoch": 5.07154213036566, "grad_norm": 5.4158916666554395, "learning_rate": 3.935467691747042e-05, "loss": 0.1636, "step": 3190 }, { "epoch": 5.073131955484897, "grad_norm": 5.4575347927256415, "learning_rate": 3.936048266408415e-05, "loss": 0.2357, "step": 3191 }, { "epoch": 5.074721780604134, "grad_norm": 1.9378795006277796, "learning_rate": 3.936628862715133e-05, "loss": 0.1402, "step": 3192 }, { "epoch": 5.076311605723371, "grad_norm": 4.030018064462303, "learning_rate": 3.9372094804706866e-05, "loss": 0.1065, "step": 3193 }, { "epoch": 5.077901430842608, "grad_norm": 16.864532344243376, "learning_rate": 3.937790119478558e-05, "loss": 2.0799, "step": 3194 }, { "epoch": 5.079491255961845, "grad_norm": 2.1652832401350377, "learning_rate": 3.9383707795422206e-05, "loss": 0.1495, "step": 3195 }, { "epoch": 5.081081081081081, "grad_norm": 6.010736434788803, "learning_rate": 3.938951460465143e-05, "loss": 0.1946, "step": 3196 }, { "epoch": 5.082670906200318, "grad_norm": 4.6598299262982845, "learning_rate": 3.939532162050786e-05, "loss": 0.1434, "step": 3197 }, { "epoch": 5.084260731319555, "grad_norm": 4.116653593391845, "learning_rate": 3.940112884102602e-05, "loss": 0.1846, "step": 3198 }, { "epoch": 5.085850556438792, "grad_norm": 5.347967583013869, "learning_rate": 3.940693626424038e-05, "loss": 0.149, "step": 3199 }, { "epoch": 5.087440381558029, "grad_norm": 4.141401834558379, "learning_rate": 3.9412743888185346e-05, "loss": 0.1861, "step": 3200 }, { "epoch": 5.089030206677266, "grad_norm": 3.134515148826531, "learning_rate": 3.9418551710895245e-05, "loss": 0.1369, "step": 3201 }, { "epoch": 5.090620031796503, "grad_norm": 3.342417052993152, "learning_rate": 3.9424359730404326e-05, "loss": 0.1721, "step": 3202 }, { "epoch": 5.09220985691574, "grad_norm": 4.2037465630375666, "learning_rate": 3.943016794474681e-05, "loss": 0.1365, "step": 3203 }, { "epoch": 5.093799682034976, "grad_norm": 3.4948477781928817, "learning_rate": 3.943597635195679e-05, "loss": 0.1631, "step": 3204 }, { "epoch": 5.095389507154213, "grad_norm": 5.418980214654038, "learning_rate": 3.944178495006837e-05, "loss": 0.1665, "step": 3205 }, { "epoch": 5.09697933227345, "grad_norm": 6.748148422544536, "learning_rate": 3.944759373711552e-05, "loss": 0.1442, "step": 3206 }, { "epoch": 5.098569157392687, "grad_norm": 2.7301299987849483, "learning_rate": 3.9453402711132186e-05, "loss": 0.1858, "step": 3207 }, { "epoch": 5.100158982511924, "grad_norm": 5.289910021179296, "learning_rate": 3.945921187015225e-05, "loss": 0.1651, "step": 3208 }, { "epoch": 5.101748807631161, "grad_norm": 3.9987481519172365, "learning_rate": 3.9465021212209516e-05, "loss": 0.1734, "step": 3209 }, { "epoch": 5.103338632750398, "grad_norm": 8.694527939580983, "learning_rate": 3.947083073533772e-05, "loss": 0.1702, "step": 3210 }, { "epoch": 5.104928457869635, "grad_norm": 4.200271018200816, "learning_rate": 3.9476640437570556e-05, "loss": 0.1762, "step": 3211 }, { "epoch": 5.106518282988871, "grad_norm": 5.000848684418953, "learning_rate": 3.948245031694167e-05, "loss": 0.18, "step": 3212 }, { "epoch": 5.108108108108108, "grad_norm": 3.4518084572134162, "learning_rate": 3.94882603714846e-05, "loss": 0.1451, "step": 3213 }, { "epoch": 5.109697933227345, "grad_norm": 6.324116123340721, "learning_rate": 3.9494070599232865e-05, "loss": 0.2063, "step": 3214 }, { "epoch": 5.111287758346582, "grad_norm": 9.26886097780332, "learning_rate": 3.9499880998219915e-05, "loss": 0.6387, "step": 3215 }, { "epoch": 5.112877583465819, "grad_norm": 3.334544865492814, "learning_rate": 3.950569156647914e-05, "loss": 0.2308, "step": 3216 }, { "epoch": 5.114467408585056, "grad_norm": 6.556957324346155, "learning_rate": 3.9511502302043866e-05, "loss": 0.1696, "step": 3217 }, { "epoch": 5.116057233704293, "grad_norm": 3.543963033174834, "learning_rate": 3.951731320294738e-05, "loss": 0.2287, "step": 3218 }, { "epoch": 5.117647058823529, "grad_norm": 145.31802865354538, "learning_rate": 3.9523124267222896e-05, "loss": 19.6723, "step": 3219 }, { "epoch": 5.119236883942766, "grad_norm": 10.808288613485125, "learning_rate": 3.952893549290357e-05, "loss": 0.2665, "step": 3220 }, { "epoch": 5.120826709062003, "grad_norm": 4.531459823519616, "learning_rate": 3.9534746878022534e-05, "loss": 0.1729, "step": 3221 }, { "epoch": 5.12241653418124, "grad_norm": 5.816490601092628, "learning_rate": 3.9540558420612835e-05, "loss": 0.2112, "step": 3222 }, { "epoch": 5.124006359300477, "grad_norm": 6.345665738840117, "learning_rate": 3.954637011870746e-05, "loss": 0.2446, "step": 3223 }, { "epoch": 5.125596184419714, "grad_norm": 7.331748459175939, "learning_rate": 3.955218197033939e-05, "loss": 0.2145, "step": 3224 }, { "epoch": 5.127186009538951, "grad_norm": 5.652738366933304, "learning_rate": 3.9557993973541496e-05, "loss": 0.2006, "step": 3225 }, { "epoch": 5.128775834658188, "grad_norm": 11.039973620648666, "learning_rate": 3.9563806126346645e-05, "loss": 0.2168, "step": 3226 }, { "epoch": 5.130365659777424, "grad_norm": 7.40845146039602, "learning_rate": 3.956961842678762e-05, "loss": 0.3478, "step": 3227 }, { "epoch": 5.131955484896661, "grad_norm": 10.789564130931259, "learning_rate": 3.9575430872897176e-05, "loss": 0.1828, "step": 3228 }, { "epoch": 5.133545310015898, "grad_norm": 9.06250493324314, "learning_rate": 3.958124346270801e-05, "loss": 0.1507, "step": 3229 }, { "epoch": 5.135135135135135, "grad_norm": 5.2972303558972635, "learning_rate": 3.958705619425276e-05, "loss": 0.1879, "step": 3230 }, { "epoch": 5.136724960254372, "grad_norm": 7.993466568212637, "learning_rate": 3.9592869065564043e-05, "loss": 0.2312, "step": 3231 }, { "epoch": 5.138314785373609, "grad_norm": 8.578904337817681, "learning_rate": 3.9598682074674406e-05, "loss": 0.2132, "step": 3232 }, { "epoch": 5.139904610492846, "grad_norm": 5.414221216728222, "learning_rate": 3.960449521961635e-05, "loss": 0.1625, "step": 3233 }, { "epoch": 5.141494435612083, "grad_norm": 8.939647641844699, "learning_rate": 3.9610308498422346e-05, "loss": 0.19, "step": 3234 }, { "epoch": 5.143084260731319, "grad_norm": 4092.262599592264, "learning_rate": 3.9616121909124805e-05, "loss": 0.8037, "step": 3235 }, { "epoch": 5.144674085850556, "grad_norm": 7.23206547269352, "learning_rate": 3.962193544975609e-05, "loss": 0.2043, "step": 3236 }, { "epoch": 5.146263910969793, "grad_norm": 8.93656683745644, "learning_rate": 3.962774911834854e-05, "loss": 0.1126, "step": 3237 }, { "epoch": 5.14785373608903, "grad_norm": 4.323413224009858, "learning_rate": 3.963356291293444e-05, "loss": 0.1965, "step": 3238 }, { "epoch": 5.149443561208267, "grad_norm": 5.428463932826973, "learning_rate": 3.963937683154602e-05, "loss": 0.1433, "step": 3239 }, { "epoch": 5.151033386327504, "grad_norm": 4.449657009795377, "learning_rate": 3.9645190872215485e-05, "loss": 0.1661, "step": 3240 }, { "epoch": 5.152623211446741, "grad_norm": 5.787103594564749, "learning_rate": 3.9651005032975e-05, "loss": 0.1817, "step": 3241 }, { "epoch": 5.154213036565977, "grad_norm": 6.983731210632461, "learning_rate": 3.9656819311856655e-05, "loss": 0.2322, "step": 3242 }, { "epoch": 5.155802861685214, "grad_norm": 6.836654150500689, "learning_rate": 3.9662633706892565e-05, "loss": 0.2177, "step": 3243 }, { "epoch": 5.157392686804451, "grad_norm": 6.553532738540634, "learning_rate": 3.9668448216114736e-05, "loss": 0.2438, "step": 3244 }, { "epoch": 5.158982511923688, "grad_norm": 4.597066789152632, "learning_rate": 3.967426283755519e-05, "loss": 0.1506, "step": 3245 }, { "epoch": 5.160572337042925, "grad_norm": 3.9257205352117843, "learning_rate": 3.968007756924587e-05, "loss": 0.174, "step": 3246 }, { "epoch": 5.162162162162162, "grad_norm": 5.039360461836179, "learning_rate": 3.968589240921872e-05, "loss": 0.1864, "step": 3247 }, { "epoch": 5.163751987281399, "grad_norm": 3.2656331476213194, "learning_rate": 3.969170735550561e-05, "loss": 0.1405, "step": 3248 }, { "epoch": 5.165341812400636, "grad_norm": 5.186387579253642, "learning_rate": 3.969752240613839e-05, "loss": 0.3296, "step": 3249 }, { "epoch": 5.166931637519872, "grad_norm": 7.761758220008351, "learning_rate": 3.970333755914889e-05, "loss": 0.2316, "step": 3250 }, { "epoch": 5.168521462639109, "grad_norm": 2.947448626730211, "learning_rate": 3.970915281256889e-05, "loss": 0.1702, "step": 3251 }, { "epoch": 5.170111287758346, "grad_norm": 4.6527327075401566, "learning_rate": 3.971496816443012e-05, "loss": 0.1717, "step": 3252 }, { "epoch": 5.171701112877583, "grad_norm": 7.074653775487216, "learning_rate": 3.9720783612764316e-05, "loss": 0.1162, "step": 3253 }, { "epoch": 5.17329093799682, "grad_norm": 4.435167150520846, "learning_rate": 3.972659915560314e-05, "loss": 0.2195, "step": 3254 }, { "epoch": 5.174880763116057, "grad_norm": 4.245252395138678, "learning_rate": 3.9732414790978256e-05, "loss": 0.1491, "step": 3255 }, { "epoch": 5.176470588235294, "grad_norm": 6.579511092477509, "learning_rate": 3.9738230516921264e-05, "loss": 0.8204, "step": 3256 }, { "epoch": 5.178060413354531, "grad_norm": 2.638582405072483, "learning_rate": 3.974404633146378e-05, "loss": 0.1757, "step": 3257 }, { "epoch": 5.1796502384737675, "grad_norm": 3.423092395941031, "learning_rate": 3.974986223263734e-05, "loss": 0.1567, "step": 3258 }, { "epoch": 5.1812400635930045, "grad_norm": 5.188173294617301, "learning_rate": 3.975567821847347e-05, "loss": 0.1773, "step": 3259 }, { "epoch": 5.1828298887122415, "grad_norm": 1.897788890273969, "learning_rate": 3.9761494287003676e-05, "loss": 0.2289, "step": 3260 }, { "epoch": 5.1844197138314785, "grad_norm": 4.475780906306349, "learning_rate": 3.976731043625944e-05, "loss": 0.1919, "step": 3261 }, { "epoch": 5.1860095389507155, "grad_norm": 3.9359197141331075, "learning_rate": 3.977312666427219e-05, "loss": 0.144, "step": 3262 }, { "epoch": 5.1875993640699525, "grad_norm": 2.269791581924391, "learning_rate": 3.977894296907335e-05, "loss": 0.1865, "step": 3263 }, { "epoch": 5.1891891891891895, "grad_norm": 2.4280072915130173, "learning_rate": 3.9784759348694306e-05, "loss": 0.1425, "step": 3264 }, { "epoch": 5.1907790143084265, "grad_norm": 2.8096177936195104, "learning_rate": 3.979057580116643e-05, "loss": 0.1904, "step": 3265 }, { "epoch": 5.192368839427663, "grad_norm": 4.409887184584052, "learning_rate": 3.9796392324521065e-05, "loss": 0.1311, "step": 3266 }, { "epoch": 5.1939586645469, "grad_norm": 3.4016637983221583, "learning_rate": 3.9802208916789524e-05, "loss": 0.1331, "step": 3267 }, { "epoch": 5.195548489666137, "grad_norm": 4.769797843588128, "learning_rate": 3.98080255760031e-05, "loss": 0.1388, "step": 3268 }, { "epoch": 5.197138314785374, "grad_norm": 40.81795972676857, "learning_rate": 3.9813842300193074e-05, "loss": 4.4096, "step": 3269 }, { "epoch": 5.198728139904611, "grad_norm": 2.825967566402579, "learning_rate": 3.981965908739068e-05, "loss": 0.1179, "step": 3270 }, { "epoch": 5.200317965023848, "grad_norm": 4.762896386637612, "learning_rate": 3.9825475935627164e-05, "loss": 0.1797, "step": 3271 }, { "epoch": 5.201907790143085, "grad_norm": 3.7688040914293, "learning_rate": 3.983129284293372e-05, "loss": 0.1727, "step": 3272 }, { "epoch": 5.203497615262322, "grad_norm": 18.756918820172697, "learning_rate": 3.983710980734154e-05, "loss": 2.2986, "step": 3273 }, { "epoch": 5.205087440381558, "grad_norm": 5.068450244438738, "learning_rate": 3.98429268268818e-05, "loss": 0.1633, "step": 3274 }, { "epoch": 5.206677265500795, "grad_norm": 6.316160716035796, "learning_rate": 3.9848743899585624e-05, "loss": 0.1777, "step": 3275 }, { "epoch": 5.208267090620032, "grad_norm": 4.978922563980388, "learning_rate": 3.985456102348417e-05, "loss": 0.1758, "step": 3276 }, { "epoch": 5.209856915739269, "grad_norm": 5.128355211832838, "learning_rate": 3.9860378196608546e-05, "loss": 0.155, "step": 3277 }, { "epoch": 5.211446740858506, "grad_norm": 4.605099223317655, "learning_rate": 3.986619541698985e-05, "loss": 0.1333, "step": 3278 }, { "epoch": 5.213036565977743, "grad_norm": 5.014356736648708, "learning_rate": 3.9872012682659156e-05, "loss": 0.2243, "step": 3279 }, { "epoch": 5.21462639109698, "grad_norm": 6.247221292130196, "learning_rate": 3.987782999164753e-05, "loss": 0.1712, "step": 3280 }, { "epoch": 5.216216216216216, "grad_norm": 4.029832460639627, "learning_rate": 3.988364734198603e-05, "loss": 0.1758, "step": 3281 }, { "epoch": 5.217806041335453, "grad_norm": 2.8470184076843075, "learning_rate": 3.98894647317057e-05, "loss": 0.1547, "step": 3282 }, { "epoch": 5.21939586645469, "grad_norm": 3.743275809736415, "learning_rate": 3.9895282158837544e-05, "loss": 0.1708, "step": 3283 }, { "epoch": 5.220985691573927, "grad_norm": 3.367643093305827, "learning_rate": 3.990109962141259e-05, "loss": 0.8137, "step": 3284 }, { "epoch": 5.222575516693164, "grad_norm": 3.162086008074213, "learning_rate": 3.990691711746183e-05, "loss": 0.1538, "step": 3285 }, { "epoch": 5.224165341812401, "grad_norm": 4.858849251407634, "learning_rate": 3.991273464501626e-05, "loss": 0.153, "step": 3286 }, { "epoch": 5.225755166931638, "grad_norm": 31.148176906356223, "learning_rate": 3.9918552202106855e-05, "loss": 3.5175, "step": 3287 }, { "epoch": 5.227344992050875, "grad_norm": 2.5285767735175972, "learning_rate": 3.9924369786764576e-05, "loss": 0.1612, "step": 3288 }, { "epoch": 5.228934817170111, "grad_norm": 2.353182039923773, "learning_rate": 3.9930187397020386e-05, "loss": 0.1328, "step": 3289 }, { "epoch": 5.230524642289348, "grad_norm": 4.475020504788664, "learning_rate": 3.9936005030905236e-05, "loss": 0.1434, "step": 3290 }, { "epoch": 5.232114467408585, "grad_norm": 4.449036431526402, "learning_rate": 3.994182268645006e-05, "loss": 0.1849, "step": 3291 }, { "epoch": 5.233704292527822, "grad_norm": 2.6514551813284974, "learning_rate": 3.9947640361685806e-05, "loss": 0.1433, "step": 3292 }, { "epoch": 5.235294117647059, "grad_norm": 2.8249125552238588, "learning_rate": 3.995345805464339e-05, "loss": 0.168, "step": 3293 }, { "epoch": 5.236883942766296, "grad_norm": 1.9998998085158213, "learning_rate": 3.9959275763353736e-05, "loss": 0.1304, "step": 3294 }, { "epoch": 5.238473767885533, "grad_norm": 2.776144824906679, "learning_rate": 3.996509348584777e-05, "loss": 0.1639, "step": 3295 }, { "epoch": 5.24006359300477, "grad_norm": 4.202170410640712, "learning_rate": 3.9970911220156376e-05, "loss": 0.1085, "step": 3296 }, { "epoch": 5.241653418124006, "grad_norm": 5.437767960722192, "learning_rate": 3.9976728964310496e-05, "loss": 0.4754, "step": 3297 }, { "epoch": 5.243243243243243, "grad_norm": 2.364960647546418, "learning_rate": 3.998254671634102e-05, "loss": 0.1758, "step": 3298 }, { "epoch": 5.24483306836248, "grad_norm": 2.484354277729835, "learning_rate": 3.9988364474278846e-05, "loss": 0.1545, "step": 3299 }, { "epoch": 5.246422893481717, "grad_norm": 3.2765260505908214, "learning_rate": 3.9994182236154874e-05, "loss": 0.1243, "step": 3300 }, { "epoch": 5.248012718600954, "grad_norm": 4.676120416950602, "learning_rate": 4e-05, "loss": 0.1345, "step": 3301 }, { "epoch": 5.249602543720191, "grad_norm": 4.706684573215264, "learning_rate": 4.000581776384513e-05, "loss": 0.1956, "step": 3302 }, { "epoch": 5.251192368839428, "grad_norm": 2.9243588502642552, "learning_rate": 4.001163552572116e-05, "loss": 0.2116, "step": 3303 }, { "epoch": 5.252782193958664, "grad_norm": 4.474191424182363, "learning_rate": 4.001745328365899e-05, "loss": 0.1562, "step": 3304 }, { "epoch": 5.254372019077901, "grad_norm": 3.2772366021728554, "learning_rate": 4.0023271035689504e-05, "loss": 0.1661, "step": 3305 }, { "epoch": 5.255961844197138, "grad_norm": 23.306050064455068, "learning_rate": 4.002908877984362e-05, "loss": 2.7261, "step": 3306 }, { "epoch": 5.257551669316375, "grad_norm": 4.585041783181546, "learning_rate": 4.003490651415224e-05, "loss": 0.1744, "step": 3307 }, { "epoch": 5.259141494435612, "grad_norm": 3.2449976298934313, "learning_rate": 4.004072423664627e-05, "loss": 0.2096, "step": 3308 }, { "epoch": 5.260731319554849, "grad_norm": 2.824591069935683, "learning_rate": 4.004654194535661e-05, "loss": 0.1679, "step": 3309 }, { "epoch": 5.262321144674086, "grad_norm": 5.7845290415263015, "learning_rate": 4.0052359638314194e-05, "loss": 0.1972, "step": 3310 }, { "epoch": 5.263910969793323, "grad_norm": 6.705097214681461, "learning_rate": 4.005817731354994e-05, "loss": 0.1877, "step": 3311 }, { "epoch": 5.26550079491256, "grad_norm": 2.934404440871054, "learning_rate": 4.0063994969094764e-05, "loss": 0.2175, "step": 3312 }, { "epoch": 5.267090620031796, "grad_norm": 3.4177765599559145, "learning_rate": 4.0069812602979614e-05, "loss": 0.1387, "step": 3313 }, { "epoch": 5.268680445151033, "grad_norm": 4.74975715103323, "learning_rate": 4.007563021323543e-05, "loss": 0.1783, "step": 3314 }, { "epoch": 5.27027027027027, "grad_norm": 3.56572285334144, "learning_rate": 4.008144779789315e-05, "loss": 0.1493, "step": 3315 }, { "epoch": 5.271860095389507, "grad_norm": 3.8024960375836505, "learning_rate": 4.0087265354983745e-05, "loss": 0.2099, "step": 3316 }, { "epoch": 5.273449920508744, "grad_norm": 3.656701980603239, "learning_rate": 4.009308288253817e-05, "loss": 0.1295, "step": 3317 }, { "epoch": 5.275039745627981, "grad_norm": 5.733608800726178, "learning_rate": 4.009890037858742e-05, "loss": 0.1278, "step": 3318 }, { "epoch": 5.276629570747218, "grad_norm": 2.434737257393362, "learning_rate": 4.010471784116246e-05, "loss": 0.1535, "step": 3319 }, { "epoch": 5.278219395866454, "grad_norm": 4.698910794092207, "learning_rate": 4.011053526829431e-05, "loss": 0.1996, "step": 3320 }, { "epoch": 5.279809220985691, "grad_norm": 3.145563712189685, "learning_rate": 4.011635265801397e-05, "loss": 0.1789, "step": 3321 }, { "epoch": 5.281399046104928, "grad_norm": 4.3804363845573215, "learning_rate": 4.0122170008352475e-05, "loss": 0.1816, "step": 3322 }, { "epoch": 5.282988871224165, "grad_norm": 3.1806527403775884, "learning_rate": 4.012798731734086e-05, "loss": 0.2061, "step": 3323 }, { "epoch": 5.284578696343402, "grad_norm": 3.556633213133545, "learning_rate": 4.013380458301016e-05, "loss": 0.2213, "step": 3324 }, { "epoch": 5.286168521462639, "grad_norm": 1.9485092067399932, "learning_rate": 4.0139621803391454e-05, "loss": 0.1373, "step": 3325 }, { "epoch": 5.287758346581876, "grad_norm": 2.0451088156073176, "learning_rate": 4.0145438976515825e-05, "loss": 0.1632, "step": 3326 }, { "epoch": 5.289348171701113, "grad_norm": 17.237479277775094, "learning_rate": 4.0151256100414376e-05, "loss": 0.4156, "step": 3327 }, { "epoch": 5.290937996820349, "grad_norm": 43.2954541541336, "learning_rate": 4.015707317311821e-05, "loss": 0.3911, "step": 3328 }, { "epoch": 5.292527821939586, "grad_norm": 3.6154481441004047, "learning_rate": 4.0162890192658464e-05, "loss": 0.2335, "step": 3329 }, { "epoch": 5.294117647058823, "grad_norm": 3.012067627211368, "learning_rate": 4.0168707157066274e-05, "loss": 0.2298, "step": 3330 }, { "epoch": 5.29570747217806, "grad_norm": 3.489115072049047, "learning_rate": 4.0174524064372836e-05, "loss": 0.1175, "step": 3331 }, { "epoch": 5.297297297297297, "grad_norm": 3.410082102229313, "learning_rate": 4.018034091260931e-05, "loss": 0.1145, "step": 3332 }, { "epoch": 5.298887122416534, "grad_norm": 9.052742602366273, "learning_rate": 4.0186157699806926e-05, "loss": 0.3153, "step": 3333 }, { "epoch": 5.300476947535771, "grad_norm": 17.17451838416837, "learning_rate": 4.01919744239969e-05, "loss": 0.397, "step": 3334 }, { "epoch": 5.302066772655008, "grad_norm": 5.752759364622487, "learning_rate": 4.0197791083210476e-05, "loss": 0.1658, "step": 3335 }, { "epoch": 5.3036565977742445, "grad_norm": 7.453149093219002, "learning_rate": 4.020360767547894e-05, "loss": 0.1605, "step": 3336 }, { "epoch": 5.3052464228934815, "grad_norm": 8.115578451092915, "learning_rate": 4.020942419883357e-05, "loss": 0.1908, "step": 3337 }, { "epoch": 5.3068362480127185, "grad_norm": 3.0685502823653583, "learning_rate": 4.02152406513057e-05, "loss": 0.1433, "step": 3338 }, { "epoch": 5.3084260731319555, "grad_norm": 5.393659004773812, "learning_rate": 4.022105703092665e-05, "loss": 0.2678, "step": 3339 }, { "epoch": 5.3100158982511925, "grad_norm": 9.39331757308684, "learning_rate": 4.0226873335727816e-05, "loss": 0.2255, "step": 3340 }, { "epoch": 5.3116057233704295, "grad_norm": 3.9868721866794203, "learning_rate": 4.023268956374057e-05, "loss": 0.2265, "step": 3341 }, { "epoch": 5.3131955484896665, "grad_norm": 8.424448771118728, "learning_rate": 4.0238505712996324e-05, "loss": 0.1699, "step": 3342 }, { "epoch": 5.314785373608903, "grad_norm": 4.619119798854262, "learning_rate": 4.024432178152654e-05, "loss": 0.1759, "step": 3343 }, { "epoch": 5.31637519872814, "grad_norm": 3.3315937944122394, "learning_rate": 4.025013776736267e-05, "loss": 0.1836, "step": 3344 }, { "epoch": 5.317965023847377, "grad_norm": 2.4252355387105062, "learning_rate": 4.0255953668536224e-05, "loss": 0.1514, "step": 3345 }, { "epoch": 5.319554848966614, "grad_norm": 8.488216370881416, "learning_rate": 4.026176948307873e-05, "loss": 0.2144, "step": 3346 }, { "epoch": 5.321144674085851, "grad_norm": 5.2124161327931295, "learning_rate": 4.026758520902175e-05, "loss": 0.152, "step": 3347 }, { "epoch": 5.322734499205088, "grad_norm": 2.7240027332725525, "learning_rate": 4.0273400844396865e-05, "loss": 0.1249, "step": 3348 }, { "epoch": 5.324324324324325, "grad_norm": 4.360065068984866, "learning_rate": 4.027921638723569e-05, "loss": 0.2305, "step": 3349 }, { "epoch": 5.325914149443562, "grad_norm": 5.014145145816082, "learning_rate": 4.0285031835569884e-05, "loss": 0.1435, "step": 3350 }, { "epoch": 5.327503974562799, "grad_norm": 3.873091993981591, "learning_rate": 4.029084718743112e-05, "loss": 0.1527, "step": 3351 }, { "epoch": 5.329093799682035, "grad_norm": 4.992634031784794, "learning_rate": 4.029666244085111e-05, "loss": 0.15, "step": 3352 }, { "epoch": 5.330683624801272, "grad_norm": 4.22407647573742, "learning_rate": 4.030247759386161e-05, "loss": 0.2143, "step": 3353 }, { "epoch": 5.332273449920509, "grad_norm": 2.5110114272929094, "learning_rate": 4.030829264449439e-05, "loss": 0.1263, "step": 3354 }, { "epoch": 5.333863275039746, "grad_norm": 5.4509518847331835, "learning_rate": 4.031410759078128e-05, "loss": 0.2156, "step": 3355 }, { "epoch": 5.335453100158983, "grad_norm": 4.9706294777102045, "learning_rate": 4.031992243075413e-05, "loss": 0.1412, "step": 3356 }, { "epoch": 5.33704292527822, "grad_norm": 3.375314329235515, "learning_rate": 4.0325737162444806e-05, "loss": 0.1563, "step": 3357 }, { "epoch": 5.338632750397457, "grad_norm": 31.85813605087713, "learning_rate": 4.033155178388526e-05, "loss": 4.2646, "step": 3358 }, { "epoch": 5.340222575516693, "grad_norm": 6.814988206293408, "learning_rate": 4.033736629310744e-05, "loss": 0.1767, "step": 3359 }, { "epoch": 5.34181240063593, "grad_norm": 4.548251702813716, "learning_rate": 4.0343180688143345e-05, "loss": 0.2183, "step": 3360 }, { "epoch": 5.343402225755167, "grad_norm": 5.716802161596852, "learning_rate": 4.034899496702501e-05, "loss": 0.2087, "step": 3361 }, { "epoch": 5.344992050874404, "grad_norm": 4.661047188475285, "learning_rate": 4.0354809127784515e-05, "loss": 0.1702, "step": 3362 }, { "epoch": 5.346581875993641, "grad_norm": 2.771958683021182, "learning_rate": 4.0360623168453986e-05, "loss": 0.1777, "step": 3363 }, { "epoch": 5.348171701112878, "grad_norm": 6.198725519731575, "learning_rate": 4.036643708706557e-05, "loss": 0.2361, "step": 3364 }, { "epoch": 5.349761526232115, "grad_norm": 5.942926854997057, "learning_rate": 4.037225088165146e-05, "loss": 0.1879, "step": 3365 }, { "epoch": 5.351351351351352, "grad_norm": 5.709530159618127, "learning_rate": 4.037806455024391e-05, "loss": 0.3747, "step": 3366 }, { "epoch": 5.352941176470588, "grad_norm": 5.921600595461843, "learning_rate": 4.03838780908752e-05, "loss": 0.1576, "step": 3367 }, { "epoch": 5.354531001589825, "grad_norm": 2.8304427046862135, "learning_rate": 4.038969150157766e-05, "loss": 0.1523, "step": 3368 }, { "epoch": 5.356120826709062, "grad_norm": 4.590225166731911, "learning_rate": 4.039550478038365e-05, "loss": 0.1767, "step": 3369 }, { "epoch": 5.357710651828299, "grad_norm": 7.07948111921339, "learning_rate": 4.04013179253256e-05, "loss": 0.1522, "step": 3370 }, { "epoch": 5.359300476947536, "grad_norm": 3.9079634668104393, "learning_rate": 4.040713093443596e-05, "loss": 0.1633, "step": 3371 }, { "epoch": 5.360890302066773, "grad_norm": 4.2952145187029585, "learning_rate": 4.0412943805747245e-05, "loss": 0.1844, "step": 3372 }, { "epoch": 5.36248012718601, "grad_norm": 22.065681080778937, "learning_rate": 4.0418756537292e-05, "loss": 2.134, "step": 3373 }, { "epoch": 5.364069952305247, "grad_norm": 3.1645902101629457, "learning_rate": 4.042456912710283e-05, "loss": 0.143, "step": 3374 }, { "epoch": 5.365659777424483, "grad_norm": 68.07460740384182, "learning_rate": 4.043038157321238e-05, "loss": 11.0958, "step": 3375 }, { "epoch": 5.36724960254372, "grad_norm": 84.63360063316766, "learning_rate": 4.043619387365336e-05, "loss": 17.6538, "step": 3376 }, { "epoch": 5.368839427662957, "grad_norm": 7.687785150814514, "learning_rate": 4.044200602645851e-05, "loss": 0.1898, "step": 3377 }, { "epoch": 5.370429252782194, "grad_norm": 6.8688803067296735, "learning_rate": 4.044781802966062e-05, "loss": 0.2827, "step": 3378 }, { "epoch": 5.372019077901431, "grad_norm": 4.861078091775898, "learning_rate": 4.045362988129254e-05, "loss": 0.1649, "step": 3379 }, { "epoch": 5.373608903020668, "grad_norm": 6.193300038722646, "learning_rate": 4.045944157938718e-05, "loss": 0.1798, "step": 3380 }, { "epoch": 5.375198728139905, "grad_norm": 7.897007442024807, "learning_rate": 4.046525312197747e-05, "loss": 0.1475, "step": 3381 }, { "epoch": 5.376788553259141, "grad_norm": 5.578984562435793, "learning_rate": 4.047106450709643e-05, "loss": 0.1526, "step": 3382 }, { "epoch": 5.378378378378378, "grad_norm": 6.946858136019721, "learning_rate": 4.047687573277711e-05, "loss": 0.169, "step": 3383 }, { "epoch": 5.379968203497615, "grad_norm": 6.99040305123249, "learning_rate": 4.048268679705262e-05, "loss": 0.188, "step": 3384 }, { "epoch": 5.381558028616852, "grad_norm": 6.178934717371057, "learning_rate": 4.048849769795613e-05, "loss": 0.1676, "step": 3385 }, { "epoch": 5.383147853736089, "grad_norm": 4.124958855854203, "learning_rate": 4.049430843352086e-05, "loss": 0.1746, "step": 3386 }, { "epoch": 5.384737678855326, "grad_norm": 629.043649003002, "learning_rate": 4.0500119001780084e-05, "loss": 10.3385, "step": 3387 }, { "epoch": 5.386327503974563, "grad_norm": 5.800127781007638, "learning_rate": 4.0505929400767134e-05, "loss": 0.2342, "step": 3388 }, { "epoch": 5.3879173290938, "grad_norm": 8.80352542521553, "learning_rate": 4.05117396285154e-05, "loss": 0.2009, "step": 3389 }, { "epoch": 5.389507154213036, "grad_norm": 9.504592614212847, "learning_rate": 4.051754968305833e-05, "loss": 0.2688, "step": 3390 }, { "epoch": 5.391096979332273, "grad_norm": 4.686426137358139, "learning_rate": 4.052335956242944e-05, "loss": 0.2281, "step": 3391 }, { "epoch": 5.39268680445151, "grad_norm": 4.764332275994334, "learning_rate": 4.052916926466229e-05, "loss": 0.2784, "step": 3392 }, { "epoch": 5.394276629570747, "grad_norm": 5.641877214753397, "learning_rate": 4.05349787877905e-05, "loss": 0.2065, "step": 3393 }, { "epoch": 5.395866454689984, "grad_norm": 4.75842647787935, "learning_rate": 4.0540788129847756e-05, "loss": 0.1942, "step": 3394 }, { "epoch": 5.397456279809221, "grad_norm": 5.419073537455076, "learning_rate": 4.0546597288867814e-05, "loss": 0.1938, "step": 3395 }, { "epoch": 5.399046104928458, "grad_norm": 5.153260302590068, "learning_rate": 4.0552406262884486e-05, "loss": 0.1726, "step": 3396 }, { "epoch": 5.400635930047695, "grad_norm": 5.793451002280416, "learning_rate": 4.055821504993164e-05, "loss": 0.2244, "step": 3397 }, { "epoch": 5.402225755166931, "grad_norm": 4.654020802446549, "learning_rate": 4.056402364804321e-05, "loss": 0.1899, "step": 3398 }, { "epoch": 5.403815580286168, "grad_norm": 3.369713826262213, "learning_rate": 4.05698320552532e-05, "loss": 0.1532, "step": 3399 }, { "epoch": 5.405405405405405, "grad_norm": 5.063153721530426, "learning_rate": 4.057564026959568e-05, "loss": 0.1907, "step": 3400 }, { "epoch": 5.406995230524642, "grad_norm": 4.728035001353066, "learning_rate": 4.058144828910476e-05, "loss": 0.1683, "step": 3401 }, { "epoch": 5.408585055643879, "grad_norm": 3.2048221268248507, "learning_rate": 4.058725611181465e-05, "loss": 0.1747, "step": 3402 }, { "epoch": 5.410174880763116, "grad_norm": 6.010211143757257, "learning_rate": 4.059306373575962e-05, "loss": 0.7913, "step": 3403 }, { "epoch": 5.411764705882353, "grad_norm": 2.51097635330686, "learning_rate": 4.059887115897398e-05, "loss": 0.1546, "step": 3404 }, { "epoch": 5.413354531001589, "grad_norm": 2.949495837613159, "learning_rate": 4.060467837949215e-05, "loss": 0.2043, "step": 3405 }, { "epoch": 5.414944356120826, "grad_norm": 3.636139195685939, "learning_rate": 4.0610485395348575e-05, "loss": 0.1881, "step": 3406 }, { "epoch": 5.416534181240063, "grad_norm": 3.8135306184627114, "learning_rate": 4.0616292204577794e-05, "loss": 0.2721, "step": 3407 }, { "epoch": 5.4181240063593, "grad_norm": 2.5776997873427323, "learning_rate": 4.062209880521443e-05, "loss": 0.1456, "step": 3408 }, { "epoch": 5.419713831478537, "grad_norm": 4.862697346584616, "learning_rate": 4.062790519529314e-05, "loss": 0.1784, "step": 3409 }, { "epoch": 5.421303656597774, "grad_norm": 23.804509056014602, "learning_rate": 4.063371137284868e-05, "loss": 2.0955, "step": 3410 }, { "epoch": 5.422893481717011, "grad_norm": 3.1810064246810312, "learning_rate": 4.063951733591586e-05, "loss": 0.1672, "step": 3411 }, { "epoch": 5.424483306836248, "grad_norm": 2.825769729708267, "learning_rate": 4.0645323082529576e-05, "loss": 0.2175, "step": 3412 }, { "epoch": 5.426073131955485, "grad_norm": 3.749217123506885, "learning_rate": 4.0651128610724813e-05, "loss": 0.1186, "step": 3413 }, { "epoch": 5.4276629570747215, "grad_norm": 4.737349738752162, "learning_rate": 4.065693391853658e-05, "loss": 0.1738, "step": 3414 }, { "epoch": 5.4292527821939585, "grad_norm": 2.3756235189647015, "learning_rate": 4.0662739004e-05, "loss": 0.1469, "step": 3415 }, { "epoch": 5.4308426073131955, "grad_norm": 1.9303143586524525, "learning_rate": 4.0668543865150274e-05, "loss": 0.1455, "step": 3416 }, { "epoch": 5.4324324324324325, "grad_norm": 3.8194861119842733, "learning_rate": 4.0674348500022654e-05, "loss": 0.1926, "step": 3417 }, { "epoch": 5.4340222575516695, "grad_norm": 4.737618187670062, "learning_rate": 4.0680152906652485e-05, "loss": 0.2661, "step": 3418 }, { "epoch": 5.4356120826709065, "grad_norm": 3.092654360607652, "learning_rate": 4.068595708307518e-05, "loss": 0.1544, "step": 3419 }, { "epoch": 5.4372019077901435, "grad_norm": 3.936886971436258, "learning_rate": 4.069176102732625e-05, "loss": 0.1809, "step": 3420 }, { "epoch": 5.43879173290938, "grad_norm": 5.107064397296858, "learning_rate": 4.069756473744125e-05, "loss": 0.185, "step": 3421 }, { "epoch": 5.440381558028617, "grad_norm": 4.854920272273713, "learning_rate": 4.070336821145586e-05, "loss": 0.1546, "step": 3422 }, { "epoch": 5.441971383147854, "grad_norm": 4.791076619573743, "learning_rate": 4.0709171447405785e-05, "loss": 0.203, "step": 3423 }, { "epoch": 5.443561208267091, "grad_norm": 3.0663372186779116, "learning_rate": 4.071497444332686e-05, "loss": 0.1726, "step": 3424 }, { "epoch": 5.4451510333863276, "grad_norm": 5.314565504562536, "learning_rate": 4.0720777197254975e-05, "loss": 0.1975, "step": 3425 }, { "epoch": 5.4467408585055646, "grad_norm": 2.090052674935401, "learning_rate": 4.072657970722611e-05, "loss": 0.2041, "step": 3426 }, { "epoch": 5.4483306836248016, "grad_norm": 5.59213317121822, "learning_rate": 4.073238197127632e-05, "loss": 0.1644, "step": 3427 }, { "epoch": 5.4499205087440385, "grad_norm": 2.7828319422145733, "learning_rate": 4.073818398744175e-05, "loss": 0.1732, "step": 3428 }, { "epoch": 5.451510333863275, "grad_norm": 3.91583310076718, "learning_rate": 4.074398575375863e-05, "loss": 0.1978, "step": 3429 }, { "epoch": 5.453100158982512, "grad_norm": 2.3144339957234217, "learning_rate": 4.0749787268263275e-05, "loss": 0.1445, "step": 3430 }, { "epoch": 5.454689984101749, "grad_norm": 4.143710219264504, "learning_rate": 4.075558852899208e-05, "loss": 0.2212, "step": 3431 }, { "epoch": 5.456279809220986, "grad_norm": 3.5193712714714507, "learning_rate": 4.076138953398153e-05, "loss": 0.1956, "step": 3432 }, { "epoch": 5.457869634340223, "grad_norm": 4.586965476915138, "learning_rate": 4.076719028126819e-05, "loss": 0.1794, "step": 3433 }, { "epoch": 5.45945945945946, "grad_norm": 3.9206468694213323, "learning_rate": 4.077299076888872e-05, "loss": 0.1624, "step": 3434 }, { "epoch": 5.461049284578697, "grad_norm": 2.6218352222848385, "learning_rate": 4.077879099487986e-05, "loss": 0.1948, "step": 3435 }, { "epoch": 5.462639109697934, "grad_norm": 2.3794331758871445, "learning_rate": 4.0784590957278455e-05, "loss": 0.1657, "step": 3436 }, { "epoch": 5.46422893481717, "grad_norm": 4.473880039184331, "learning_rate": 4.079039065412141e-05, "loss": 0.1603, "step": 3437 }, { "epoch": 5.465818759936407, "grad_norm": 6.210250484301298, "learning_rate": 4.079619008344576e-05, "loss": 0.2471, "step": 3438 }, { "epoch": 5.467408585055644, "grad_norm": 1.874490578051546, "learning_rate": 4.080198924328859e-05, "loss": 0.1566, "step": 3439 }, { "epoch": 5.468998410174881, "grad_norm": 3.504553237225042, "learning_rate": 4.08077881316871e-05, "loss": 0.2066, "step": 3440 }, { "epoch": 5.470588235294118, "grad_norm": 3.1977245279355153, "learning_rate": 4.0813586746678584e-05, "loss": 0.1604, "step": 3441 }, { "epoch": 5.472178060413355, "grad_norm": 2.2377346091992556, "learning_rate": 4.081938508630041e-05, "loss": 0.1283, "step": 3442 }, { "epoch": 5.473767885532592, "grad_norm": 2.7994478914366847, "learning_rate": 4.0825183148590054e-05, "loss": 0.1715, "step": 3443 }, { "epoch": 5.475357710651828, "grad_norm": 77.7064826261626, "learning_rate": 4.083098093158508e-05, "loss": 2.062, "step": 3444 }, { "epoch": 5.476947535771065, "grad_norm": 4.515084357690424, "learning_rate": 4.083677843332315e-05, "loss": 0.2102, "step": 3445 }, { "epoch": 5.478537360890302, "grad_norm": 4.040298926171633, "learning_rate": 4.0842575651842024e-05, "loss": 0.1736, "step": 3446 }, { "epoch": 5.480127186009539, "grad_norm": 5.981077272348579, "learning_rate": 4.084837258517955e-05, "loss": 0.1643, "step": 3447 }, { "epoch": 5.481717011128776, "grad_norm": 5.658248989481495, "learning_rate": 4.085416923137368e-05, "loss": 0.1601, "step": 3448 }, { "epoch": 5.483306836248013, "grad_norm": 3.328226001593728, "learning_rate": 4.085996558846244e-05, "loss": 0.1921, "step": 3449 }, { "epoch": 5.48489666136725, "grad_norm": 4.9879068672674265, "learning_rate": 4.0865761654484e-05, "loss": 0.1886, "step": 3450 }, { "epoch": 5.486486486486487, "grad_norm": 4.378168725476973, "learning_rate": 4.087155742747659e-05, "loss": 0.1884, "step": 3451 }, { "epoch": 5.488076311605723, "grad_norm": 5.513338832678585, "learning_rate": 4.087735290547854e-05, "loss": 0.1193, "step": 3452 }, { "epoch": 5.48966613672496, "grad_norm": 3.4217011376955715, "learning_rate": 4.0883148086528305e-05, "loss": 0.1533, "step": 3453 }, { "epoch": 5.491255961844197, "grad_norm": 3.738968097429164, "learning_rate": 4.088894296866442e-05, "loss": 0.1932, "step": 3454 }, { "epoch": 5.492845786963434, "grad_norm": 3.4510870249090204, "learning_rate": 4.0894737549925524e-05, "loss": 0.1211, "step": 3455 }, { "epoch": 5.494435612082671, "grad_norm": 3.714451763025221, "learning_rate": 4.090053182835037e-05, "loss": 0.1301, "step": 3456 }, { "epoch": 5.496025437201908, "grad_norm": 4.610617076349642, "learning_rate": 4.09063258019778e-05, "loss": 0.1808, "step": 3457 }, { "epoch": 5.497615262321145, "grad_norm": 26.090143820000243, "learning_rate": 4.0912119468846766e-05, "loss": 2.3678, "step": 3458 }, { "epoch": 5.499205087440382, "grad_norm": 4.372755050191866, "learning_rate": 4.091791282699632e-05, "loss": 0.1868, "step": 3459 }, { "epoch": 5.500794912559618, "grad_norm": 4.030548647089388, "learning_rate": 4.092370587446562e-05, "loss": 0.1755, "step": 3460 }, { "epoch": 5.502384737678855, "grad_norm": 4.3315802951379245, "learning_rate": 4.092949860929392e-05, "loss": 0.1769, "step": 3461 }, { "epoch": 5.503974562798092, "grad_norm": 4.689817135888319, "learning_rate": 4.0935291029520606e-05, "loss": 0.2921, "step": 3462 }, { "epoch": 5.505564387917329, "grad_norm": 4.266144627485116, "learning_rate": 4.094108313318514e-05, "loss": 0.2374, "step": 3463 }, { "epoch": 5.507154213036566, "grad_norm": 3.820662567916575, "learning_rate": 4.0946874918327116e-05, "loss": 0.1415, "step": 3464 }, { "epoch": 5.508744038155803, "grad_norm": 2.643315299360915, "learning_rate": 4.095266638298622e-05, "loss": 0.1536, "step": 3465 }, { "epoch": 5.51033386327504, "grad_norm": 4.521595746406973, "learning_rate": 4.0958457525202244e-05, "loss": 0.1904, "step": 3466 }, { "epoch": 5.511923688394276, "grad_norm": 4.505696567813395, "learning_rate": 4.09642483430151e-05, "loss": 0.1652, "step": 3467 }, { "epoch": 5.513513513513513, "grad_norm": 4.014245951067984, "learning_rate": 4.097003883446481e-05, "loss": 0.1935, "step": 3468 }, { "epoch": 5.51510333863275, "grad_norm": 5.781521091656406, "learning_rate": 4.09758289975915e-05, "loss": 0.1777, "step": 3469 }, { "epoch": 5.516693163751987, "grad_norm": 3.6578266230706102, "learning_rate": 4.098161883043541e-05, "loss": 0.1162, "step": 3470 }, { "epoch": 5.518282988871224, "grad_norm": 2.3132288442970528, "learning_rate": 4.098740833103688e-05, "loss": 0.1108, "step": 3471 }, { "epoch": 5.519872813990461, "grad_norm": 5.407748163798726, "learning_rate": 4.0993197497436386e-05, "loss": 0.1801, "step": 3472 }, { "epoch": 5.521462639109698, "grad_norm": 3.329793890699531, "learning_rate": 4.099898632767451e-05, "loss": 0.1318, "step": 3473 }, { "epoch": 5.523052464228935, "grad_norm": 5.910665660379687, "learning_rate": 4.1004774819791934e-05, "loss": 0.1569, "step": 3474 }, { "epoch": 5.524642289348172, "grad_norm": 5.050827384682833, "learning_rate": 4.101056297182947e-05, "loss": 0.3092, "step": 3475 }, { "epoch": 5.526232114467408, "grad_norm": 4.0948264063279165, "learning_rate": 4.1016350781828025e-05, "loss": 0.1782, "step": 3476 }, { "epoch": 5.527821939586645, "grad_norm": 5.630487810895971, "learning_rate": 4.102213824782864e-05, "loss": 0.216, "step": 3477 }, { "epoch": 5.529411764705882, "grad_norm": 2.800741145942806, "learning_rate": 4.102792536787247e-05, "loss": 0.1505, "step": 3478 }, { "epoch": 5.531001589825119, "grad_norm": 3.194379471610461, "learning_rate": 4.103371214000079e-05, "loss": 0.1597, "step": 3479 }, { "epoch": 5.532591414944356, "grad_norm": 4.740916322156127, "learning_rate": 4.103949856225497e-05, "loss": 0.1697, "step": 3480 }, { "epoch": 5.534181240063593, "grad_norm": 5.1889471271261, "learning_rate": 4.1045284632676536e-05, "loss": 0.1675, "step": 3481 }, { "epoch": 5.53577106518283, "grad_norm": 2.5931350980961354, "learning_rate": 4.1051070349307106e-05, "loss": 0.1266, "step": 3482 }, { "epoch": 5.537360890302066, "grad_norm": 3.2627854694719414, "learning_rate": 4.105685571018841e-05, "loss": 0.1738, "step": 3483 }, { "epoch": 5.538950715421303, "grad_norm": 3.6638465657779156, "learning_rate": 4.106264071336233e-05, "loss": 0.1588, "step": 3484 }, { "epoch": 5.54054054054054, "grad_norm": 1.7559859394128108, "learning_rate": 4.1068425356870854e-05, "loss": 0.1651, "step": 3485 }, { "epoch": 5.542130365659777, "grad_norm": 6.152629230582684, "learning_rate": 4.1074209638756075e-05, "loss": 0.1495, "step": 3486 }, { "epoch": 5.543720190779014, "grad_norm": 3.565802993346733, "learning_rate": 4.107999355706023e-05, "loss": 0.196, "step": 3487 }, { "epoch": 5.545310015898251, "grad_norm": 4.108758945349951, "learning_rate": 4.108577710982568e-05, "loss": 0.1624, "step": 3488 }, { "epoch": 5.546899841017488, "grad_norm": 4.029915692082075, "learning_rate": 4.109156029509488e-05, "loss": 0.1662, "step": 3489 }, { "epoch": 5.548489666136725, "grad_norm": 5.382498226749465, "learning_rate": 4.1097343110910455e-05, "loss": 0.167, "step": 3490 }, { "epoch": 5.550079491255962, "grad_norm": 5.575061430864564, "learning_rate": 4.110312555531512e-05, "loss": 0.1843, "step": 3491 }, { "epoch": 5.5516693163751984, "grad_norm": 3.269901997475535, "learning_rate": 4.110890762635173e-05, "loss": 0.1276, "step": 3492 }, { "epoch": 5.5532591414944354, "grad_norm": 4.133496963063792, "learning_rate": 4.1114689322063256e-05, "loss": 0.1842, "step": 3493 }, { "epoch": 5.5548489666136724, "grad_norm": 8.018012174165118, "learning_rate": 4.112047064049281e-05, "loss": 0.2136, "step": 3494 }, { "epoch": 5.556438791732909, "grad_norm": 2.866702799949682, "learning_rate": 4.112625157968363e-05, "loss": 0.1271, "step": 3495 }, { "epoch": 5.558028616852146, "grad_norm": 10.186479284214233, "learning_rate": 4.1132032137679066e-05, "loss": 0.1777, "step": 3496 }, { "epoch": 5.559618441971383, "grad_norm": 16.182862053960115, "learning_rate": 4.113781231252262e-05, "loss": 1.9994, "step": 3497 }, { "epoch": 5.56120826709062, "grad_norm": 12.086501949131218, "learning_rate": 4.1143592102257905e-05, "loss": 0.1936, "step": 3498 }, { "epoch": 5.5627980922098565, "grad_norm": 6.252209674515089, "learning_rate": 4.114937150492866e-05, "loss": 0.1851, "step": 3499 }, { "epoch": 5.5643879173290935, "grad_norm": 2.869925137699074, "learning_rate": 4.115515051857879e-05, "loss": 0.1697, "step": 3500 }, { "epoch": 5.5659777424483305, "grad_norm": 6.882674873492677, "learning_rate": 4.1160929141252305e-05, "loss": 0.1742, "step": 3501 }, { "epoch": 5.5675675675675675, "grad_norm": 10.152593929931726, "learning_rate": 4.1166707370993335e-05, "loss": 0.2197, "step": 3502 }, { "epoch": 5.5691573926868045, "grad_norm": 4.81208611317315, "learning_rate": 4.117248520584616e-05, "loss": 0.1172, "step": 3503 }, { "epoch": 5.5707472178060415, "grad_norm": 9.503223835166155, "learning_rate": 4.117826264385521e-05, "loss": 0.1773, "step": 3504 }, { "epoch": 5.5723370429252785, "grad_norm": 4.262640492987117, "learning_rate": 4.118403968306502e-05, "loss": 0.152, "step": 3505 }, { "epoch": 5.573926868044515, "grad_norm": 5.755438896354904, "learning_rate": 4.1189816321520256e-05, "loss": 0.1557, "step": 3506 }, { "epoch": 5.575516693163752, "grad_norm": 6.117060521763252, "learning_rate": 4.119559255726576e-05, "loss": 0.1416, "step": 3507 }, { "epoch": 5.577106518282989, "grad_norm": 6.845977426783046, "learning_rate": 4.1201368388346474e-05, "loss": 0.1778, "step": 3508 }, { "epoch": 5.578696343402226, "grad_norm": 2.8193206146673324, "learning_rate": 4.120714381280749e-05, "loss": 0.1704, "step": 3509 }, { "epoch": 5.580286168521463, "grad_norm": 5.274255981726564, "learning_rate": 4.1212918828694036e-05, "loss": 0.1602, "step": 3510 }, { "epoch": 5.5818759936407, "grad_norm": 6.495705653383769, "learning_rate": 4.1218693434051476e-05, "loss": 0.1593, "step": 3511 }, { "epoch": 5.583465818759937, "grad_norm": 3.4638060279956697, "learning_rate": 4.122446762692532e-05, "loss": 0.1601, "step": 3512 }, { "epoch": 5.585055643879174, "grad_norm": 4.189082616840884, "learning_rate": 4.1230241405361206e-05, "loss": 0.1283, "step": 3513 }, { "epoch": 5.586645468998411, "grad_norm": 5.025504710962899, "learning_rate": 4.1236014767404926e-05, "loss": 0.1797, "step": 3514 }, { "epoch": 5.588235294117647, "grad_norm": 4.4796608788917895, "learning_rate": 4.124178771110241e-05, "loss": 0.1708, "step": 3515 }, { "epoch": 5.589825119236884, "grad_norm": 4.028955413443789, "learning_rate": 4.124756023449971e-05, "loss": 0.1355, "step": 3516 }, { "epoch": 5.591414944356121, "grad_norm": 5.343291678228311, "learning_rate": 4.125333233564305e-05, "loss": 0.1673, "step": 3517 }, { "epoch": 5.593004769475358, "grad_norm": 3.3760154092971026, "learning_rate": 4.125910401257877e-05, "loss": 0.1639, "step": 3518 }, { "epoch": 5.594594594594595, "grad_norm": 4.265866276845395, "learning_rate": 4.1264875263353375e-05, "loss": 0.1854, "step": 3519 }, { "epoch": 5.596184419713832, "grad_norm": 4.233028644070434, "learning_rate": 4.127064608601351e-05, "loss": 0.1335, "step": 3520 }, { "epoch": 5.597774244833069, "grad_norm": 2.540447137073106, "learning_rate": 4.1276416478605945e-05, "loss": 0.1658, "step": 3521 }, { "epoch": 5.599364069952305, "grad_norm": 5.280107269674043, "learning_rate": 4.128218643917763e-05, "loss": 0.2377, "step": 3522 }, { "epoch": 5.600953895071542, "grad_norm": 5.135579492734996, "learning_rate": 4.128795596577563e-05, "loss": 0.2164, "step": 3523 }, { "epoch": 5.602543720190779, "grad_norm": 2.768460961072922, "learning_rate": 4.129372505644717e-05, "loss": 0.1126, "step": 3524 }, { "epoch": 5.604133545310016, "grad_norm": 2.2740281553732395, "learning_rate": 4.129949370923963e-05, "loss": 0.1401, "step": 3525 }, { "epoch": 5.605723370429253, "grad_norm": 3.723624952247425, "learning_rate": 4.1305261922200514e-05, "loss": 0.1469, "step": 3526 }, { "epoch": 5.60731319554849, "grad_norm": 2.0350564121791783, "learning_rate": 4.131102969337751e-05, "loss": 0.2077, "step": 3527 }, { "epoch": 5.608903020667727, "grad_norm": 3.3865523363243315, "learning_rate": 4.1316797020818426e-05, "loss": 0.1417, "step": 3528 }, { "epoch": 5.610492845786963, "grad_norm": 5.037453867759402, "learning_rate": 4.132256390257123e-05, "loss": 0.1559, "step": 3529 }, { "epoch": 5.6120826709062, "grad_norm": 3.7287961061159094, "learning_rate": 4.132833033668404e-05, "loss": 0.1704, "step": 3530 }, { "epoch": 5.613672496025437, "grad_norm": 7.850719308824216, "learning_rate": 4.133409632120513e-05, "loss": 0.119, "step": 3531 }, { "epoch": 5.615262321144674, "grad_norm": 4.862917991230707, "learning_rate": 4.133986185418292e-05, "loss": 0.1487, "step": 3532 }, { "epoch": 5.616852146263911, "grad_norm": 3.0442252194192285, "learning_rate": 4.134562693366599e-05, "loss": 0.137, "step": 3533 }, { "epoch": 5.618441971383148, "grad_norm": 3.8491770703181136, "learning_rate": 4.135139155770307e-05, "loss": 0.1897, "step": 3534 }, { "epoch": 5.620031796502385, "grad_norm": 4.0024578731714175, "learning_rate": 4.1357155724343045e-05, "loss": 0.1691, "step": 3535 }, { "epoch": 5.621621621621622, "grad_norm": 4.682485120463948, "learning_rate": 4.136291943163495e-05, "loss": 0.1519, "step": 3536 }, { "epoch": 5.623211446740859, "grad_norm": 2.458240983344537, "learning_rate": 4.136868267762797e-05, "loss": 0.1609, "step": 3537 }, { "epoch": 5.624801271860095, "grad_norm": 9.68324067321005, "learning_rate": 4.137444546037147e-05, "loss": 0.1657, "step": 3538 }, { "epoch": 5.626391096979332, "grad_norm": 3.215412148341009, "learning_rate": 4.138020777791495e-05, "loss": 0.1722, "step": 3539 }, { "epoch": 5.627980922098569, "grad_norm": 6.878029685423107, "learning_rate": 4.138596962830806e-05, "loss": 0.2372, "step": 3540 }, { "epoch": 5.629570747217806, "grad_norm": 4.115927930756329, "learning_rate": 4.1391731009600655e-05, "loss": 0.1461, "step": 3541 }, { "epoch": 5.631160572337043, "grad_norm": 4.951904190929962, "learning_rate": 4.139749191984269e-05, "loss": 0.2153, "step": 3542 }, { "epoch": 5.63275039745628, "grad_norm": 13.418561158520081, "learning_rate": 4.1403252357084316e-05, "loss": 0.3356, "step": 3543 }, { "epoch": 5.634340222575517, "grad_norm": 3.2824328690176996, "learning_rate": 4.140901231937583e-05, "loss": 0.1495, "step": 3544 }, { "epoch": 5.635930047694753, "grad_norm": 6.529084884491945, "learning_rate": 4.141477180476769e-05, "loss": 0.1622, "step": 3545 }, { "epoch": 5.63751987281399, "grad_norm": 6.981152502970928, "learning_rate": 4.142053081131053e-05, "loss": 0.2736, "step": 3546 }, { "epoch": 5.639109697933227, "grad_norm": 2.5822003361734747, "learning_rate": 4.1426289337055115e-05, "loss": 0.143, "step": 3547 }, { "epoch": 5.640699523052464, "grad_norm": 5.861439918078993, "learning_rate": 4.1432047380052415e-05, "loss": 0.1654, "step": 3548 }, { "epoch": 5.642289348171701, "grad_norm": 13.288406931426305, "learning_rate": 4.143780493835353e-05, "loss": 1.2675, "step": 3549 }, { "epoch": 5.643879173290938, "grad_norm": 5.2120525299928575, "learning_rate": 4.144356201000973e-05, "loss": 0.143, "step": 3550 }, { "epoch": 5.645468998410175, "grad_norm": 8.893154766437872, "learning_rate": 4.144931859307247e-05, "loss": 0.2851, "step": 3551 }, { "epoch": 5.647058823529412, "grad_norm": 7.137810325615889, "learning_rate": 4.1455074685593344e-05, "loss": 0.2875, "step": 3552 }, { "epoch": 5.648648648648649, "grad_norm": 8.881645599930087, "learning_rate": 4.146083028562412e-05, "loss": 0.1904, "step": 3553 }, { "epoch": 5.650238473767885, "grad_norm": 4.62432636268524, "learning_rate": 4.1466585391216735e-05, "loss": 0.1551, "step": 3554 }, { "epoch": 5.651828298887122, "grad_norm": 5.171092264271625, "learning_rate": 4.1472340000423315e-05, "loss": 0.2183, "step": 3555 }, { "epoch": 5.653418124006359, "grad_norm": 3.9362504483298464, "learning_rate": 4.1478094111296106e-05, "loss": 0.1671, "step": 3556 }, { "epoch": 5.655007949125596, "grad_norm": 2.701500458143243, "learning_rate": 4.148384772188757e-05, "loss": 0.159, "step": 3557 }, { "epoch": 5.656597774244833, "grad_norm": 5.981794997363775, "learning_rate": 4.148960083025031e-05, "loss": 0.2941, "step": 3558 }, { "epoch": 5.65818759936407, "grad_norm": 3.532599263157308, "learning_rate": 4.14953534344371e-05, "loss": 0.1597, "step": 3559 }, { "epoch": 5.659777424483307, "grad_norm": 4.042307834639393, "learning_rate": 4.15011055325009e-05, "loss": 0.1455, "step": 3560 }, { "epoch": 5.661367249602543, "grad_norm": 5.934255130100581, "learning_rate": 4.150685712249483e-05, "loss": 0.1878, "step": 3561 }, { "epoch": 5.66295707472178, "grad_norm": 4.493916496764399, "learning_rate": 4.1512608202472196e-05, "loss": 0.2118, "step": 3562 }, { "epoch": 5.664546899841017, "grad_norm": 7.626155022752274, "learning_rate": 4.151835877048645e-05, "loss": 0.2025, "step": 3563 }, { "epoch": 5.666136724960254, "grad_norm": 4.808823162394276, "learning_rate": 4.152410882459124e-05, "loss": 0.2459, "step": 3564 }, { "epoch": 5.667726550079491, "grad_norm": 5.684340727044658, "learning_rate": 4.152985836284038e-05, "loss": 0.1667, "step": 3565 }, { "epoch": 5.669316375198728, "grad_norm": 5.310663674205226, "learning_rate": 4.153560738328786e-05, "loss": 0.2217, "step": 3566 }, { "epoch": 5.670906200317965, "grad_norm": 4.285982633120311, "learning_rate": 4.154135588398785e-05, "loss": 0.1844, "step": 3567 }, { "epoch": 5.672496025437201, "grad_norm": 5.853740228976494, "learning_rate": 4.154710386299468e-05, "loss": 0.3292, "step": 3568 }, { "epoch": 5.674085850556438, "grad_norm": 2.4129165594917357, "learning_rate": 4.155285131836288e-05, "loss": 0.1655, "step": 3569 }, { "epoch": 5.675675675675675, "grad_norm": 2.84498988791173, "learning_rate": 4.155859824814713e-05, "loss": 0.1474, "step": 3570 }, { "epoch": 5.677265500794912, "grad_norm": 5.441554907570572, "learning_rate": 4.156434465040231e-05, "loss": 0.2917, "step": 3571 }, { "epoch": 5.678855325914149, "grad_norm": 3.7338267088847683, "learning_rate": 4.1570090523183476e-05, "loss": 0.2343, "step": 3572 }, { "epoch": 5.680445151033386, "grad_norm": 4.866911027686962, "learning_rate": 4.1575835864545846e-05, "loss": 0.1897, "step": 3573 }, { "epoch": 5.682034976152623, "grad_norm": 4.540194655482006, "learning_rate": 4.158158067254484e-05, "loss": 0.1764, "step": 3574 }, { "epoch": 5.68362480127186, "grad_norm": 4.6418071580685165, "learning_rate": 4.158732494523604e-05, "loss": 0.179, "step": 3575 }, { "epoch": 5.685214626391097, "grad_norm": 6.466668194920761, "learning_rate": 4.159306868067522e-05, "loss": 0.2702, "step": 3576 }, { "epoch": 5.6868044515103335, "grad_norm": 4.354666345274069, "learning_rate": 4.159881187691835e-05, "loss": 0.2326, "step": 3577 }, { "epoch": 5.6883942766295705, "grad_norm": 6.5937533384199085, "learning_rate": 4.160455453202154e-05, "loss": 0.1824, "step": 3578 }, { "epoch": 5.6899841017488075, "grad_norm": 4.6294136006138515, "learning_rate": 4.1610296644041135e-05, "loss": 0.1773, "step": 3579 }, { "epoch": 5.6915739268680445, "grad_norm": 5.155299628758655, "learning_rate": 4.161603821103361e-05, "loss": 0.2244, "step": 3580 }, { "epoch": 5.6931637519872815, "grad_norm": 6.6661039647989995, "learning_rate": 4.162177923105567e-05, "loss": 0.1957, "step": 3581 }, { "epoch": 5.6947535771065185, "grad_norm": 4.906444088300017, "learning_rate": 4.162751970216419e-05, "loss": 0.1311, "step": 3582 }, { "epoch": 5.6963434022257555, "grad_norm": 4.373817815093437, "learning_rate": 4.163325962241622e-05, "loss": 0.1196, "step": 3583 }, { "epoch": 5.697933227344992, "grad_norm": 4.302527051624377, "learning_rate": 4.1638998989869015e-05, "loss": 0.1549, "step": 3584 }, { "epoch": 5.699523052464229, "grad_norm": 5.1294306981653275, "learning_rate": 4.1644737802579986e-05, "loss": 0.265, "step": 3585 }, { "epoch": 5.701112877583466, "grad_norm": 4.2444931529491186, "learning_rate": 4.165047605860678e-05, "loss": 0.1624, "step": 3586 }, { "epoch": 5.702702702702703, "grad_norm": 2.8026001866276324, "learning_rate": 4.165621375600719e-05, "loss": 0.1405, "step": 3587 }, { "epoch": 5.70429252782194, "grad_norm": 6.991211036086354, "learning_rate": 4.166195089283921e-05, "loss": 0.1556, "step": 3588 }, { "epoch": 5.705882352941177, "grad_norm": 3.83167452029789, "learning_rate": 4.1667687467161024e-05, "loss": 0.1986, "step": 3589 }, { "epoch": 5.707472178060414, "grad_norm": 3.4441179490614333, "learning_rate": 4.167342347703102e-05, "loss": 0.1844, "step": 3590 }, { "epoch": 5.709062003179651, "grad_norm": 6.941645808711311, "learning_rate": 4.1679158920507774e-05, "loss": 0.2263, "step": 3591 }, { "epoch": 5.710651828298887, "grad_norm": 4.929715151184763, "learning_rate": 4.168489379565002e-05, "loss": 0.1374, "step": 3592 }, { "epoch": 5.712241653418124, "grad_norm": 4.437708232779739, "learning_rate": 4.169062810051674e-05, "loss": 0.2626, "step": 3593 }, { "epoch": 5.713831478537361, "grad_norm": 4.2083631330075075, "learning_rate": 4.169636183316706e-05, "loss": 0.1629, "step": 3594 }, { "epoch": 5.715421303656598, "grad_norm": 6.213039573154681, "learning_rate": 4.170209499166033e-05, "loss": 0.2183, "step": 3595 }, { "epoch": 5.717011128775835, "grad_norm": 3.983177329738075, "learning_rate": 4.170782757405607e-05, "loss": 0.1587, "step": 3596 }, { "epoch": 5.718600953895072, "grad_norm": 3.734816843258369, "learning_rate": 4.171355957841401e-05, "loss": 0.1399, "step": 3597 }, { "epoch": 5.720190779014309, "grad_norm": 4.507721694416351, "learning_rate": 4.17192910027941e-05, "loss": 0.1915, "step": 3598 }, { "epoch": 5.721780604133546, "grad_norm": 2.90596554102352, "learning_rate": 4.172502184525642e-05, "loss": 0.1824, "step": 3599 }, { "epoch": 5.723370429252782, "grad_norm": 2.7374824302826006, "learning_rate": 4.173075210386132e-05, "loss": 0.1884, "step": 3600 }, { "epoch": 5.724960254372019, "grad_norm": 16.4312468200261, "learning_rate": 4.173648177666931e-05, "loss": 9.0676, "step": 3601 }, { "epoch": 5.726550079491256, "grad_norm": 6.095352735102084, "learning_rate": 4.174221086174108e-05, "loss": 0.186, "step": 3602 }, { "epoch": 5.728139904610493, "grad_norm": 5.866689213660126, "learning_rate": 4.1747939357137565e-05, "loss": 0.148, "step": 3603 }, { "epoch": 5.72972972972973, "grad_norm": 2.7980511466346267, "learning_rate": 4.175366726091987e-05, "loss": 0.16, "step": 3604 }, { "epoch": 5.731319554848967, "grad_norm": 10.811645603901455, "learning_rate": 4.175939457114931e-05, "loss": 0.2012, "step": 3605 }, { "epoch": 5.732909379968204, "grad_norm": 4.505592112826139, "learning_rate": 4.176512128588739e-05, "loss": 0.1632, "step": 3606 }, { "epoch": 5.73449920508744, "grad_norm": 5.965046141745751, "learning_rate": 4.177084740319584e-05, "loss": 0.1502, "step": 3607 }, { "epoch": 5.736089030206677, "grad_norm": 10.814215843387212, "learning_rate": 4.177657292113655e-05, "loss": 0.3067, "step": 3608 }, { "epoch": 5.737678855325914, "grad_norm": 7.647187139727761, "learning_rate": 4.1782297837771665e-05, "loss": 0.3066, "step": 3609 }, { "epoch": 5.739268680445151, "grad_norm": 8.99357238066339, "learning_rate": 4.17880221511635e-05, "loss": 0.2102, "step": 3610 }, { "epoch": 5.740858505564388, "grad_norm": 9.40344301353052, "learning_rate": 4.179374585937458e-05, "loss": 0.1646, "step": 3611 }, { "epoch": 5.742448330683625, "grad_norm": 4.42341678843568, "learning_rate": 4.179946896046763e-05, "loss": 0.2147, "step": 3612 }, { "epoch": 5.744038155802862, "grad_norm": 5.6226941767604, "learning_rate": 4.18051914525056e-05, "loss": 0.1859, "step": 3613 }, { "epoch": 5.745627980922099, "grad_norm": 8.704121780433018, "learning_rate": 4.181091333355163e-05, "loss": 0.163, "step": 3614 }, { "epoch": 5.747217806041336, "grad_norm": 4.222689601477409, "learning_rate": 4.181663460166907e-05, "loss": 0.1378, "step": 3615 }, { "epoch": 5.748807631160572, "grad_norm": 3.6905314494685157, "learning_rate": 4.1822355254921475e-05, "loss": 0.159, "step": 3616 }, { "epoch": 5.750397456279809, "grad_norm": 7.292567704846656, "learning_rate": 4.182807529137262e-05, "loss": 0.2096, "step": 3617 }, { "epoch": 5.751987281399046, "grad_norm": 5.133862597059989, "learning_rate": 4.183379470908646e-05, "loss": 0.1501, "step": 3618 }, { "epoch": 5.753577106518283, "grad_norm": 4.682314580058747, "learning_rate": 4.1839513506127204e-05, "loss": 0.204, "step": 3619 }, { "epoch": 5.75516693163752, "grad_norm": 8.007037441243162, "learning_rate": 4.184523168055923e-05, "loss": 0.1654, "step": 3620 }, { "epoch": 5.756756756756757, "grad_norm": 3.8517680696524415, "learning_rate": 4.185094923044715e-05, "loss": 0.2107, "step": 3621 }, { "epoch": 5.758346581875994, "grad_norm": 3.5611696403569746, "learning_rate": 4.185666615385577e-05, "loss": 0.201, "step": 3622 }, { "epoch": 5.75993640699523, "grad_norm": 5.301085064891199, "learning_rate": 4.1862382448850136e-05, "loss": 0.1614, "step": 3623 }, { "epoch": 5.761526232114467, "grad_norm": 4.190785583028749, "learning_rate": 4.186809811349548e-05, "loss": 0.1476, "step": 3624 }, { "epoch": 5.763116057233704, "grad_norm": 2.5783138992477928, "learning_rate": 4.187381314585725e-05, "loss": 0.1681, "step": 3625 }, { "epoch": 5.764705882352941, "grad_norm": 4.841903769849468, "learning_rate": 4.187952754400112e-05, "loss": 0.1415, "step": 3626 }, { "epoch": 5.766295707472178, "grad_norm": 5.644910761060418, "learning_rate": 4.188524130599298e-05, "loss": 0.1385, "step": 3627 }, { "epoch": 5.767885532591415, "grad_norm": 3.7197968032903277, "learning_rate": 4.189095442989892e-05, "loss": 0.1648, "step": 3628 }, { "epoch": 5.769475357710652, "grad_norm": 4.933122926630562, "learning_rate": 4.1896666913785244e-05, "loss": 0.1943, "step": 3629 }, { "epoch": 5.771065182829888, "grad_norm": 3.0394631460685404, "learning_rate": 4.190237875571851e-05, "loss": 0.1561, "step": 3630 }, { "epoch": 5.772655007949125, "grad_norm": 2.6675420105977388, "learning_rate": 4.190808995376545e-05, "loss": 0.1707, "step": 3631 }, { "epoch": 5.774244833068362, "grad_norm": 3.0741117857822764, "learning_rate": 4.1913800505993026e-05, "loss": 0.1354, "step": 3632 }, { "epoch": 5.775834658187599, "grad_norm": 2.170031227202194, "learning_rate": 4.191951041046844e-05, "loss": 0.1616, "step": 3633 }, { "epoch": 5.777424483306836, "grad_norm": 2.6378062907382405, "learning_rate": 4.1925219665259075e-05, "loss": 0.1781, "step": 3634 }, { "epoch": 5.779014308426073, "grad_norm": 4.130640877044012, "learning_rate": 4.1930928268432566e-05, "loss": 0.1895, "step": 3635 }, { "epoch": 5.78060413354531, "grad_norm": 2.718405567404681, "learning_rate": 4.1936636218056766e-05, "loss": 0.159, "step": 3636 }, { "epoch": 5.782193958664547, "grad_norm": 3.6614055085148958, "learning_rate": 4.1942343512199716e-05, "loss": 0.1385, "step": 3637 }, { "epoch": 5.783783783783784, "grad_norm": 2.2212284178480983, "learning_rate": 4.194805014892973e-05, "loss": 0.151, "step": 3638 }, { "epoch": 5.78537360890302, "grad_norm": 4.752488312801556, "learning_rate": 4.195375612631531e-05, "loss": 0.1485, "step": 3639 }, { "epoch": 5.786963434022257, "grad_norm": 5.233987900513094, "learning_rate": 4.195946144242518e-05, "loss": 0.1385, "step": 3640 }, { "epoch": 5.788553259141494, "grad_norm": 3.835924410297876, "learning_rate": 4.196516609532831e-05, "loss": 0.1855, "step": 3641 }, { "epoch": 5.790143084260731, "grad_norm": 7.3630206708722605, "learning_rate": 4.1970870083093864e-05, "loss": 0.1937, "step": 3642 }, { "epoch": 5.791732909379968, "grad_norm": 5.103351926101738, "learning_rate": 4.1976573403791265e-05, "loss": 0.1901, "step": 3643 }, { "epoch": 5.793322734499205, "grad_norm": 2.0332053867266997, "learning_rate": 4.198227605549014e-05, "loss": 0.1696, "step": 3644 }, { "epoch": 5.794912559618442, "grad_norm": 5.134492673278592, "learning_rate": 4.198797803626035e-05, "loss": 0.1853, "step": 3645 }, { "epoch": 5.796502384737678, "grad_norm": 4.020376674562339, "learning_rate": 4.199367934417198e-05, "loss": 0.1604, "step": 3646 }, { "epoch": 5.798092209856915, "grad_norm": 6.150763198713712, "learning_rate": 4.199937997729533e-05, "loss": 0.192, "step": 3647 }, { "epoch": 5.799682034976152, "grad_norm": 1.5029670089694867, "learning_rate": 4.200507993370097e-05, "loss": 0.1646, "step": 3648 }, { "epoch": 5.801271860095389, "grad_norm": 5.136485816862838, "learning_rate": 4.2010779211459644e-05, "loss": 0.168, "step": 3649 }, { "epoch": 5.802861685214626, "grad_norm": 4.096748123695933, "learning_rate": 4.2016477808642375e-05, "loss": 0.1734, "step": 3650 }, { "epoch": 5.804451510333863, "grad_norm": 4.1015645073249125, "learning_rate": 4.2022175723320374e-05, "loss": 0.1435, "step": 3651 }, { "epoch": 5.8060413354531, "grad_norm": 2.8565002553040713, "learning_rate": 4.202787295356512e-05, "loss": 0.1578, "step": 3652 }, { "epoch": 5.807631160572337, "grad_norm": 6.172245962402591, "learning_rate": 4.2033569497448307e-05, "loss": 0.21, "step": 3653 }, { "epoch": 5.809220985691574, "grad_norm": 4.586271363448587, "learning_rate": 4.203926535304185e-05, "loss": 0.1613, "step": 3654 }, { "epoch": 5.8108108108108105, "grad_norm": 3.6669606277815334, "learning_rate": 4.20449605184179e-05, "loss": 0.184, "step": 3655 }, { "epoch": 5.8124006359300475, "grad_norm": 2.3453732874220212, "learning_rate": 4.2050654991648876e-05, "loss": 0.1688, "step": 3656 }, { "epoch": 5.8139904610492845, "grad_norm": 3.15105188634312, "learning_rate": 4.2056348770807386e-05, "loss": 0.1918, "step": 3657 }, { "epoch": 5.8155802861685215, "grad_norm": 1.3710226305801365, "learning_rate": 4.2062041853966295e-05, "loss": 0.1335, "step": 3658 }, { "epoch": 5.8171701112877585, "grad_norm": 5.571447556836932, "learning_rate": 4.2067734239198706e-05, "loss": 19.3559, "step": 3659 }, { "epoch": 5.8187599364069955, "grad_norm": 3.754204877966765, "learning_rate": 4.207342592457795e-05, "loss": 0.158, "step": 3660 }, { "epoch": 5.8203497615262325, "grad_norm": 1.7876462456491027, "learning_rate": 4.20791169081776e-05, "loss": 0.189, "step": 3661 }, { "epoch": 5.821939586645469, "grad_norm": 5.157772226611338, "learning_rate": 4.2084807188071455e-05, "loss": 0.1695, "step": 3662 }, { "epoch": 5.823529411764706, "grad_norm": 2.2879042327694927, "learning_rate": 4.2090496762333564e-05, "loss": 0.2395, "step": 3663 }, { "epoch": 5.825119236883943, "grad_norm": 6.207792802245383, "learning_rate": 4.209618562903822e-05, "loss": 0.1475, "step": 3664 }, { "epoch": 5.82670906200318, "grad_norm": 4.548627381998895, "learning_rate": 4.210187378625994e-05, "loss": 0.2324, "step": 3665 }, { "epoch": 5.828298887122417, "grad_norm": 4.089463655503371, "learning_rate": 4.210756123207349e-05, "loss": 0.156, "step": 3666 }, { "epoch": 5.829888712241654, "grad_norm": 4.545709586328353, "learning_rate": 4.211324796455389e-05, "loss": 0.1718, "step": 3667 }, { "epoch": 5.831478537360891, "grad_norm": 4.1825321890928695, "learning_rate": 4.2118933981776365e-05, "loss": 0.1843, "step": 3668 }, { "epoch": 5.833068362480127, "grad_norm": 4.68811710804842, "learning_rate": 4.212461928181641e-05, "loss": 0.1868, "step": 3669 }, { "epoch": 5.834658187599364, "grad_norm": 2.957886344049319, "learning_rate": 4.2130303862749766e-05, "loss": 0.1636, "step": 3670 }, { "epoch": 5.836248012718601, "grad_norm": 2.5055723589093764, "learning_rate": 4.21359877226524e-05, "loss": 0.203, "step": 3671 }, { "epoch": 5.837837837837838, "grad_norm": 5.503628711945985, "learning_rate": 4.214167085960053e-05, "loss": 0.1728, "step": 3672 }, { "epoch": 5.839427662957075, "grad_norm": 2.4761138214406757, "learning_rate": 4.214735327167063e-05, "loss": 0.1749, "step": 3673 }, { "epoch": 5.841017488076312, "grad_norm": 3.8324593024155953, "learning_rate": 4.21530349569394e-05, "loss": 0.2022, "step": 3674 }, { "epoch": 5.842607313195549, "grad_norm": 3.3779456908078527, "learning_rate": 4.21587159134838e-05, "loss": 0.2066, "step": 3675 }, { "epoch": 5.844197138314786, "grad_norm": 4.142300893011948, "learning_rate": 4.2164396139381035e-05, "loss": 0.2548, "step": 3676 }, { "epoch": 5.845786963434023, "grad_norm": 55.74061589934113, "learning_rate": 4.2170075632708536e-05, "loss": 3.7771, "step": 3677 }, { "epoch": 5.847376788553259, "grad_norm": 4.722009271551694, "learning_rate": 4.217575439154402e-05, "loss": 0.2466, "step": 3678 }, { "epoch": 5.848966613672496, "grad_norm": 112.5796298611852, "learning_rate": 4.218143241396543e-05, "loss": 5.5487, "step": 3679 }, { "epoch": 5.850556438791733, "grad_norm": 1.8379519740167987, "learning_rate": 4.218710969805095e-05, "loss": 0.2294, "step": 3680 }, { "epoch": 5.85214626391097, "grad_norm": 5.350662107833079, "learning_rate": 4.2192786241879035e-05, "loss": 0.1786, "step": 3681 }, { "epoch": 5.853736089030207, "grad_norm": 4.441891500131396, "learning_rate": 4.219846204352838e-05, "loss": 0.1824, "step": 3682 }, { "epoch": 5.855325914149444, "grad_norm": 2.235357280709743, "learning_rate": 4.220413710107792e-05, "loss": 0.2013, "step": 3683 }, { "epoch": 5.856915739268681, "grad_norm": 3.377067360510304, "learning_rate": 4.220981141260687e-05, "loss": 0.2125, "step": 3684 }, { "epoch": 5.858505564387917, "grad_norm": 6.0018675676175945, "learning_rate": 4.2215484976194676e-05, "loss": 0.1867, "step": 3685 }, { "epoch": 5.860095389507154, "grad_norm": 5.46688675069841, "learning_rate": 4.222115778992103e-05, "loss": 0.1488, "step": 3686 }, { "epoch": 5.861685214626391, "grad_norm": 4.312011571430304, "learning_rate": 4.2226829851865914e-05, "loss": 0.2006, "step": 3687 }, { "epoch": 5.863275039745628, "grad_norm": 4.081494671227908, "learning_rate": 4.223250116010952e-05, "loss": 0.1897, "step": 3688 }, { "epoch": 5.864864864864865, "grad_norm": 6.5192452463739965, "learning_rate": 4.2238171712732315e-05, "loss": 0.2936, "step": 3689 }, { "epoch": 5.866454689984102, "grad_norm": 7.854059671076194, "learning_rate": 4.224384150781504e-05, "loss": 0.2286, "step": 3690 }, { "epoch": 5.868044515103339, "grad_norm": 12.282106122123663, "learning_rate": 4.224951054343865e-05, "loss": 38.4781, "step": 3691 }, { "epoch": 5.869634340222575, "grad_norm": 3.7273867264211713, "learning_rate": 4.22551788176844e-05, "loss": 0.1426, "step": 3692 }, { "epoch": 5.871224165341812, "grad_norm": 3.237353340352837, "learning_rate": 4.226084632863379e-05, "loss": 0.1644, "step": 3693 }, { "epoch": 5.872813990461049, "grad_norm": 4.3123572075119405, "learning_rate": 4.226651307436855e-05, "loss": 0.3162, "step": 3694 }, { "epoch": 5.874403815580286, "grad_norm": 4.65363587225841, "learning_rate": 4.227217905297071e-05, "loss": 0.2123, "step": 3695 }, { "epoch": 5.875993640699523, "grad_norm": 7.113432791231884, "learning_rate": 4.227784426252253e-05, "loss": 0.1511, "step": 3696 }, { "epoch": 5.87758346581876, "grad_norm": 6.228660445769009, "learning_rate": 4.2283508701106556e-05, "loss": 0.1795, "step": 3697 }, { "epoch": 5.879173290937997, "grad_norm": 4.419004441665432, "learning_rate": 4.2289172366805576e-05, "loss": 0.1769, "step": 3698 }, { "epoch": 5.880763116057234, "grad_norm": 77.04134937815041, "learning_rate": 4.229483525770263e-05, "loss": 9.7127, "step": 3699 }, { "epoch": 5.882352941176471, "grad_norm": 5.204151768428491, "learning_rate": 4.2300497371881045e-05, "loss": 0.1744, "step": 3700 }, { "epoch": 5.883942766295707, "grad_norm": 557.6987847378355, "learning_rate": 4.2306158707424404e-05, "loss": 10.3648, "step": 3701 }, { "epoch": 5.885532591414944, "grad_norm": 4.0083445869943555, "learning_rate": 4.231181926241654e-05, "loss": 0.2367, "step": 3702 }, { "epoch": 5.887122416534181, "grad_norm": 5.390395128645863, "learning_rate": 4.231747903494158e-05, "loss": 0.2592, "step": 3703 }, { "epoch": 5.888712241653418, "grad_norm": 5.07342740750419, "learning_rate": 4.232313802308386e-05, "loss": 0.2216, "step": 3704 }, { "epoch": 5.890302066772655, "grad_norm": 3.873942790563805, "learning_rate": 4.232879622492806e-05, "loss": 0.1298, "step": 3705 }, { "epoch": 5.891891891891892, "grad_norm": 3.515298669432799, "learning_rate": 4.2334453638559054e-05, "loss": 0.2394, "step": 3706 }, { "epoch": 5.893481717011129, "grad_norm": 3.6253910965596354, "learning_rate": 4.2340110262062025e-05, "loss": 0.181, "step": 3707 }, { "epoch": 5.895071542130365, "grad_norm": 5.126090489685165, "learning_rate": 4.234576609352241e-05, "loss": 0.1671, "step": 3708 }, { "epoch": 5.896661367249602, "grad_norm": 2.976235701475839, "learning_rate": 4.235142113102591e-05, "loss": 0.1726, "step": 3709 }, { "epoch": 5.898251192368839, "grad_norm": 2.8892368960462314, "learning_rate": 4.2357075372658494e-05, "loss": 0.2211, "step": 3710 }, { "epoch": 5.899841017488076, "grad_norm": 2.3610001074176266, "learning_rate": 4.236272881650642e-05, "loss": 0.1848, "step": 3711 }, { "epoch": 5.901430842607313, "grad_norm": 2.8092422261131005, "learning_rate": 4.2368381460656185e-05, "loss": 0.12, "step": 3712 }, { "epoch": 5.90302066772655, "grad_norm": 2.6907451327296195, "learning_rate": 4.23740333031946e-05, "loss": 0.2126, "step": 3713 }, { "epoch": 5.904610492845787, "grad_norm": 3.80747826109959, "learning_rate": 4.2379684342208697e-05, "loss": 0.2311, "step": 3714 }, { "epoch": 5.906200317965024, "grad_norm": 3.5710296811269915, "learning_rate": 4.238533457578581e-05, "loss": 0.1813, "step": 3715 }, { "epoch": 5.907790143084261, "grad_norm": 4.327553754116943, "learning_rate": 4.2390984002013544e-05, "loss": 0.1797, "step": 3716 }, { "epoch": 5.909379968203497, "grad_norm": 2.30806952481913, "learning_rate": 4.239663261897977e-05, "loss": 0.1729, "step": 3717 }, { "epoch": 5.910969793322734, "grad_norm": 2.8982249634874213, "learning_rate": 4.2402280424772635e-05, "loss": 0.1522, "step": 3718 }, { "epoch": 5.912559618441971, "grad_norm": 2.8675575433437497, "learning_rate": 4.240792741748056e-05, "loss": 0.1956, "step": 3719 }, { "epoch": 5.914149443561208, "grad_norm": 3.476578792593314, "learning_rate": 4.2413573595192254e-05, "loss": 0.1578, "step": 3720 }, { "epoch": 5.915739268680445, "grad_norm": 2.85438464188752, "learning_rate": 4.241921895599668e-05, "loss": 0.1821, "step": 3721 }, { "epoch": 5.917329093799682, "grad_norm": 3.9875756057812177, "learning_rate": 4.2424863497983084e-05, "loss": 0.149, "step": 3722 }, { "epoch": 5.918918918918919, "grad_norm": 4.796303298476455, "learning_rate": 4.2430507219241e-05, "loss": 0.1605, "step": 3723 }, { "epoch": 5.920508744038155, "grad_norm": 2.957871735305267, "learning_rate": 4.2436150117860225e-05, "loss": 0.1578, "step": 3724 }, { "epoch": 5.922098569157392, "grad_norm": 4.831291360747747, "learning_rate": 4.244179219193085e-05, "loss": 0.206, "step": 3725 }, { "epoch": 5.923688394276629, "grad_norm": 4.004169953641362, "learning_rate": 4.244743343954324e-05, "loss": 0.15, "step": 3726 }, { "epoch": 5.925278219395866, "grad_norm": 3.041288320257261, "learning_rate": 4.2453073858788024e-05, "loss": 0.156, "step": 3727 }, { "epoch": 5.926868044515103, "grad_norm": 4.102219068126756, "learning_rate": 4.245871344775614e-05, "loss": 0.1791, "step": 3728 }, { "epoch": 5.92845786963434, "grad_norm": 4.0877470988890785, "learning_rate": 4.246435220453878e-05, "loss": 0.1721, "step": 3729 }, { "epoch": 5.930047694753577, "grad_norm": 51.12812197381091, "learning_rate": 4.246999012722743e-05, "loss": 2.4659, "step": 3730 }, { "epoch": 5.9316375198728135, "grad_norm": 5.882338398829938, "learning_rate": 4.247562721391386e-05, "loss": 0.1512, "step": 3731 }, { "epoch": 5.9332273449920505, "grad_norm": 4.494236819111463, "learning_rate": 4.248126346269012e-05, "loss": 0.2701, "step": 3732 }, { "epoch": 5.9348171701112875, "grad_norm": 3.7634633745708266, "learning_rate": 4.2486898871648554e-05, "loss": 0.1951, "step": 3733 }, { "epoch": 5.9364069952305245, "grad_norm": 7.040683414468272, "learning_rate": 4.249253343888176e-05, "loss": 0.1439, "step": 3734 }, { "epoch": 5.9379968203497615, "grad_norm": 3.9761027567711085, "learning_rate": 4.249816716248265e-05, "loss": 0.1799, "step": 3735 }, { "epoch": 5.9395866454689985, "grad_norm": 3.084035835740625, "learning_rate": 4.2503800040544416e-05, "loss": 0.1663, "step": 3736 }, { "epoch": 5.9411764705882355, "grad_norm": 3.116249908625141, "learning_rate": 4.250943207116053e-05, "loss": 0.1905, "step": 3737 }, { "epoch": 5.9427662957074725, "grad_norm": 4.624749649184074, "learning_rate": 4.251506325242475e-05, "loss": 0.1802, "step": 3738 }, { "epoch": 5.9443561208267095, "grad_norm": 3.4751079001942182, "learning_rate": 4.252069358243114e-05, "loss": 0.2014, "step": 3739 }, { "epoch": 5.945945945945946, "grad_norm": 2.572043675234543, "learning_rate": 4.252632305927402e-05, "loss": 0.1075, "step": 3740 }, { "epoch": 5.947535771065183, "grad_norm": 2.8649430851578006, "learning_rate": 4.253195168104802e-05, "loss": 0.2527, "step": 3741 }, { "epoch": 5.94912559618442, "grad_norm": 3.653292125997511, "learning_rate": 4.253757944584806e-05, "loss": 0.2116, "step": 3742 }, { "epoch": 5.950715421303657, "grad_norm": 5.120975617348639, "learning_rate": 4.254320635176934e-05, "loss": 0.1657, "step": 3743 }, { "epoch": 5.952305246422894, "grad_norm": 2.939695898245953, "learning_rate": 4.254883239690736e-05, "loss": 0.1413, "step": 3744 }, { "epoch": 5.953895071542131, "grad_norm": 2.4483540505135637, "learning_rate": 4.255445757935791e-05, "loss": 0.1663, "step": 3745 }, { "epoch": 5.955484896661368, "grad_norm": 4.8001667011050415, "learning_rate": 4.2560081897217055e-05, "loss": 0.147, "step": 3746 }, { "epoch": 5.957074721780604, "grad_norm": 4.3750835110291355, "learning_rate": 4.256570534858119e-05, "loss": 0.2163, "step": 3747 }, { "epoch": 5.958664546899841, "grad_norm": 3.9059794962963736, "learning_rate": 4.257132793154696e-05, "loss": 0.1692, "step": 3748 }, { "epoch": 5.960254372019078, "grad_norm": 1.8419324093293465, "learning_rate": 4.2576949644211345e-05, "loss": 0.1245, "step": 3749 }, { "epoch": 5.961844197138315, "grad_norm": 2.2664057914440763, "learning_rate": 4.258257048467157e-05, "loss": 0.1471, "step": 3750 }, { "epoch": 5.963434022257552, "grad_norm": 3.9948490752862997, "learning_rate": 4.258819045102521e-05, "loss": 0.2003, "step": 3751 }, { "epoch": 5.965023847376789, "grad_norm": 1.506121889002566, "learning_rate": 4.25938095413701e-05, "loss": 0.1513, "step": 3752 }, { "epoch": 5.966613672496026, "grad_norm": 1.8804431970219035, "learning_rate": 4.259942775380438e-05, "loss": 0.1951, "step": 3753 }, { "epoch": 5.968203497615263, "grad_norm": 3.0869523543033974, "learning_rate": 4.2605045086426484e-05, "loss": 0.1521, "step": 3754 }, { "epoch": 5.9697933227345, "grad_norm": 3.3455771580419897, "learning_rate": 4.2610661537335166e-05, "loss": 0.1543, "step": 3755 }, { "epoch": 5.971383147853736, "grad_norm": 2.7361885839594513, "learning_rate": 4.261627710462944e-05, "loss": 0.1956, "step": 3756 }, { "epoch": 5.972972972972973, "grad_norm": 2.2746978027066094, "learning_rate": 4.2621891786408646e-05, "loss": 0.1316, "step": 3757 }, { "epoch": 5.97456279809221, "grad_norm": 2.3136108858193105, "learning_rate": 4.262750558077243e-05, "loss": 0.1511, "step": 3758 }, { "epoch": 5.976152623211447, "grad_norm": 2.191026673350536, "learning_rate": 4.263311848582071e-05, "loss": 0.1526, "step": 3759 }, { "epoch": 5.977742448330684, "grad_norm": 1.773620167310186, "learning_rate": 4.263873049965373e-05, "loss": 0.1086, "step": 3760 }, { "epoch": 5.979332273449921, "grad_norm": 3.1303147144288346, "learning_rate": 4.2644341620372026e-05, "loss": 0.1216, "step": 3761 }, { "epoch": 5.980922098569158, "grad_norm": 2.554524742452868, "learning_rate": 4.264995184607642e-05, "loss": 0.1268, "step": 3762 }, { "epoch": 5.982511923688394, "grad_norm": 3.4511912303530687, "learning_rate": 4.2655561174868094e-05, "loss": 0.1607, "step": 3763 }, { "epoch": 5.984101748807631, "grad_norm": 2.049998293096109, "learning_rate": 4.266116960484845e-05, "loss": 0.1454, "step": 3764 }, { "epoch": 5.985691573926868, "grad_norm": 1.9038803610007013, "learning_rate": 4.2666777134119265e-05, "loss": 0.1614, "step": 3765 }, { "epoch": 5.987281399046105, "grad_norm": 2.549695902163392, "learning_rate": 4.267238376078257e-05, "loss": 0.1444, "step": 3766 }, { "epoch": 5.988871224165342, "grad_norm": 4.7439712943432735, "learning_rate": 4.2677989482940745e-05, "loss": 0.1939, "step": 3767 }, { "epoch": 5.990461049284579, "grad_norm": 1.8812991892629263, "learning_rate": 4.2683594298696454e-05, "loss": 0.1992, "step": 3768 }, { "epoch": 5.992050874403816, "grad_norm": 2.931392826241659, "learning_rate": 4.268919820615266e-05, "loss": 0.2233, "step": 3769 }, { "epoch": 5.993640699523052, "grad_norm": 5.387775404104411, "learning_rate": 4.269480120341265e-05, "loss": 0.171, "step": 3770 }, { "epoch": 5.995230524642289, "grad_norm": 2.298986764644481, "learning_rate": 4.2700403288580016e-05, "loss": 0.2234, "step": 3771 }, { "epoch": 5.996820349761526, "grad_norm": 2.0852416473100566, "learning_rate": 4.270600445975863e-05, "loss": 0.1084, "step": 3772 }, { "epoch": 5.998410174880763, "grad_norm": 3.6764515676553198, "learning_rate": 4.2711604715052736e-05, "loss": 0.1454, "step": 3773 }, { "epoch": 6.0, "grad_norm": 5.318312095701312, "learning_rate": 4.271720405256683e-05, "loss": 0.1595, "step": 3774 }, { "epoch": 6.001589825119237, "grad_norm": 3.4884402915311212, "learning_rate": 4.272280247040575e-05, "loss": 0.1269, "step": 3775 }, { "epoch": 6.003179650238474, "grad_norm": 4.243773906660166, "learning_rate": 4.272839996667461e-05, "loss": 0.2005, "step": 3776 }, { "epoch": 6.004769475357711, "grad_norm": 3.9229473405231774, "learning_rate": 4.2733996539478886e-05, "loss": 0.1372, "step": 3777 }, { "epoch": 6.006359300476947, "grad_norm": 2.794396756281207, "learning_rate": 4.2739592186924327e-05, "loss": 0.095, "step": 3778 }, { "epoch": 6.007949125596184, "grad_norm": 6.262069749219166, "learning_rate": 4.274518690711701e-05, "loss": 0.1596, "step": 3779 }, { "epoch": 6.009538950715421, "grad_norm": 3.0122092617712553, "learning_rate": 4.275078069816334e-05, "loss": 0.1602, "step": 3780 }, { "epoch": 6.011128775834658, "grad_norm": 4.726415659973259, "learning_rate": 4.2756373558169995e-05, "loss": 0.1146, "step": 3781 }, { "epoch": 6.012718600953895, "grad_norm": 2.006408911366836, "learning_rate": 4.2761965485244006e-05, "loss": 0.2206, "step": 3782 }, { "epoch": 6.014308426073132, "grad_norm": 3.75415602853274, "learning_rate": 4.2767556477492727e-05, "loss": 0.2018, "step": 3783 }, { "epoch": 6.015898251192369, "grad_norm": 2.936446543471104, "learning_rate": 4.2773146533023784e-05, "loss": 0.1493, "step": 3784 }, { "epoch": 6.017488076311606, "grad_norm": 2.857405260675501, "learning_rate": 4.2778735649945145e-05, "loss": 0.1678, "step": 3785 }, { "epoch": 6.019077901430842, "grad_norm": 3.226097668427512, "learning_rate": 4.278432382636511e-05, "loss": 0.1649, "step": 3786 }, { "epoch": 6.020667726550079, "grad_norm": 3.2867825026679647, "learning_rate": 4.2789911060392296e-05, "loss": 0.2736, "step": 3787 }, { "epoch": 6.022257551669316, "grad_norm": 2.7559008930501903, "learning_rate": 4.2795497350135596e-05, "loss": 0.2252, "step": 3788 }, { "epoch": 6.023847376788553, "grad_norm": 1.8436379578720412, "learning_rate": 4.2801082693704266e-05, "loss": 0.1713, "step": 3789 }, { "epoch": 6.02543720190779, "grad_norm": 1.6217653421494564, "learning_rate": 4.280666708920788e-05, "loss": 0.1475, "step": 3790 }, { "epoch": 6.027027027027027, "grad_norm": 6.250305306558637, "learning_rate": 4.281225053475631e-05, "loss": 0.1969, "step": 3791 }, { "epoch": 6.028616852146264, "grad_norm": 2.5982143918651195, "learning_rate": 4.2817833028459764e-05, "loss": 0.1261, "step": 3792 }, { "epoch": 6.030206677265501, "grad_norm": 3.8688566625873055, "learning_rate": 4.2823414568428767e-05, "loss": 0.1747, "step": 3793 }, { "epoch": 6.031796502384737, "grad_norm": 3.37110905728922, "learning_rate": 4.2828995152774175e-05, "loss": 0.1707, "step": 3794 }, { "epoch": 6.033386327503974, "grad_norm": 3.0048936340114394, "learning_rate": 4.283457477960716e-05, "loss": 0.185, "step": 3795 }, { "epoch": 6.034976152623211, "grad_norm": 4.196159303763704, "learning_rate": 4.284015344703923e-05, "loss": 0.1623, "step": 3796 }, { "epoch": 6.036565977742448, "grad_norm": 2.678654712535462, "learning_rate": 4.284573115318219e-05, "loss": 0.1648, "step": 3797 }, { "epoch": 6.038155802861685, "grad_norm": 4.783984180879707, "learning_rate": 4.28513078961482e-05, "loss": 0.1303, "step": 3798 }, { "epoch": 6.039745627980922, "grad_norm": 4.980025871326622, "learning_rate": 4.285688367404974e-05, "loss": 0.2358, "step": 3799 }, { "epoch": 6.041335453100159, "grad_norm": 4.118072292197232, "learning_rate": 4.28624584849996e-05, "loss": 0.1562, "step": 3800 }, { "epoch": 6.042925278219396, "grad_norm": 2.2256264261706815, "learning_rate": 4.28680323271109e-05, "loss": 0.119, "step": 3801 }, { "epoch": 6.044515103338632, "grad_norm": 3.0027988203392533, "learning_rate": 4.287360519849712e-05, "loss": 0.1348, "step": 3802 }, { "epoch": 6.046104928457869, "grad_norm": 2.718964319786782, "learning_rate": 4.287917709727203e-05, "loss": 0.1351, "step": 3803 }, { "epoch": 6.047694753577106, "grad_norm": 3.140571198690672, "learning_rate": 4.288474802154975e-05, "loss": 0.1632, "step": 3804 }, { "epoch": 6.049284578696343, "grad_norm": 2.3037945455579787, "learning_rate": 4.2890317969444724e-05, "loss": 0.1907, "step": 3805 }, { "epoch": 6.05087440381558, "grad_norm": 3.0103910103955545, "learning_rate": 4.289588693907171e-05, "loss": 0.1673, "step": 3806 }, { "epoch": 6.052464228934817, "grad_norm": 3.390010101457781, "learning_rate": 4.290145492854583e-05, "loss": 0.1579, "step": 3807 }, { "epoch": 6.054054054054054, "grad_norm": 2.88111431493689, "learning_rate": 4.290702193598253e-05, "loss": 0.1487, "step": 3808 }, { "epoch": 6.0556438791732905, "grad_norm": 3.1973676098415282, "learning_rate": 4.291258795949756e-05, "loss": 0.1349, "step": 3809 }, { "epoch": 6.0572337042925275, "grad_norm": 3.575724119152044, "learning_rate": 4.2918152997207024e-05, "loss": 0.1079, "step": 3810 }, { "epoch": 6.0588235294117645, "grad_norm": 71.33525197738116, "learning_rate": 4.292371704722737e-05, "loss": 10.6124, "step": 3811 }, { "epoch": 6.0604133545310015, "grad_norm": 2.183241093832023, "learning_rate": 4.292928010767536e-05, "loss": 0.1934, "step": 3812 }, { "epoch": 6.0620031796502385, "grad_norm": 1.5925657311808048, "learning_rate": 4.2934842176668104e-05, "loss": 0.1391, "step": 3813 }, { "epoch": 6.0635930047694755, "grad_norm": 6.127758633144182, "learning_rate": 4.294040325232304e-05, "loss": 0.2086, "step": 3814 }, { "epoch": 6.0651828298887125, "grad_norm": 2.509417515341288, "learning_rate": 4.294596333275795e-05, "loss": 0.1249, "step": 3815 }, { "epoch": 6.0667726550079495, "grad_norm": 3.2945839919856383, "learning_rate": 4.295152241609094e-05, "loss": 0.1487, "step": 3816 }, { "epoch": 6.068362480127186, "grad_norm": 3.444841072552306, "learning_rate": 4.295708050044047e-05, "loss": 0.1742, "step": 3817 }, { "epoch": 6.069952305246423, "grad_norm": 3.834275363744185, "learning_rate": 4.296263758392532e-05, "loss": 0.1568, "step": 3818 }, { "epoch": 6.07154213036566, "grad_norm": 6.642829753042326, "learning_rate": 4.296819366466463e-05, "loss": 0.1626, "step": 3819 }, { "epoch": 6.073131955484897, "grad_norm": 2.8470201065656724, "learning_rate": 4.2973748740777864e-05, "loss": 0.201, "step": 3820 }, { "epoch": 6.074721780604134, "grad_norm": 10.09150841123969, "learning_rate": 4.297930281038482e-05, "loss": 1.2174, "step": 3821 }, { "epoch": 6.076311605723371, "grad_norm": 6.875786685178163, "learning_rate": 4.2984855871605664e-05, "loss": 0.1341, "step": 3822 }, { "epoch": 6.077901430842608, "grad_norm": 5.0376635553470255, "learning_rate": 4.299040792256086e-05, "loss": 0.1678, "step": 3823 }, { "epoch": 6.079491255961845, "grad_norm": 4.191154685507322, "learning_rate": 4.299595896137127e-05, "loss": 0.2037, "step": 3824 }, { "epoch": 6.081081081081081, "grad_norm": 7.7529853259891, "learning_rate": 4.300150898615806e-05, "loss": 0.1533, "step": 3825 }, { "epoch": 6.082670906200318, "grad_norm": 4.303897390322826, "learning_rate": 4.300705799504273e-05, "loss": 0.0918, "step": 3826 }, { "epoch": 6.084260731319555, "grad_norm": 10.221934450044976, "learning_rate": 4.301260598614716e-05, "loss": 0.3481, "step": 3827 }, { "epoch": 6.085850556438792, "grad_norm": 3.4908912422427583, "learning_rate": 4.3018152957593545e-05, "loss": 0.1298, "step": 3828 }, { "epoch": 6.087440381558029, "grad_norm": 7.8154249835252925, "learning_rate": 4.3023698907504446e-05, "loss": 0.2107, "step": 3829 }, { "epoch": 6.089030206677266, "grad_norm": 4.977298396692504, "learning_rate": 4.302924383400275e-05, "loss": 0.1186, "step": 3830 }, { "epoch": 6.090620031796503, "grad_norm": 3.9760716562457334, "learning_rate": 4.3034787735211704e-05, "loss": 0.1621, "step": 3831 }, { "epoch": 6.09220985691574, "grad_norm": 9.35904315254086, "learning_rate": 4.3040330609254906e-05, "loss": 0.1956, "step": 3832 }, { "epoch": 6.093799682034976, "grad_norm": 3.6522555121735336, "learning_rate": 4.3045872454256286e-05, "loss": 0.1183, "step": 3833 }, { "epoch": 6.095389507154213, "grad_norm": 7.651878149124375, "learning_rate": 4.305141326834012e-05, "loss": 0.1708, "step": 3834 }, { "epoch": 6.09697933227345, "grad_norm": 4.502420400954156, "learning_rate": 4.305695304963106e-05, "loss": 0.2289, "step": 3835 }, { "epoch": 6.098569157392687, "grad_norm": 3.5525505976336174, "learning_rate": 4.306249179625408e-05, "loss": 0.1679, "step": 3836 }, { "epoch": 6.100158982511924, "grad_norm": 5.062809461364913, "learning_rate": 4.3068029506334525e-05, "loss": 0.1086, "step": 3837 }, { "epoch": 6.101748807631161, "grad_norm": 150.93394429795555, "learning_rate": 4.307356617799807e-05, "loss": 2.1138, "step": 3838 }, { "epoch": 6.103338632750398, "grad_norm": 5.7193338505265565, "learning_rate": 4.307910180937076e-05, "loss": 0.1904, "step": 3839 }, { "epoch": 6.104928457869635, "grad_norm": 13.067778126022434, "learning_rate": 4.308463639857898e-05, "loss": 1.6698, "step": 3840 }, { "epoch": 6.106518282988871, "grad_norm": 6.513797903891124, "learning_rate": 4.309016994374948e-05, "loss": 1.3768, "step": 3841 }, { "epoch": 6.108108108108108, "grad_norm": 4.648228525682147, "learning_rate": 4.309570244300934e-05, "loss": 0.2673, "step": 3842 }, { "epoch": 6.109697933227345, "grad_norm": 2.902675380691719, "learning_rate": 4.310123389448601e-05, "loss": 0.1636, "step": 3843 }, { "epoch": 6.111287758346582, "grad_norm": 3.6576679521238633, "learning_rate": 4.310676429630732e-05, "loss": 0.1829, "step": 3844 }, { "epoch": 6.112877583465819, "grad_norm": 3.301364465893495, "learning_rate": 4.31122936466014e-05, "loss": 0.1763, "step": 3845 }, { "epoch": 6.114467408585056, "grad_norm": 5.347002075878882, "learning_rate": 4.311782194349678e-05, "loss": 0.1684, "step": 3846 }, { "epoch": 6.116057233704293, "grad_norm": 2.941263037780606, "learning_rate": 4.3123349185122325e-05, "loss": 0.1962, "step": 3847 }, { "epoch": 6.117647058823529, "grad_norm": 4.588665714552765, "learning_rate": 4.312887536960727e-05, "loss": 0.2269, "step": 3848 }, { "epoch": 6.119236883942766, "grad_norm": 5.391831968229123, "learning_rate": 4.31344004950812e-05, "loss": 0.1778, "step": 3849 }, { "epoch": 6.120826709062003, "grad_norm": 4.862871169589613, "learning_rate": 4.3139924559674054e-05, "loss": 0.1299, "step": 3850 }, { "epoch": 6.12241653418124, "grad_norm": 3.951072356042652, "learning_rate": 4.314544756151614e-05, "loss": 0.208, "step": 3851 }, { "epoch": 6.124006359300477, "grad_norm": 3.6290899603033955, "learning_rate": 4.3150969498738125e-05, "loss": 0.2119, "step": 3852 }, { "epoch": 6.125596184419714, "grad_norm": 7.00841532181519, "learning_rate": 4.315649036947103e-05, "loss": 0.2783, "step": 3853 }, { "epoch": 6.127186009538951, "grad_norm": 3.7113663186183263, "learning_rate": 4.316201017184623e-05, "loss": 0.1852, "step": 3854 }, { "epoch": 6.128775834658188, "grad_norm": 3.3012612358963604, "learning_rate": 4.31675289039955e-05, "loss": 0.1777, "step": 3855 }, { "epoch": 6.130365659777424, "grad_norm": 8.017046671679289, "learning_rate": 4.317304656405092e-05, "loss": 0.2217, "step": 3856 }, { "epoch": 6.131955484896661, "grad_norm": 70.68526557637794, "learning_rate": 4.317856315014498e-05, "loss": 3.707, "step": 3857 }, { "epoch": 6.133545310015898, "grad_norm": 3.042082201955096, "learning_rate": 4.3184078660410507e-05, "loss": 0.2327, "step": 3858 }, { "epoch": 6.135135135135135, "grad_norm": 4.84759485176823, "learning_rate": 4.31895930929807e-05, "loss": 0.1903, "step": 3859 }, { "epoch": 6.136724960254372, "grad_norm": 3.6893624471946547, "learning_rate": 4.319510644598913e-05, "loss": 0.1567, "step": 3860 }, { "epoch": 6.138314785373609, "grad_norm": 2.603669842919212, "learning_rate": 4.320061871756972e-05, "loss": 0.1913, "step": 3861 }, { "epoch": 6.139904610492846, "grad_norm": 6.2915017321647735, "learning_rate": 4.320612990585676e-05, "loss": 0.1737, "step": 3862 }, { "epoch": 6.141494435612083, "grad_norm": 5.185503510259138, "learning_rate": 4.321164000898493e-05, "loss": 0.2781, "step": 3863 }, { "epoch": 6.143084260731319, "grad_norm": 3.3337601476475736, "learning_rate": 4.321714902508925e-05, "loss": 0.1484, "step": 3864 }, { "epoch": 6.144674085850556, "grad_norm": 8.966649868343184, "learning_rate": 4.322265695230511e-05, "loss": 0.2052, "step": 3865 }, { "epoch": 6.146263910969793, "grad_norm": 4.802419078747724, "learning_rate": 4.3228163788768295e-05, "loss": 0.1456, "step": 3866 }, { "epoch": 6.14785373608903, "grad_norm": 4.239593222104979, "learning_rate": 4.3233669532614915e-05, "loss": 0.1216, "step": 3867 }, { "epoch": 6.149443561208267, "grad_norm": 7.273711956229438, "learning_rate": 4.323917418198149e-05, "loss": 0.2475, "step": 3868 }, { "epoch": 6.151033386327504, "grad_norm": 4.407103720587932, "learning_rate": 4.3244677735004905e-05, "loss": 0.1232, "step": 3869 }, { "epoch": 6.152623211446741, "grad_norm": 3.1849859604425705, "learning_rate": 4.325018018982239e-05, "loss": 0.1436, "step": 3870 }, { "epoch": 6.154213036565977, "grad_norm": 3.5664583600182356, "learning_rate": 4.3255681544571564e-05, "loss": 0.1683, "step": 3871 }, { "epoch": 6.155802861685214, "grad_norm": 4.233853595089014, "learning_rate": 4.3261181797390426e-05, "loss": 0.1624, "step": 3872 }, { "epoch": 6.157392686804451, "grad_norm": 2.9067073598039035, "learning_rate": 4.3266680946417345e-05, "loss": 0.1538, "step": 3873 }, { "epoch": 6.158982511923688, "grad_norm": 6.710175670754754, "learning_rate": 4.327217898979104e-05, "loss": 0.1674, "step": 3874 }, { "epoch": 6.160572337042925, "grad_norm": 11.514694121061552, "learning_rate": 4.3277675925650634e-05, "loss": 0.4228, "step": 3875 }, { "epoch": 6.162162162162162, "grad_norm": 10.555504160079467, "learning_rate": 4.328317175213561e-05, "loss": 0.2889, "step": 3876 }, { "epoch": 6.163751987281399, "grad_norm": 3.3750221255545134, "learning_rate": 4.328866646738583e-05, "loss": 0.1417, "step": 3877 }, { "epoch": 6.165341812400636, "grad_norm": 2.7676116640245776, "learning_rate": 4.329416006954154e-05, "loss": 0.1512, "step": 3878 }, { "epoch": 6.166931637519872, "grad_norm": 2.383154568289766, "learning_rate": 4.329965255674334e-05, "loss": 0.1176, "step": 3879 }, { "epoch": 6.168521462639109, "grad_norm": 2.129204080011837, "learning_rate": 4.3305143927132236e-05, "loss": 0.1248, "step": 3880 }, { "epoch": 6.170111287758346, "grad_norm": 3.5627789313469482, "learning_rate": 4.331063417884958e-05, "loss": 0.2035, "step": 3881 }, { "epoch": 6.171701112877583, "grad_norm": 3.2863234685301315, "learning_rate": 4.331612331003714e-05, "loss": 0.1618, "step": 3882 }, { "epoch": 6.17329093799682, "grad_norm": 3.9247289422727683, "learning_rate": 4.332161131883703e-05, "loss": 0.1537, "step": 3883 }, { "epoch": 6.174880763116057, "grad_norm": 2.3924770675834552, "learning_rate": 4.332709820339177e-05, "loss": 0.1695, "step": 3884 }, { "epoch": 6.176470588235294, "grad_norm": 2.2163912888938047, "learning_rate": 4.333258396184424e-05, "loss": 0.1422, "step": 3885 }, { "epoch": 6.178060413354531, "grad_norm": 2.747925814600689, "learning_rate": 4.333806859233771e-05, "loss": 0.2105, "step": 3886 }, { "epoch": 6.1796502384737675, "grad_norm": 3.768341278931603, "learning_rate": 4.334355209301584e-05, "loss": 0.1332, "step": 3887 }, { "epoch": 6.1812400635930045, "grad_norm": 1.8780248470998868, "learning_rate": 4.3349034462022646e-05, "loss": 0.1549, "step": 3888 }, { "epoch": 6.1828298887122415, "grad_norm": 3.397112397573954, "learning_rate": 4.335451569750255e-05, "loss": 0.1509, "step": 3889 }, { "epoch": 6.1844197138314785, "grad_norm": 4.14598729826173, "learning_rate": 4.3359995797600367e-05, "loss": 0.1771, "step": 3890 }, { "epoch": 6.1860095389507155, "grad_norm": 2.3819821296917314, "learning_rate": 4.3365474760461266e-05, "loss": 0.2591, "step": 3891 }, { "epoch": 6.1875993640699525, "grad_norm": 8.589642173477007, "learning_rate": 4.337095258423082e-05, "loss": 0.2104, "step": 3892 }, { "epoch": 6.1891891891891895, "grad_norm": 3.121592684422043, "learning_rate": 4.337642926705499e-05, "loss": 0.1396, "step": 3893 }, { "epoch": 6.1907790143084265, "grad_norm": 4.456598861147551, "learning_rate": 4.3381904807080114e-05, "loss": 0.1414, "step": 3894 }, { "epoch": 6.192368839427663, "grad_norm": 5.568981772603662, "learning_rate": 4.3387379202452916e-05, "loss": 0.1577, "step": 3895 }, { "epoch": 6.1939586645469, "grad_norm": 3.159193650495683, "learning_rate": 4.339285245132051e-05, "loss": 0.1398, "step": 3896 }, { "epoch": 6.195548489666137, "grad_norm": 4.968549981966246, "learning_rate": 4.339832455183042e-05, "loss": 0.1584, "step": 3897 }, { "epoch": 6.197138314785374, "grad_norm": 5.384828594762002, "learning_rate": 4.34037955021305e-05, "loss": 0.1825, "step": 3898 }, { "epoch": 6.198728139904611, "grad_norm": 3.884476370622544, "learning_rate": 4.3409265300369066e-05, "loss": 0.1376, "step": 3899 }, { "epoch": 6.200317965023848, "grad_norm": 6.0543881244866675, "learning_rate": 4.341473394469477e-05, "loss": 0.1256, "step": 3900 }, { "epoch": 6.201907790143085, "grad_norm": 3.312737600605519, "learning_rate": 4.342020143325669e-05, "loss": 0.13, "step": 3901 }, { "epoch": 6.203497615262322, "grad_norm": 2.5131444217689736, "learning_rate": 4.342566776420426e-05, "loss": 0.1321, "step": 3902 }, { "epoch": 6.205087440381558, "grad_norm": 2.689499730366301, "learning_rate": 4.3431132935687345e-05, "loss": 0.1661, "step": 3903 }, { "epoch": 6.206677265500795, "grad_norm": 4.062436930000234, "learning_rate": 4.343659694585616e-05, "loss": 0.1587, "step": 3904 }, { "epoch": 6.208267090620032, "grad_norm": 3.751191597129353, "learning_rate": 4.344205979286136e-05, "loss": 0.1388, "step": 3905 }, { "epoch": 6.209856915739269, "grad_norm": 25.09997476149906, "learning_rate": 4.3447521474853946e-05, "loss": 2.8884, "step": 3906 }, { "epoch": 6.211446740858506, "grad_norm": 5.1030592467879075, "learning_rate": 4.345298198998535e-05, "loss": 0.2002, "step": 3907 }, { "epoch": 6.213036565977743, "grad_norm": 2.8430710119335934, "learning_rate": 4.345844133640738e-05, "loss": 0.2205, "step": 3908 }, { "epoch": 6.21462639109698, "grad_norm": 3.477673301205507, "learning_rate": 4.3463899512272245e-05, "loss": 0.1673, "step": 3909 }, { "epoch": 6.216216216216216, "grad_norm": 2.4435639432317577, "learning_rate": 4.346935651573256e-05, "loss": 0.1755, "step": 3910 }, { "epoch": 6.217806041335453, "grad_norm": 5.640203551544657, "learning_rate": 4.347481234494132e-05, "loss": 0.1982, "step": 3911 }, { "epoch": 6.21939586645469, "grad_norm": 2.621056197909092, "learning_rate": 4.348026699805191e-05, "loss": 0.1807, "step": 3912 }, { "epoch": 6.220985691573927, "grad_norm": 4.201967597534844, "learning_rate": 4.3485720473218155e-05, "loss": 0.1649, "step": 3913 }, { "epoch": 6.222575516693164, "grad_norm": 4.25098358469494, "learning_rate": 4.349117276859423e-05, "loss": 0.1553, "step": 3914 }, { "epoch": 6.224165341812401, "grad_norm": 2.830675159770114, "learning_rate": 4.349662388233474e-05, "loss": 0.1976, "step": 3915 }, { "epoch": 6.225755166931638, "grad_norm": 4.468595334849825, "learning_rate": 4.350207381259468e-05, "loss": 0.1684, "step": 3916 }, { "epoch": 6.227344992050875, "grad_norm": 4.3087209837071425, "learning_rate": 4.3507522557529436e-05, "loss": 0.2082, "step": 3917 }, { "epoch": 6.228934817170111, "grad_norm": 1.6172787904498622, "learning_rate": 4.3512970115294824e-05, "loss": 0.1722, "step": 3918 }, { "epoch": 6.230524642289348, "grad_norm": 3.9494249201520426, "learning_rate": 4.3518416484047024e-05, "loss": 0.1643, "step": 3919 }, { "epoch": 6.232114467408585, "grad_norm": 6.5709518797510045, "learning_rate": 4.352386166194264e-05, "loss": 0.2218, "step": 3920 }, { "epoch": 6.233704292527822, "grad_norm": 3.4198094920248145, "learning_rate": 4.352930564713869e-05, "loss": 0.206, "step": 3921 }, { "epoch": 6.235294117647059, "grad_norm": 4.866579287383695, "learning_rate": 4.353474843779257e-05, "loss": 0.1813, "step": 3922 }, { "epoch": 6.236883942766296, "grad_norm": 5.707231703452576, "learning_rate": 4.35401900320621e-05, "loss": 0.1756, "step": 3923 }, { "epoch": 6.238473767885533, "grad_norm": 29.071499033451776, "learning_rate": 4.3545630428105496e-05, "loss": 4.3931, "step": 3924 }, { "epoch": 6.24006359300477, "grad_norm": 4.465577590757787, "learning_rate": 4.355106962408137e-05, "loss": 0.1738, "step": 3925 }, { "epoch": 6.241653418124006, "grad_norm": 2.1650333085123443, "learning_rate": 4.355650761814877e-05, "loss": 0.1553, "step": 3926 }, { "epoch": 6.243243243243243, "grad_norm": 2.2962628034425334, "learning_rate": 4.356194440846712e-05, "loss": 0.1518, "step": 3927 }, { "epoch": 6.24483306836248, "grad_norm": 1.7792538560580018, "learning_rate": 4.3567379993196256e-05, "loss": 0.1259, "step": 3928 }, { "epoch": 6.246422893481717, "grad_norm": 3.5304963914868286, "learning_rate": 4.357281437049644e-05, "loss": 0.1179, "step": 3929 }, { "epoch": 6.248012718600954, "grad_norm": 2.9884946226557916, "learning_rate": 4.357824753852833e-05, "loss": 0.161, "step": 3930 }, { "epoch": 6.249602543720191, "grad_norm": 2.1067982257152598, "learning_rate": 4.3583679495453e-05, "loss": 0.1654, "step": 3931 }, { "epoch": 6.251192368839428, "grad_norm": 31.12845886160391, "learning_rate": 4.3589110239431935e-05, "loss": 4.0818, "step": 3932 }, { "epoch": 6.252782193958664, "grad_norm": 2.597854733457868, "learning_rate": 4.3594539768626994e-05, "loss": 0.1427, "step": 3933 }, { "epoch": 6.254372019077901, "grad_norm": 2.3924966673761277, "learning_rate": 4.359996808120051e-05, "loss": 0.1923, "step": 3934 }, { "epoch": 6.255961844197138, "grad_norm": 1.9340228087519844, "learning_rate": 4.360539517531519e-05, "loss": 0.1727, "step": 3935 }, { "epoch": 6.257551669316375, "grad_norm": 2.5076155002887695, "learning_rate": 4.361082104913414e-05, "loss": 0.2053, "step": 3936 }, { "epoch": 6.259141494435612, "grad_norm": 2.6511006976190123, "learning_rate": 4.361624570082092e-05, "loss": 0.9781, "step": 3937 }, { "epoch": 6.260731319554849, "grad_norm": 2.17775592772492, "learning_rate": 4.362166912853948e-05, "loss": 0.1698, "step": 3938 }, { "epoch": 6.262321144674086, "grad_norm": 2.849841478849213, "learning_rate": 4.362709133045417e-05, "loss": 0.1176, "step": 3939 }, { "epoch": 6.263910969793323, "grad_norm": 3.4625957848509277, "learning_rate": 4.3632512304729785e-05, "loss": 0.1651, "step": 3940 }, { "epoch": 6.26550079491256, "grad_norm": 3.409503292909845, "learning_rate": 4.363793204953151e-05, "loss": 0.1591, "step": 3941 }, { "epoch": 6.267090620031796, "grad_norm": 2.7455618796468944, "learning_rate": 4.364335056302498e-05, "loss": 0.1837, "step": 3942 }, { "epoch": 6.268680445151033, "grad_norm": 2.299385118871967, "learning_rate": 4.3648767843376195e-05, "loss": 0.2417, "step": 3943 }, { "epoch": 6.27027027027027, "grad_norm": 6.974947695290759, "learning_rate": 4.365418388875163e-05, "loss": 0.1618, "step": 3944 }, { "epoch": 6.271860095389507, "grad_norm": 20.139283973390842, "learning_rate": 4.3659598697318125e-05, "loss": 1.8979, "step": 3945 }, { "epoch": 6.273449920508744, "grad_norm": 4.46792457222011, "learning_rate": 4.3665012267242977e-05, "loss": 0.1517, "step": 3946 }, { "epoch": 6.275039745627981, "grad_norm": 5.430668558918035, "learning_rate": 4.3670424596693885e-05, "loss": 0.179, "step": 3947 }, { "epoch": 6.276629570747218, "grad_norm": 8.806930581085638, "learning_rate": 4.367583568383897e-05, "loss": 0.1527, "step": 3948 }, { "epoch": 6.278219395866454, "grad_norm": 3.085863624247372, "learning_rate": 4.368124552684678e-05, "loss": 0.1367, "step": 3949 }, { "epoch": 6.279809220985691, "grad_norm": 7.220349875183313, "learning_rate": 4.368665412388628e-05, "loss": 0.1754, "step": 3950 }, { "epoch": 6.281399046104928, "grad_norm": 5.658028104718708, "learning_rate": 4.369206147312685e-05, "loss": 0.3334, "step": 3951 }, { "epoch": 6.282988871224165, "grad_norm": 5.539576982911385, "learning_rate": 4.369746757273829e-05, "loss": 0.1863, "step": 3952 }, { "epoch": 6.284578696343402, "grad_norm": 3.4115569762090647, "learning_rate": 4.3702872420890856e-05, "loss": 0.1841, "step": 3953 }, { "epoch": 6.286168521462639, "grad_norm": 2.95133926036608, "learning_rate": 4.370827601575518e-05, "loss": 0.1684, "step": 3954 }, { "epoch": 6.287758346581876, "grad_norm": 4.357053699679026, "learning_rate": 4.3713678355502345e-05, "loss": 0.1422, "step": 3955 }, { "epoch": 6.289348171701113, "grad_norm": 4.757250158603028, "learning_rate": 4.371907943830387e-05, "loss": 0.1497, "step": 3956 }, { "epoch": 6.290937996820349, "grad_norm": 2.129048609284599, "learning_rate": 4.372447926233166e-05, "loss": 0.1621, "step": 3957 }, { "epoch": 6.292527821939586, "grad_norm": 4.40707957764367, "learning_rate": 4.372987782575809e-05, "loss": 0.1522, "step": 3958 }, { "epoch": 6.294117647058823, "grad_norm": 3.2921704874318585, "learning_rate": 4.373527512675593e-05, "loss": 0.1064, "step": 3959 }, { "epoch": 6.29570747217806, "grad_norm": 4.194949316250937, "learning_rate": 4.37406711634984e-05, "loss": 0.1633, "step": 3960 }, { "epoch": 6.297297297297297, "grad_norm": 3.4304651168810727, "learning_rate": 4.3746065934159124e-05, "loss": 0.1651, "step": 3961 }, { "epoch": 6.298887122416534, "grad_norm": 5.450821105527862, "learning_rate": 4.3751459436912175e-05, "loss": 0.1768, "step": 3962 }, { "epoch": 6.300476947535771, "grad_norm": 2.3794906448305384, "learning_rate": 4.3756851669932046e-05, "loss": 0.1338, "step": 3963 }, { "epoch": 6.302066772655008, "grad_norm": 2.077951110374839, "learning_rate": 4.376224263139366e-05, "loss": 0.1758, "step": 3964 }, { "epoch": 6.3036565977742445, "grad_norm": 6.124251727409857, "learning_rate": 4.376763231947237e-05, "loss": 0.1776, "step": 3965 }, { "epoch": 6.3052464228934815, "grad_norm": 2.855246983547, "learning_rate": 4.377302073234397e-05, "loss": 0.1882, "step": 3966 }, { "epoch": 6.3068362480127185, "grad_norm": 2.574696234319509, "learning_rate": 4.3778407868184675e-05, "loss": 0.1668, "step": 3967 }, { "epoch": 6.3084260731319555, "grad_norm": 2.766116413131408, "learning_rate": 4.3783793725171124e-05, "loss": 0.1244, "step": 3968 }, { "epoch": 6.3100158982511925, "grad_norm": 4.226374302376006, "learning_rate": 4.3789178301480416e-05, "loss": 0.2008, "step": 3969 }, { "epoch": 6.3116057233704295, "grad_norm": 2.5496433325477605, "learning_rate": 4.3794561595290055e-05, "loss": 0.1431, "step": 3970 }, { "epoch": 6.3131955484896665, "grad_norm": 4.024387961300335, "learning_rate": 4.379994360477799e-05, "loss": 0.2409, "step": 3971 }, { "epoch": 6.314785373608903, "grad_norm": 2.9855026237023234, "learning_rate": 4.380532432812262e-05, "loss": 0.1572, "step": 3972 }, { "epoch": 6.31637519872814, "grad_norm": 2.504949952793185, "learning_rate": 4.3810703763502744e-05, "loss": 0.1943, "step": 3973 }, { "epoch": 6.317965023847377, "grad_norm": 4.456025650969578, "learning_rate": 4.381608190909764e-05, "loss": 0.1773, "step": 3974 }, { "epoch": 6.319554848966614, "grad_norm": 4.230966444694455, "learning_rate": 4.3821458763086973e-05, "loss": 0.1655, "step": 3975 }, { "epoch": 6.321144674085851, "grad_norm": 3.889743287859853, "learning_rate": 4.3826834323650894e-05, "loss": 0.1514, "step": 3976 }, { "epoch": 6.322734499205088, "grad_norm": 5.482141271530701, "learning_rate": 4.383220858896997e-05, "loss": 0.1833, "step": 3977 }, { "epoch": 6.324324324324325, "grad_norm": 3.3728747554798133, "learning_rate": 4.383758155722521e-05, "loss": 0.1779, "step": 3978 }, { "epoch": 6.325914149443562, "grad_norm": 4.013211988904892, "learning_rate": 4.3842953226598035e-05, "loss": 0.1982, "step": 3979 }, { "epoch": 6.327503974562799, "grad_norm": 3.0494297197057545, "learning_rate": 4.3848323595270355e-05, "loss": 0.1605, "step": 3980 }, { "epoch": 6.329093799682035, "grad_norm": 4.567958516742571, "learning_rate": 4.385369266142448e-05, "loss": 0.2152, "step": 3981 }, { "epoch": 6.330683624801272, "grad_norm": 4.599613776330691, "learning_rate": 4.3859060423243186e-05, "loss": 0.1508, "step": 3982 }, { "epoch": 6.332273449920509, "grad_norm": 3.4676506573444463, "learning_rate": 4.3864426878909674e-05, "loss": 0.1543, "step": 3983 }, { "epoch": 6.333863275039746, "grad_norm": 4.927328704589414, "learning_rate": 4.386979202660759e-05, "loss": 0.1763, "step": 3984 }, { "epoch": 6.335453100158983, "grad_norm": 4.313460723722417, "learning_rate": 4.387515586452103e-05, "loss": 0.2037, "step": 3985 }, { "epoch": 6.33704292527822, "grad_norm": 4.337815171072039, "learning_rate": 4.388051839083453e-05, "loss": 0.1858, "step": 3986 }, { "epoch": 6.338632750397457, "grad_norm": 3.6383564827206554, "learning_rate": 4.388587960373307e-05, "loss": 0.1315, "step": 3987 }, { "epoch": 6.340222575516693, "grad_norm": 1.654123914660149, "learning_rate": 4.3891239501402065e-05, "loss": 0.1903, "step": 3988 }, { "epoch": 6.34181240063593, "grad_norm": 7.734505371407981, "learning_rate": 4.389659808202739e-05, "loss": 0.1664, "step": 3989 }, { "epoch": 6.343402225755167, "grad_norm": 2.724978710975828, "learning_rate": 4.390195534379536e-05, "loss": 0.1593, "step": 3990 }, { "epoch": 6.344992050874404, "grad_norm": 5.500497667992429, "learning_rate": 4.390731128489274e-05, "loss": 0.1781, "step": 3991 }, { "epoch": 6.346581875993641, "grad_norm": 4.034119971973808, "learning_rate": 4.391266590350673e-05, "loss": 0.1882, "step": 3992 }, { "epoch": 6.348171701112878, "grad_norm": 12.54149412168064, "learning_rate": 4.391801919782499e-05, "loss": 1.9821, "step": 3993 }, { "epoch": 6.349761526232115, "grad_norm": 17.3376403394392, "learning_rate": 4.3923371166035616e-05, "loss": 0.9703, "step": 3994 }, { "epoch": 6.351351351351352, "grad_norm": 3.017023413090191, "learning_rate": 4.392872180632717e-05, "loss": 0.203, "step": 3995 }, { "epoch": 6.352941176470588, "grad_norm": 3.296320082005013, "learning_rate": 4.393407111688865e-05, "loss": 0.1349, "step": 3996 }, { "epoch": 6.354531001589825, "grad_norm": 3.994641021796883, "learning_rate": 4.3939419095909514e-05, "loss": 0.1319, "step": 3997 }, { "epoch": 6.356120826709062, "grad_norm": 5.184669169997193, "learning_rate": 4.394476574157965e-05, "loss": 0.2358, "step": 3998 }, { "epoch": 6.357710651828299, "grad_norm": 2.877361955874784, "learning_rate": 4.395011105208944e-05, "loss": 0.11, "step": 3999 }, { "epoch": 6.359300476947536, "grad_norm": 4.2293162396193695, "learning_rate": 4.395545502562965e-05, "loss": 0.2166, "step": 4000 }, { "epoch": 6.360890302066773, "grad_norm": 10.963684929100229, "learning_rate": 4.3960797660391575e-05, "loss": 0.1714, "step": 4001 }, { "epoch": 6.36248012718601, "grad_norm": 5.59487236743823, "learning_rate": 4.39661389545669e-05, "loss": 0.2562, "step": 4002 }, { "epoch": 6.364069952305247, "grad_norm": 10.061653467332205, "learning_rate": 4.397147890634781e-05, "loss": 0.171, "step": 4003 }, { "epoch": 6.365659777424483, "grad_norm": 8.622192094674991, "learning_rate": 4.3976817513926916e-05, "loss": 0.2221, "step": 4004 }, { "epoch": 6.36724960254372, "grad_norm": 5.7998971194707245, "learning_rate": 4.398215477549728e-05, "loss": 0.1493, "step": 4005 }, { "epoch": 6.368839427662957, "grad_norm": 4.458834685644778, "learning_rate": 4.3987490689252466e-05, "loss": 0.1542, "step": 4006 }, { "epoch": 6.370429252782194, "grad_norm": 3.3330938767392038, "learning_rate": 4.399282525338643e-05, "loss": 0.1468, "step": 4007 }, { "epoch": 6.372019077901431, "grad_norm": 6.740973887431507, "learning_rate": 4.399815846609363e-05, "loss": 0.2361, "step": 4008 }, { "epoch": 6.373608903020668, "grad_norm": 10.789707915495368, "learning_rate": 4.400349032556895e-05, "loss": 0.1725, "step": 4009 }, { "epoch": 6.375198728139905, "grad_norm": 4.806425141030187, "learning_rate": 4.400882083000777e-05, "loss": 0.1707, "step": 4010 }, { "epoch": 6.376788553259141, "grad_norm": 9.725136352271948, "learning_rate": 4.40141499776059e-05, "loss": 0.1953, "step": 4011 }, { "epoch": 6.378378378378378, "grad_norm": 6.753796184065153, "learning_rate": 4.4019477766559604e-05, "loss": 0.1999, "step": 4012 }, { "epoch": 6.379968203497615, "grad_norm": 8.556776131280252, "learning_rate": 4.402480419506563e-05, "loss": 0.1924, "step": 4013 }, { "epoch": 6.381558028616852, "grad_norm": 5.99420785941086, "learning_rate": 4.403012926132118e-05, "loss": 0.184, "step": 4014 }, { "epoch": 6.383147853736089, "grad_norm": 6.778864723894779, "learning_rate": 4.40354529635239e-05, "loss": 0.1947, "step": 4015 }, { "epoch": 6.384737678855326, "grad_norm": 9.074649302615452, "learning_rate": 4.4040775299871915e-05, "loss": 0.1685, "step": 4016 }, { "epoch": 6.386327503974563, "grad_norm": 8.00117527891903, "learning_rate": 4.404609626856381e-05, "loss": 0.1327, "step": 4017 }, { "epoch": 6.3879173290938, "grad_norm": 5.350642809195212, "learning_rate": 4.405141586779863e-05, "loss": 0.1674, "step": 4018 }, { "epoch": 6.389507154213036, "grad_norm": 2.755580722538118, "learning_rate": 4.405673409577587e-05, "loss": 0.1586, "step": 4019 }, { "epoch": 6.391096979332273, "grad_norm": 14.758678022053596, "learning_rate": 4.406205095069552e-05, "loss": 0.2181, "step": 4020 }, { "epoch": 6.39268680445151, "grad_norm": 5.092218507342331, "learning_rate": 4.4067366430758e-05, "loss": 0.1369, "step": 4021 }, { "epoch": 6.394276629570747, "grad_norm": 4.934550163339906, "learning_rate": 4.407268053416423e-05, "loss": 0.225, "step": 4022 }, { "epoch": 6.395866454689984, "grad_norm": 3.70864030634627, "learning_rate": 4.4077993259115566e-05, "loss": 0.1717, "step": 4023 }, { "epoch": 6.397456279809221, "grad_norm": 17.158089789920272, "learning_rate": 4.408330460381385e-05, "loss": 0.22, "step": 4024 }, { "epoch": 6.399046104928458, "grad_norm": 5.59861792737576, "learning_rate": 4.408861456646138e-05, "loss": 0.2167, "step": 4025 }, { "epoch": 6.400635930047695, "grad_norm": 5.502191655174385, "learning_rate": 4.409392314526093e-05, "loss": 0.2082, "step": 4026 }, { "epoch": 6.402225755166931, "grad_norm": 7.3341683384059175, "learning_rate": 4.4099230338415726e-05, "loss": 0.2589, "step": 4027 }, { "epoch": 6.403815580286168, "grad_norm": 7.251789428759672, "learning_rate": 4.410453614412949e-05, "loss": 0.1912, "step": 4028 }, { "epoch": 6.405405405405405, "grad_norm": 43.25777964597279, "learning_rate": 4.4109840560606396e-05, "loss": 1.5185, "step": 4029 }, { "epoch": 6.406995230524642, "grad_norm": 5.571300065924355, "learning_rate": 4.411514358605109e-05, "loss": 0.1964, "step": 4030 }, { "epoch": 6.408585055643879, "grad_norm": 7.2224813416629585, "learning_rate": 4.4120445218668686e-05, "loss": 0.1734, "step": 4031 }, { "epoch": 6.410174880763116, "grad_norm": 3.9078555752465776, "learning_rate": 4.4125745456664776e-05, "loss": 0.1691, "step": 4032 }, { "epoch": 6.411764705882353, "grad_norm": 4.271061963032607, "learning_rate": 4.4131044298245425e-05, "loss": 0.1303, "step": 4033 }, { "epoch": 6.413354531001589, "grad_norm": 178.11054594536967, "learning_rate": 4.4136341741617154e-05, "loss": 1.4875, "step": 4034 }, { "epoch": 6.414944356120826, "grad_norm": 5.612295686705271, "learning_rate": 4.414163778498698e-05, "loss": 0.2064, "step": 4035 }, { "epoch": 6.416534181240063, "grad_norm": 6.098859908724126, "learning_rate": 4.414693242656239e-05, "loss": 0.151, "step": 4036 }, { "epoch": 6.4181240063593, "grad_norm": 3.203231709855121, "learning_rate": 4.4152225664551336e-05, "loss": 0.1848, "step": 4037 }, { "epoch": 6.419713831478537, "grad_norm": 7.3406589784435265, "learning_rate": 4.4157517497162246e-05, "loss": 0.1805, "step": 4038 }, { "epoch": 6.421303656597774, "grad_norm": 3.000364300952209, "learning_rate": 4.416280792260401e-05, "loss": 0.2096, "step": 4039 }, { "epoch": 6.422893481717011, "grad_norm": 2.4078971634233493, "learning_rate": 4.4168096939086046e-05, "loss": 0.2157, "step": 4040 }, { "epoch": 6.424483306836248, "grad_norm": 4.9542284457853745, "learning_rate": 4.417338454481818e-05, "loss": 0.2444, "step": 4041 }, { "epoch": 6.426073131955485, "grad_norm": 4.1261543714723, "learning_rate": 4.417867073801077e-05, "loss": 0.2163, "step": 4042 }, { "epoch": 6.4276629570747215, "grad_norm": 16.22378974982509, "learning_rate": 4.418395551687462e-05, "loss": 1.5064, "step": 4043 }, { "epoch": 6.4292527821939585, "grad_norm": 3.1201888770204733, "learning_rate": 4.418923887962103e-05, "loss": 0.2057, "step": 4044 }, { "epoch": 6.4308426073131955, "grad_norm": 5.743872144233768, "learning_rate": 4.4194520824461776e-05, "loss": 0.2076, "step": 4045 }, { "epoch": 6.4324324324324325, "grad_norm": 2.292652076744471, "learning_rate": 4.41998013496091e-05, "loss": 0.1991, "step": 4046 }, { "epoch": 6.4340222575516695, "grad_norm": 62.23614271105842, "learning_rate": 4.4205080453275736e-05, "loss": 12.2324, "step": 4047 }, { "epoch": 6.4356120826709065, "grad_norm": 14.816067755420178, "learning_rate": 4.421035813367491e-05, "loss": 1.5532, "step": 4048 }, { "epoch": 6.4372019077901435, "grad_norm": 3.2830181903811444, "learning_rate": 4.421563438902031e-05, "loss": 0.2201, "step": 4049 }, { "epoch": 6.43879173290938, "grad_norm": 3.148439388115308, "learning_rate": 4.422090921752612e-05, "loss": 0.2115, "step": 4050 }, { "epoch": 6.440381558028617, "grad_norm": 1.7855128026666314, "learning_rate": 4.4226182617406995e-05, "loss": 0.1558, "step": 4051 }, { "epoch": 6.441971383147854, "grad_norm": 3.9914612145466246, "learning_rate": 4.4231454586878086e-05, "loss": 0.2462, "step": 4052 }, { "epoch": 6.443561208267091, "grad_norm": 5.328733875502032, "learning_rate": 4.423672512415502e-05, "loss": 0.1665, "step": 4053 }, { "epoch": 6.4451510333863276, "grad_norm": 2.748623432625521, "learning_rate": 4.4241994227453904e-05, "loss": 0.2055, "step": 4054 }, { "epoch": 6.4467408585055646, "grad_norm": 8.622985381943458, "learning_rate": 4.424726189499135e-05, "loss": 1.1694, "step": 4055 }, { "epoch": 6.4483306836248016, "grad_norm": 4.704606660522426, "learning_rate": 4.425252812498443e-05, "loss": 0.2733, "step": 4056 }, { "epoch": 6.4499205087440385, "grad_norm": 2.805896888808438, "learning_rate": 4.425779291565073e-05, "loss": 0.2365, "step": 4057 }, { "epoch": 6.451510333863275, "grad_norm": 3.6055006153034563, "learning_rate": 4.426305626520829e-05, "loss": 0.1396, "step": 4058 }, { "epoch": 6.453100158982512, "grad_norm": 4.269089139710243, "learning_rate": 4.4268318171875684e-05, "loss": 0.2135, "step": 4059 }, { "epoch": 6.454689984101749, "grad_norm": 5.546140080024612, "learning_rate": 4.4273578633871925e-05, "loss": 0.1761, "step": 4060 }, { "epoch": 6.456279809220986, "grad_norm": 3.8133186533716206, "learning_rate": 4.4278837649416544e-05, "loss": 0.203, "step": 4061 }, { "epoch": 6.457869634340223, "grad_norm": 3.8188878877071226, "learning_rate": 4.428409521672955e-05, "loss": 0.1932, "step": 4062 }, { "epoch": 6.45945945945946, "grad_norm": 3.9564649464204384, "learning_rate": 4.4289351334031464e-05, "loss": 0.1436, "step": 4063 }, { "epoch": 6.461049284578697, "grad_norm": 2.428846125978776, "learning_rate": 4.429460599954325e-05, "loss": 0.1468, "step": 4064 }, { "epoch": 6.462639109697934, "grad_norm": 2.8251006798995064, "learning_rate": 4.429985921148643e-05, "loss": 0.1621, "step": 4065 }, { "epoch": 6.46422893481717, "grad_norm": 5.032954910167469, "learning_rate": 4.430511096808295e-05, "loss": 0.2437, "step": 4066 }, { "epoch": 6.465818759936407, "grad_norm": 2.985456058827167, "learning_rate": 4.43103612675553e-05, "loss": 0.1605, "step": 4067 }, { "epoch": 6.467408585055644, "grad_norm": 2.921707745916031, "learning_rate": 4.4315610108126446e-05, "loss": 0.1594, "step": 4068 }, { "epoch": 6.468998410174881, "grad_norm": 3.363755283958462, "learning_rate": 4.432085748801983e-05, "loss": 0.1371, "step": 4069 }, { "epoch": 6.470588235294118, "grad_norm": 3.2244524822955696, "learning_rate": 4.43261034054594e-05, "loss": 0.1337, "step": 4070 }, { "epoch": 6.472178060413355, "grad_norm": 3.7252584576350145, "learning_rate": 4.433134785866963e-05, "loss": 0.1768, "step": 4071 }, { "epoch": 6.473767885532592, "grad_norm": 3.82189591423742, "learning_rate": 4.4336590845875444e-05, "loss": 0.2335, "step": 4072 }, { "epoch": 6.475357710651828, "grad_norm": 2.4914664165617113, "learning_rate": 4.434183236530228e-05, "loss": 0.247, "step": 4073 }, { "epoch": 6.476947535771065, "grad_norm": 1.9220398760638604, "learning_rate": 4.4347072415176083e-05, "loss": 0.1562, "step": 4074 }, { "epoch": 6.478537360890302, "grad_norm": 2.6129678566228964, "learning_rate": 4.435231099372328e-05, "loss": 0.1163, "step": 4075 }, { "epoch": 6.480127186009539, "grad_norm": 3.158570473310375, "learning_rate": 4.4357548099170795e-05, "loss": 0.2054, "step": 4076 }, { "epoch": 6.481717011128776, "grad_norm": 2.75595312418156, "learning_rate": 4.436278372974607e-05, "loss": 0.2441, "step": 4077 }, { "epoch": 6.483306836248013, "grad_norm": 2.619607084824096, "learning_rate": 4.436801788367702e-05, "loss": 0.1714, "step": 4078 }, { "epoch": 6.48489666136725, "grad_norm": 80.29574445038882, "learning_rate": 4.437325055919209e-05, "loss": 7.6357, "step": 4079 }, { "epoch": 6.486486486486487, "grad_norm": 2.6933402905203927, "learning_rate": 4.43784817545202e-05, "loss": 0.1652, "step": 4080 }, { "epoch": 6.488076311605723, "grad_norm": 4.3964445509262395, "learning_rate": 4.438371146789078e-05, "loss": 0.4119, "step": 4081 }, { "epoch": 6.48966613672496, "grad_norm": 2.810487179656018, "learning_rate": 4.438893969753376e-05, "loss": 0.1489, "step": 4082 }, { "epoch": 6.491255961844197, "grad_norm": 3.7697934540951024, "learning_rate": 4.4394166441679573e-05, "loss": 0.1572, "step": 4083 }, { "epoch": 6.492845786963434, "grad_norm": 3.362985870045082, "learning_rate": 4.439939169855915e-05, "loss": 0.1453, "step": 4084 }, { "epoch": 6.494435612082671, "grad_norm": 1.9330572843655938, "learning_rate": 4.440461546640395e-05, "loss": 0.1307, "step": 4085 }, { "epoch": 6.496025437201908, "grad_norm": 2.2284663087291996, "learning_rate": 4.44098377434459e-05, "loss": 0.1303, "step": 4086 }, { "epoch": 6.497615262321145, "grad_norm": 3.2198652724415675, "learning_rate": 4.441505852791745e-05, "loss": 0.1768, "step": 4087 }, { "epoch": 6.499205087440382, "grad_norm": 4.2465171958495675, "learning_rate": 4.442027781805156e-05, "loss": 0.1894, "step": 4088 }, { "epoch": 6.500794912559618, "grad_norm": 1.656733093189461, "learning_rate": 4.442549561208169e-05, "loss": 0.175, "step": 4089 }, { "epoch": 6.502384737678855, "grad_norm": 4.7541366258711015, "learning_rate": 4.44307119082418e-05, "loss": 0.1559, "step": 4090 }, { "epoch": 6.503974562798092, "grad_norm": 3.550223802163937, "learning_rate": 4.443592670476636e-05, "loss": 0.1655, "step": 4091 }, { "epoch": 6.505564387917329, "grad_norm": 2.0190405420450666, "learning_rate": 4.444113999989036e-05, "loss": 0.1326, "step": 4092 }, { "epoch": 6.507154213036566, "grad_norm": 16.778922298941115, "learning_rate": 4.4446351791849274e-05, "loss": 40.6076, "step": 4093 }, { "epoch": 6.508744038155803, "grad_norm": 8.912058388609987, "learning_rate": 4.445156207887911e-05, "loss": 0.2417, "step": 4094 }, { "epoch": 6.51033386327504, "grad_norm": 10.466234380750231, "learning_rate": 4.445677085921639e-05, "loss": 0.2404, "step": 4095 }, { "epoch": 6.511923688394276, "grad_norm": 3.2471050920624673, "learning_rate": 4.446197813109809e-05, "loss": 0.1984, "step": 4096 }, { "epoch": 6.513513513513513, "grad_norm": 6.534490689874421, "learning_rate": 4.446718389276176e-05, "loss": 0.2481, "step": 4097 }, { "epoch": 6.51510333863275, "grad_norm": 7.415748213280736, "learning_rate": 4.4472388142445455e-05, "loss": 0.186, "step": 4098 }, { "epoch": 6.516693163751987, "grad_norm": 6.390034897923217, "learning_rate": 4.4477590878387696e-05, "loss": 0.2453, "step": 4099 }, { "epoch": 6.518282988871224, "grad_norm": 1.8076776368588225, "learning_rate": 4.448279209882756e-05, "loss": 0.1412, "step": 4100 }, { "epoch": 6.519872813990461, "grad_norm": 6.73793588970458, "learning_rate": 4.448799180200462e-05, "loss": 0.2103, "step": 4101 }, { "epoch": 6.521462639109698, "grad_norm": 7.246268308641273, "learning_rate": 4.449318998615897e-05, "loss": 0.1775, "step": 4102 }, { "epoch": 6.523052464228935, "grad_norm": 4.199056388954719, "learning_rate": 4.44983866495312e-05, "loss": 0.1396, "step": 4103 }, { "epoch": 6.524642289348172, "grad_norm": 2.666221083753239, "learning_rate": 4.450358179036244e-05, "loss": 0.2034, "step": 4104 }, { "epoch": 6.526232114467408, "grad_norm": 6.818891438556802, "learning_rate": 4.450877540689431e-05, "loss": 0.1695, "step": 4105 }, { "epoch": 6.527821939586645, "grad_norm": 4.801019119669153, "learning_rate": 4.451396749736897e-05, "loss": 0.2766, "step": 4106 }, { "epoch": 6.529411764705882, "grad_norm": 6.050132280219142, "learning_rate": 4.451915806002909e-05, "loss": 0.2046, "step": 4107 }, { "epoch": 6.531001589825119, "grad_norm": 2.3548909308987698, "learning_rate": 4.452434709311783e-05, "loss": 0.1924, "step": 4108 }, { "epoch": 6.532591414944356, "grad_norm": 4.562494610782942, "learning_rate": 4.452953459487891e-05, "loss": 0.2507, "step": 4109 }, { "epoch": 6.534181240063593, "grad_norm": 4.993069996354965, "learning_rate": 4.4534720563556546e-05, "loss": 0.1648, "step": 4110 }, { "epoch": 6.53577106518283, "grad_norm": 5.5897542234483595, "learning_rate": 4.4539904997395466e-05, "loss": 0.1828, "step": 4111 }, { "epoch": 6.537360890302066, "grad_norm": 3.7133003140479626, "learning_rate": 4.454508789464094e-05, "loss": 0.1886, "step": 4112 }, { "epoch": 6.538950715421303, "grad_norm": 5.903340925157806, "learning_rate": 4.455026925353874e-05, "loss": 0.4549, "step": 4113 }, { "epoch": 6.54054054054054, "grad_norm": 4.124933569149193, "learning_rate": 4.4555449072335154e-05, "loss": 0.1934, "step": 4114 }, { "epoch": 6.542130365659777, "grad_norm": 7.042423286041375, "learning_rate": 4.456062734927702e-05, "loss": 0.2105, "step": 4115 }, { "epoch": 6.543720190779014, "grad_norm": 1.8138529423485292, "learning_rate": 4.4565804082611656e-05, "loss": 0.1597, "step": 4116 }, { "epoch": 6.545310015898251, "grad_norm": 1.932735412333997, "learning_rate": 4.4570979270586945e-05, "loss": 0.1222, "step": 4117 }, { "epoch": 6.546899841017488, "grad_norm": 5.439764390808079, "learning_rate": 4.4576152911451264e-05, "loss": 0.1371, "step": 4118 }, { "epoch": 6.548489666136725, "grad_norm": 3.053685340926211, "learning_rate": 4.458132500345352e-05, "loss": 0.1883, "step": 4119 }, { "epoch": 6.550079491255962, "grad_norm": 4.0139912979387695, "learning_rate": 4.4586495544843146e-05, "loss": 0.1535, "step": 4120 }, { "epoch": 6.5516693163751984, "grad_norm": 3.7522915852810192, "learning_rate": 4.4591664533870125e-05, "loss": 0.2158, "step": 4121 }, { "epoch": 6.5532591414944354, "grad_norm": 5.286735107164596, "learning_rate": 4.4596831968784905e-05, "loss": 0.1543, "step": 4122 }, { "epoch": 6.5548489666136724, "grad_norm": 24.81570323891182, "learning_rate": 4.460199784783852e-05, "loss": 0.6685, "step": 4123 }, { "epoch": 6.556438791732909, "grad_norm": 1.636222219007359, "learning_rate": 4.46071621692825e-05, "loss": 0.2099, "step": 4124 }, { "epoch": 6.558028616852146, "grad_norm": 2.6957889145113394, "learning_rate": 4.4612324931368906e-05, "loss": 0.1945, "step": 4125 }, { "epoch": 6.559618441971383, "grad_norm": 2.98671915047828, "learning_rate": 4.461748613235034e-05, "loss": 0.2493, "step": 4126 }, { "epoch": 6.56120826709062, "grad_norm": 2.230162438451041, "learning_rate": 4.462264577047992e-05, "loss": 0.2047, "step": 4127 }, { "epoch": 6.5627980922098565, "grad_norm": 2.312870576446253, "learning_rate": 4.4627803844011284e-05, "loss": 0.192, "step": 4128 }, { "epoch": 6.5643879173290935, "grad_norm": 6.47178567223315, "learning_rate": 4.463296035119862e-05, "loss": 0.7484, "step": 4129 }, { "epoch": 6.5659777424483305, "grad_norm": 4.390119438444852, "learning_rate": 4.463811529029664e-05, "loss": 0.1468, "step": 4130 }, { "epoch": 6.5675675675675675, "grad_norm": 91.16691167333389, "learning_rate": 4.4643268659560574e-05, "loss": 1.547, "step": 4131 }, { "epoch": 6.5691573926868045, "grad_norm": 3.7916995779786697, "learning_rate": 4.464842045724619e-05, "loss": 0.1329, "step": 4132 }, { "epoch": 6.5707472178060415, "grad_norm": 3.4851842886411224, "learning_rate": 4.465357068160982e-05, "loss": 0.1345, "step": 4133 }, { "epoch": 6.5723370429252785, "grad_norm": 2.0932544097302785, "learning_rate": 4.4658719330908266e-05, "loss": 0.1186, "step": 4134 }, { "epoch": 6.573926868044515, "grad_norm": 1.4160980390820952, "learning_rate": 4.466386640339892e-05, "loss": 0.2123, "step": 4135 }, { "epoch": 6.575516693163752, "grad_norm": 2.80095406252407, "learning_rate": 4.466901189733966e-05, "loss": 0.137, "step": 4136 }, { "epoch": 6.577106518282989, "grad_norm": 2.8024390289593004, "learning_rate": 4.467415581098895e-05, "loss": 0.1364, "step": 4137 }, { "epoch": 6.578696343402226, "grad_norm": 2.5120131719083494, "learning_rate": 4.4679298142605734e-05, "loss": 0.1438, "step": 4138 }, { "epoch": 6.580286168521463, "grad_norm": 3.509051078365209, "learning_rate": 4.4684438890449545e-05, "loss": 0.1624, "step": 4139 }, { "epoch": 6.5818759936407, "grad_norm": 2.5540530262876016, "learning_rate": 4.4689578052780405e-05, "loss": 0.1431, "step": 4140 }, { "epoch": 6.583465818759937, "grad_norm": 2.7274762298021016, "learning_rate": 4.469471562785891e-05, "loss": 0.1087, "step": 4141 }, { "epoch": 6.585055643879174, "grad_norm": 1.7849404948383736, "learning_rate": 4.469985161394617e-05, "loss": 0.183, "step": 4142 }, { "epoch": 6.586645468998411, "grad_norm": 1.950400597448263, "learning_rate": 4.470498600930383e-05, "loss": 0.1577, "step": 4143 }, { "epoch": 6.588235294117647, "grad_norm": 2.170789275413261, "learning_rate": 4.47101188121941e-05, "loss": 0.1968, "step": 4144 }, { "epoch": 6.589825119236884, "grad_norm": 1.922325604979357, "learning_rate": 4.4715250020879706e-05, "loss": 0.1515, "step": 4145 }, { "epoch": 6.591414944356121, "grad_norm": 3.041581714208334, "learning_rate": 4.472037963362391e-05, "loss": 0.1483, "step": 4146 }, { "epoch": 6.593004769475358, "grad_norm": 3.68883489382966, "learning_rate": 4.472550764869054e-05, "loss": 0.235, "step": 4147 }, { "epoch": 6.594594594594595, "grad_norm": 1.8267905228025465, "learning_rate": 4.473063406434394e-05, "loss": 0.1559, "step": 4148 }, { "epoch": 6.596184419713832, "grad_norm": 1.977957817884437, "learning_rate": 4.473575887884901e-05, "loss": 0.1853, "step": 4149 }, { "epoch": 6.597774244833069, "grad_norm": 3.2718560253419247, "learning_rate": 4.4740882090471163e-05, "loss": 0.2213, "step": 4150 }, { "epoch": 6.599364069952305, "grad_norm": 3.029076651509095, "learning_rate": 4.474600369747641e-05, "loss": 0.1025, "step": 4151 }, { "epoch": 6.600953895071542, "grad_norm": 1.3847918791207174, "learning_rate": 4.4751123698131245e-05, "loss": 0.1289, "step": 4152 }, { "epoch": 6.602543720190779, "grad_norm": 2.70714103735813, "learning_rate": 4.475624209070276e-05, "loss": 0.1472, "step": 4153 }, { "epoch": 6.604133545310016, "grad_norm": 3.2641256215652135, "learning_rate": 4.476135887345854e-05, "loss": 0.1951, "step": 4154 }, { "epoch": 6.605723370429253, "grad_norm": 2.803762746952952, "learning_rate": 4.4766474044666746e-05, "loss": 0.1639, "step": 4155 }, { "epoch": 6.60731319554849, "grad_norm": 1.4804773844359571, "learning_rate": 4.4771587602596086e-05, "loss": 0.163, "step": 4156 }, { "epoch": 6.608903020667727, "grad_norm": 2.161964297812788, "learning_rate": 4.47766995455158e-05, "loss": 0.1796, "step": 4157 }, { "epoch": 6.610492845786963, "grad_norm": 2.097577918500342, "learning_rate": 4.478180987169568e-05, "loss": 0.135, "step": 4158 }, { "epoch": 6.6120826709062, "grad_norm": 1.8879819006038643, "learning_rate": 4.478691857940607e-05, "loss": 0.1929, "step": 4159 }, { "epoch": 6.613672496025437, "grad_norm": 1.735963788081695, "learning_rate": 4.479202566691785e-05, "loss": 0.1537, "step": 4160 }, { "epoch": 6.615262321144674, "grad_norm": 1.5855322385465556, "learning_rate": 4.479713113250246e-05, "loss": 0.1633, "step": 4161 }, { "epoch": 6.616852146263911, "grad_norm": 1.402199399484262, "learning_rate": 4.4802234974431896e-05, "loss": 0.1688, "step": 4162 }, { "epoch": 6.618441971383148, "grad_norm": 2.702890457660564, "learning_rate": 4.480733719097867e-05, "loss": 0.1729, "step": 4163 }, { "epoch": 6.620031796502385, "grad_norm": 1.2603386268854597, "learning_rate": 4.481243778041588e-05, "loss": 0.1766, "step": 4164 }, { "epoch": 6.621621621621622, "grad_norm": 2.856529865742124, "learning_rate": 4.481753674101716e-05, "loss": 0.1712, "step": 4165 }, { "epoch": 6.623211446740859, "grad_norm": 3.67855357477022, "learning_rate": 4.4822634071056686e-05, "loss": 0.1659, "step": 4166 }, { "epoch": 6.624801271860095, "grad_norm": 2.153455469474261, "learning_rate": 4.4827729768809214e-05, "loss": 0.1477, "step": 4167 }, { "epoch": 6.626391096979332, "grad_norm": 5.153891845695849, "learning_rate": 4.4832823832550024e-05, "loss": 0.1441, "step": 4168 }, { "epoch": 6.627980922098569, "grad_norm": 2.214953228943529, "learning_rate": 4.483791626055497e-05, "loss": 0.2045, "step": 4169 }, { "epoch": 6.629570747217806, "grad_norm": 4.326888221329487, "learning_rate": 4.484300705110043e-05, "loss": 0.1558, "step": 4170 }, { "epoch": 6.631160572337043, "grad_norm": 2.0854850060483643, "learning_rate": 4.4848096202463376e-05, "loss": 0.1146, "step": 4171 }, { "epoch": 6.63275039745628, "grad_norm": 1.7508905779242037, "learning_rate": 4.48531837129213e-05, "loss": 0.1712, "step": 4172 }, { "epoch": 6.634340222575517, "grad_norm": 3.3784266176558533, "learning_rate": 4.485826958075227e-05, "loss": 0.1355, "step": 4173 }, { "epoch": 6.635930047694753, "grad_norm": 2.434623114463907, "learning_rate": 4.4863353804234906e-05, "loss": 0.1648, "step": 4174 }, { "epoch": 6.63751987281399, "grad_norm": 2.0246151551374063, "learning_rate": 4.486843638164838e-05, "loss": 0.202, "step": 4175 }, { "epoch": 6.639109697933227, "grad_norm": 2.7715544550132885, "learning_rate": 4.487351731127243e-05, "loss": 0.1842, "step": 4176 }, { "epoch": 6.640699523052464, "grad_norm": 3.272943522867485, "learning_rate": 4.4878596591387335e-05, "loss": 0.1369, "step": 4177 }, { "epoch": 6.642289348171701, "grad_norm": 5.164290188797228, "learning_rate": 4.488367422027394e-05, "loss": 0.1451, "step": 4178 }, { "epoch": 6.643879173290938, "grad_norm": 1.3550320986289068, "learning_rate": 4.4888750196213664e-05, "loss": 0.143, "step": 4179 }, { "epoch": 6.645468998410175, "grad_norm": 2.0748276021394583, "learning_rate": 4.489382451748846e-05, "loss": 0.1865, "step": 4180 }, { "epoch": 6.647058823529412, "grad_norm": 3.8053787930494853, "learning_rate": 4.4898897182380874e-05, "loss": 0.1852, "step": 4181 }, { "epoch": 6.648648648648649, "grad_norm": 1.405798674807752, "learning_rate": 4.4903968189173975e-05, "loss": 0.1522, "step": 4182 }, { "epoch": 6.650238473767885, "grad_norm": 4.3563684087195265, "learning_rate": 4.490903753615141e-05, "loss": 0.1578, "step": 4183 }, { "epoch": 6.651828298887122, "grad_norm": 1.3022724142111404, "learning_rate": 4.4914105221597396e-05, "loss": 0.1489, "step": 4184 }, { "epoch": 6.653418124006359, "grad_norm": 17.370899541976673, "learning_rate": 4.4919171243796705e-05, "loss": 2.6541, "step": 4185 }, { "epoch": 6.655007949125596, "grad_norm": 15.322852884599909, "learning_rate": 4.492423560103467e-05, "loss": 2.1362, "step": 4186 }, { "epoch": 6.656597774244833, "grad_norm": 2.636254950106106, "learning_rate": 4.492929829159719e-05, "loss": 0.1696, "step": 4187 }, { "epoch": 6.65818759936407, "grad_norm": 2.5892144107281476, "learning_rate": 4.4934359313770734e-05, "loss": 0.0853, "step": 4188 }, { "epoch": 6.659777424483307, "grad_norm": 3.585243968836552, "learning_rate": 4.493941866584231e-05, "loss": 0.1621, "step": 4189 }, { "epoch": 6.661367249602543, "grad_norm": 5.068043929543862, "learning_rate": 4.494447634609953e-05, "loss": 0.1613, "step": 4190 }, { "epoch": 6.66295707472178, "grad_norm": 4.3375980187874985, "learning_rate": 4.4949532352830546e-05, "loss": 0.2161, "step": 4191 }, { "epoch": 6.664546899841017, "grad_norm": 3.9491016709397675, "learning_rate": 4.4954586684324084e-05, "loss": 0.1731, "step": 4192 }, { "epoch": 6.666136724960254, "grad_norm": 3.9313800110846655, "learning_rate": 4.4959639338869424e-05, "loss": 0.1721, "step": 4193 }, { "epoch": 6.667726550079491, "grad_norm": 4.609925836392233, "learning_rate": 4.496469031475644e-05, "loss": 0.1738, "step": 4194 }, { "epoch": 6.669316375198728, "grad_norm": 2.419974275945473, "learning_rate": 4.4969739610275554e-05, "loss": 0.1931, "step": 4195 }, { "epoch": 6.670906200317965, "grad_norm": 2.934184736063685, "learning_rate": 4.4974787223717766e-05, "loss": 0.1555, "step": 4196 }, { "epoch": 6.672496025437201, "grad_norm": 4.254513886997416, "learning_rate": 4.4979833153374644e-05, "loss": 0.1962, "step": 4197 }, { "epoch": 6.674085850556438, "grad_norm": 2.598687127600904, "learning_rate": 4.4984877397538306e-05, "loss": 0.2055, "step": 4198 }, { "epoch": 6.675675675675675, "grad_norm": 3.5935872176578365, "learning_rate": 4.498991995450147e-05, "loss": 0.1868, "step": 4199 }, { "epoch": 6.677265500794912, "grad_norm": 2.500992619321777, "learning_rate": 4.4994960822557425e-05, "loss": 0.1168, "step": 4200 }, { "epoch": 6.678855325914149, "grad_norm": 2.6862905780065924, "learning_rate": 4.5e-05, "loss": 0.1639, "step": 4201 }, { "epoch": 6.680445151033386, "grad_norm": 4.832893228649769, "learning_rate": 4.500503748512363e-05, "loss": 0.1402, "step": 4202 }, { "epoch": 6.682034976152623, "grad_norm": 4.157398719065339, "learning_rate": 4.5010073276223296e-05, "loss": 0.1548, "step": 4203 }, { "epoch": 6.68362480127186, "grad_norm": 2.4766670360841734, "learning_rate": 4.5015107371594575e-05, "loss": 0.1744, "step": 4204 }, { "epoch": 6.685214626391097, "grad_norm": 2.7552527786534777, "learning_rate": 4.5020139769533606e-05, "loss": 0.1739, "step": 4205 }, { "epoch": 6.6868044515103335, "grad_norm": 4.45181114027743, "learning_rate": 4.50251704683371e-05, "loss": 0.2036, "step": 4206 }, { "epoch": 6.6883942766295705, "grad_norm": 5.509601070433082, "learning_rate": 4.5030199466302356e-05, "loss": 0.2051, "step": 4207 }, { "epoch": 6.6899841017488075, "grad_norm": 2.3072765541269975, "learning_rate": 4.5035226761727226e-05, "loss": 0.1374, "step": 4208 }, { "epoch": 6.6915739268680445, "grad_norm": 2.8937502254205936, "learning_rate": 4.504025235291017e-05, "loss": 0.1831, "step": 4209 }, { "epoch": 6.6931637519872815, "grad_norm": 5.099268008900632, "learning_rate": 4.50452762381502e-05, "loss": 0.221, "step": 4210 }, { "epoch": 6.6947535771065185, "grad_norm": 2.737278740735534, "learning_rate": 4.5050298415746904e-05, "loss": 0.1235, "step": 4211 }, { "epoch": 6.6963434022257555, "grad_norm": 2.934869150419045, "learning_rate": 4.5055318884000465e-05, "loss": 0.1407, "step": 4212 }, { "epoch": 6.697933227344992, "grad_norm": 2.3096486699797416, "learning_rate": 4.506033764121164e-05, "loss": 0.1469, "step": 4213 }, { "epoch": 6.699523052464229, "grad_norm": 5.501162671204545, "learning_rate": 4.506535468568176e-05, "loss": 0.2199, "step": 4214 }, { "epoch": 6.701112877583466, "grad_norm": 4.385089568425522, "learning_rate": 4.5070370015712725e-05, "loss": 0.1697, "step": 4215 }, { "epoch": 6.702702702702703, "grad_norm": 4.4453416900484255, "learning_rate": 4.507538362960704e-05, "loss": 0.1079, "step": 4216 }, { "epoch": 6.70429252782194, "grad_norm": 6.759986990227837, "learning_rate": 4.508039552566778e-05, "loss": 0.1811, "step": 4217 }, { "epoch": 6.705882352941177, "grad_norm": 3.5159415059676835, "learning_rate": 4.5085405702198595e-05, "loss": 0.2381, "step": 4218 }, { "epoch": 6.707472178060414, "grad_norm": 5.5315579210461925, "learning_rate": 4.509041415750372e-05, "loss": 0.191, "step": 4219 }, { "epoch": 6.709062003179651, "grad_norm": 4.728651012535913, "learning_rate": 4.5095420889887966e-05, "loss": 0.1841, "step": 4220 }, { "epoch": 6.710651828298887, "grad_norm": 4.274269797975611, "learning_rate": 4.510042589765676e-05, "loss": 0.1214, "step": 4221 }, { "epoch": 6.712241653418124, "grad_norm": 3.931606624782243, "learning_rate": 4.510542917911606e-05, "loss": 0.1772, "step": 4222 }, { "epoch": 6.713831478537361, "grad_norm": 6.196075958716857, "learning_rate": 4.511043073257246e-05, "loss": 0.3037, "step": 4223 }, { "epoch": 6.715421303656598, "grad_norm": 5.222759100952918, "learning_rate": 4.51154305563331e-05, "loss": 0.235, "step": 4224 }, { "epoch": 6.717011128775835, "grad_norm": 3.32644777557435, "learning_rate": 4.512042864870572e-05, "loss": 0.1294, "step": 4225 }, { "epoch": 6.718600953895072, "grad_norm": 2.9568315292353033, "learning_rate": 4.5125425007998656e-05, "loss": 0.1719, "step": 4226 }, { "epoch": 6.720190779014309, "grad_norm": 8.207785904820122, "learning_rate": 4.513041963252082e-05, "loss": 0.177, "step": 4227 }, { "epoch": 6.721780604133546, "grad_norm": 2.388638636840881, "learning_rate": 4.51354125205817e-05, "loss": 0.1841, "step": 4228 }, { "epoch": 6.723370429252782, "grad_norm": 5.455312028458112, "learning_rate": 4.514040367049141e-05, "loss": 0.2682, "step": 4229 }, { "epoch": 6.724960254372019, "grad_norm": 5.399574841592194, "learning_rate": 4.5145393080560596e-05, "loss": 0.1825, "step": 4230 }, { "epoch": 6.726550079491256, "grad_norm": 3.2279298308830953, "learning_rate": 4.515038074910055e-05, "loss": 0.1722, "step": 4231 }, { "epoch": 6.728139904610493, "grad_norm": 2.2987957091093056, "learning_rate": 4.515536667442311e-05, "loss": 0.15, "step": 4232 }, { "epoch": 6.72972972972973, "grad_norm": 4.5221904119019225, "learning_rate": 4.516035085484072e-05, "loss": 0.1829, "step": 4233 }, { "epoch": 6.731319554848967, "grad_norm": 3.9881168553638973, "learning_rate": 4.516533328866642e-05, "loss": 0.1808, "step": 4234 }, { "epoch": 6.732909379968204, "grad_norm": 5.250577837520792, "learning_rate": 4.5170313974213846e-05, "loss": 0.2266, "step": 4235 }, { "epoch": 6.73449920508744, "grad_norm": 3.761486198326603, "learning_rate": 4.51752929097972e-05, "loss": 0.1721, "step": 4236 }, { "epoch": 6.736089030206677, "grad_norm": 2.7902909318562514, "learning_rate": 4.51802700937313e-05, "loss": 0.163, "step": 4237 }, { "epoch": 6.737678855325914, "grad_norm": 2.296660053029683, "learning_rate": 4.518524552433156e-05, "loss": 0.2288, "step": 4238 }, { "epoch": 6.739268680445151, "grad_norm": 3.3586708776925844, "learning_rate": 4.5190219199913963e-05, "loss": 0.1861, "step": 4239 }, { "epoch": 6.740858505564388, "grad_norm": 2.341420325013109, "learning_rate": 4.5195191118795096e-05, "loss": 0.1609, "step": 4240 }, { "epoch": 6.742448330683625, "grad_norm": 3.3639614830927393, "learning_rate": 4.5200161279292155e-05, "loss": 0.7839, "step": 4241 }, { "epoch": 6.744038155802862, "grad_norm": 2.4842270030998814, "learning_rate": 4.520512967972292e-05, "loss": 0.2218, "step": 4242 }, { "epoch": 6.745627980922099, "grad_norm": 4.571691286436985, "learning_rate": 4.521009631840576e-05, "loss": 0.1635, "step": 4243 }, { "epoch": 6.747217806041336, "grad_norm": 6.129616391721861, "learning_rate": 4.5215061193659664e-05, "loss": 0.2904, "step": 4244 }, { "epoch": 6.748807631160572, "grad_norm": 3.6105297440774478, "learning_rate": 4.5220024303804185e-05, "loss": 0.1336, "step": 4245 }, { "epoch": 6.750397456279809, "grad_norm": 2.4811524700858616, "learning_rate": 4.522498564715949e-05, "loss": 0.2214, "step": 4246 }, { "epoch": 6.751987281399046, "grad_norm": 2.843715580604871, "learning_rate": 4.5229945222046355e-05, "loss": 0.2092, "step": 4247 }, { "epoch": 6.753577106518283, "grad_norm": 10.102758532659559, "learning_rate": 4.5234903026786134e-05, "loss": 0.2153, "step": 4248 }, { "epoch": 6.75516693163752, "grad_norm": 3.2370048772050226, "learning_rate": 4.523985905970079e-05, "loss": 0.1999, "step": 4249 }, { "epoch": 6.756756756756757, "grad_norm": 2.3467772695946514, "learning_rate": 4.52448133191129e-05, "loss": 0.1515, "step": 4250 }, { "epoch": 6.758346581875994, "grad_norm": 2.8136175582492573, "learning_rate": 4.52497658033456e-05, "loss": 0.2036, "step": 4251 }, { "epoch": 6.75993640699523, "grad_norm": 6.027317266342272, "learning_rate": 4.525471651072268e-05, "loss": 0.1909, "step": 4252 }, { "epoch": 6.761526232114467, "grad_norm": 4.682268661443079, "learning_rate": 4.525966543956849e-05, "loss": 0.1509, "step": 4253 }, { "epoch": 6.763116057233704, "grad_norm": 4.06777493323999, "learning_rate": 4.5264612588207996e-05, "loss": 0.2328, "step": 4254 }, { "epoch": 6.764705882352941, "grad_norm": 2.527735934494563, "learning_rate": 4.526955795496678e-05, "loss": 0.173, "step": 4255 }, { "epoch": 6.766295707472178, "grad_norm": 9.625407897919226, "learning_rate": 4.5274501538171e-05, "loss": 0.3158, "step": 4256 }, { "epoch": 6.767885532591415, "grad_norm": 5.445454892596262, "learning_rate": 4.5279443336147434e-05, "loss": 0.2017, "step": 4257 }, { "epoch": 6.769475357710652, "grad_norm": 4.944569783427609, "learning_rate": 4.5284383347223474e-05, "loss": 0.2005, "step": 4258 }, { "epoch": 6.771065182829888, "grad_norm": 2.237953711332626, "learning_rate": 4.52893215697271e-05, "loss": 0.1583, "step": 4259 }, { "epoch": 6.772655007949125, "grad_norm": 4.544457996370721, "learning_rate": 4.529425800198689e-05, "loss": 0.2304, "step": 4260 }, { "epoch": 6.774244833068362, "grad_norm": 4.140267830458746, "learning_rate": 4.5299192642332046e-05, "loss": 0.1413, "step": 4261 }, { "epoch": 6.775834658187599, "grad_norm": 4.083390595783748, "learning_rate": 4.530412548909239e-05, "loss": 0.1449, "step": 4262 }, { "epoch": 6.777424483306836, "grad_norm": 3.714350425437931, "learning_rate": 4.530905654059831e-05, "loss": 0.1614, "step": 4263 }, { "epoch": 6.779014308426073, "grad_norm": 30.794175521195413, "learning_rate": 4.5313985795180835e-05, "loss": 0.5791, "step": 4264 }, { "epoch": 6.78060413354531, "grad_norm": 2.6406991720028823, "learning_rate": 4.531891325117158e-05, "loss": 0.1726, "step": 4265 }, { "epoch": 6.782193958664547, "grad_norm": 2.9349091080933793, "learning_rate": 4.5323838906902786e-05, "loss": 0.2118, "step": 4266 }, { "epoch": 6.783783783783784, "grad_norm": 4.094321752173411, "learning_rate": 4.53287627607073e-05, "loss": 0.1605, "step": 4267 }, { "epoch": 6.78537360890302, "grad_norm": 3.032077374341222, "learning_rate": 4.533368481091858e-05, "loss": 0.19, "step": 4268 }, { "epoch": 6.786963434022257, "grad_norm": 3.3632791445552117, "learning_rate": 4.533860505587067e-05, "loss": 0.1788, "step": 4269 }, { "epoch": 6.788553259141494, "grad_norm": 4.389297209223793, "learning_rate": 4.5343523493898264e-05, "loss": 0.209, "step": 4270 }, { "epoch": 6.790143084260731, "grad_norm": 2.3514727887599713, "learning_rate": 4.534844012333665e-05, "loss": 0.2007, "step": 4271 }, { "epoch": 6.791732909379968, "grad_norm": 51.06739310629877, "learning_rate": 4.53533549425217e-05, "loss": 6.4069, "step": 4272 }, { "epoch": 6.793322734499205, "grad_norm": 4.980417288008446, "learning_rate": 4.535826794978997e-05, "loss": 0.1832, "step": 4273 }, { "epoch": 6.794912559618442, "grad_norm": 3.0695349264769654, "learning_rate": 4.5363179143478554e-05, "loss": 0.1907, "step": 4274 }, { "epoch": 6.796502384737678, "grad_norm": 5.374833210228057, "learning_rate": 4.536808852192519e-05, "loss": 0.2006, "step": 4275 }, { "epoch": 6.798092209856915, "grad_norm": 3.8594680338551015, "learning_rate": 4.537299608346824e-05, "loss": 0.1276, "step": 4276 }, { "epoch": 6.799682034976152, "grad_norm": 3.629918102998424, "learning_rate": 4.537790182644667e-05, "loss": 0.1877, "step": 4277 }, { "epoch": 6.801271860095389, "grad_norm": 3.2901623352864706, "learning_rate": 4.538280574920007e-05, "loss": 0.1619, "step": 4278 }, { "epoch": 6.802861685214626, "grad_norm": 4.572962645668463, "learning_rate": 4.538770785006863e-05, "loss": 0.1019, "step": 4279 }, { "epoch": 6.804451510333863, "grad_norm": 2.842924626115517, "learning_rate": 4.539260812739317e-05, "loss": 0.1859, "step": 4280 }, { "epoch": 6.8060413354531, "grad_norm": 2.3743410985348965, "learning_rate": 4.5397506579515124e-05, "loss": 0.1867, "step": 4281 }, { "epoch": 6.807631160572337, "grad_norm": 2.3935101520924036, "learning_rate": 4.540240320477655e-05, "loss": 0.1821, "step": 4282 }, { "epoch": 6.809220985691574, "grad_norm": 6.388679439023524, "learning_rate": 4.540729800152011e-05, "loss": 0.2968, "step": 4283 }, { "epoch": 6.8108108108108105, "grad_norm": 3.6507247505750353, "learning_rate": 4.5412190968089085e-05, "loss": 0.1913, "step": 4284 }, { "epoch": 6.8124006359300475, "grad_norm": 2.384285220432607, "learning_rate": 4.54170821028274e-05, "loss": 0.1533, "step": 4285 }, { "epoch": 6.8139904610492845, "grad_norm": 3.9267428597959815, "learning_rate": 4.5421971404079575e-05, "loss": 0.2098, "step": 4286 }, { "epoch": 6.8155802861685215, "grad_norm": 2.1829105065184695, "learning_rate": 4.542685887019075e-05, "loss": 0.119, "step": 4287 }, { "epoch": 6.8171701112877585, "grad_norm": 2.1415477130501004, "learning_rate": 4.5431744499506714e-05, "loss": 0.1225, "step": 4288 }, { "epoch": 6.8187599364069955, "grad_norm": 4.522357963564039, "learning_rate": 4.543662829037383e-05, "loss": 0.1602, "step": 4289 }, { "epoch": 6.8203497615262325, "grad_norm": 4.371273099884999, "learning_rate": 4.544151024113914e-05, "loss": 0.167, "step": 4290 }, { "epoch": 6.821939586645469, "grad_norm": 2.4110927872583803, "learning_rate": 4.544639035015027e-05, "loss": 0.1381, "step": 4291 }, { "epoch": 6.823529411764706, "grad_norm": 2.229744073403196, "learning_rate": 4.545126861575548e-05, "loss": 0.1652, "step": 4292 }, { "epoch": 6.825119236883943, "grad_norm": 6.7911831435048, "learning_rate": 4.545614503630365e-05, "loss": 0.1837, "step": 4293 }, { "epoch": 6.82670906200318, "grad_norm": 3.3756677187424606, "learning_rate": 4.546101961014429e-05, "loss": 0.1496, "step": 4294 }, { "epoch": 6.828298887122417, "grad_norm": 4.4074980558484205, "learning_rate": 4.546589233562754e-05, "loss": 0.1842, "step": 4295 }, { "epoch": 6.829888712241654, "grad_norm": 5.057782872498222, "learning_rate": 4.547076321110415e-05, "loss": 0.7432, "step": 4296 }, { "epoch": 6.831478537360891, "grad_norm": 4.949139918646627, "learning_rate": 4.54756322349255e-05, "loss": 0.1609, "step": 4297 }, { "epoch": 6.833068362480127, "grad_norm": 4.4383495922621155, "learning_rate": 4.548049940544362e-05, "loss": 0.146, "step": 4298 }, { "epoch": 6.834658187599364, "grad_norm": 3.133071970652931, "learning_rate": 4.548536472101114e-05, "loss": 0.1566, "step": 4299 }, { "epoch": 6.836248012718601, "grad_norm": 6.3056890865951605, "learning_rate": 4.5490228179981325e-05, "loss": 0.2553, "step": 4300 }, { "epoch": 6.837837837837838, "grad_norm": 2.3891160107537237, "learning_rate": 4.549508978070806e-05, "loss": 0.1564, "step": 4301 }, { "epoch": 6.839427662957075, "grad_norm": 2.613451198051224, "learning_rate": 4.549994952154589e-05, "loss": 0.1453, "step": 4302 }, { "epoch": 6.841017488076312, "grad_norm": 2.6797437097388714, "learning_rate": 4.550480740084996e-05, "loss": 0.2843, "step": 4303 }, { "epoch": 6.842607313195549, "grad_norm": 1.7930253974671073, "learning_rate": 4.5509663416976045e-05, "loss": 0.1174, "step": 4304 }, { "epoch": 6.844197138314786, "grad_norm": 2.8362761467484563, "learning_rate": 4.551451756828058e-05, "loss": 0.199, "step": 4305 }, { "epoch": 6.845786963434023, "grad_norm": 2.267866454877451, "learning_rate": 4.5519369853120585e-05, "loss": 0.2242, "step": 4306 }, { "epoch": 6.847376788553259, "grad_norm": 3.1216264561110316, "learning_rate": 4.5524220269853755e-05, "loss": 0.1918, "step": 4307 }, { "epoch": 6.848966613672496, "grad_norm": 2.6818501923676243, "learning_rate": 4.552906881683839e-05, "loss": 0.2023, "step": 4308 }, { "epoch": 6.850556438791733, "grad_norm": 3.2857185659601233, "learning_rate": 4.553391549243344e-05, "loss": 0.2123, "step": 4309 }, { "epoch": 6.85214626391097, "grad_norm": 3.349193875983658, "learning_rate": 4.553876029499848e-05, "loss": 0.1364, "step": 4310 }, { "epoch": 6.853736089030207, "grad_norm": 2.7059401189543677, "learning_rate": 4.5543603222893715e-05, "loss": 0.1409, "step": 4311 }, { "epoch": 6.855325914149444, "grad_norm": 3.9090392039715423, "learning_rate": 4.554844427447999e-05, "loss": 0.265, "step": 4312 }, { "epoch": 6.856915739268681, "grad_norm": 4.245928169312547, "learning_rate": 4.555328344811879e-05, "loss": 0.3164, "step": 4313 }, { "epoch": 6.858505564387917, "grad_norm": 2.986860661809424, "learning_rate": 4.5558120742172235e-05, "loss": 0.1299, "step": 4314 }, { "epoch": 6.860095389507154, "grad_norm": 3.8900717088854915, "learning_rate": 4.5562956155003055e-05, "loss": 0.1875, "step": 4315 }, { "epoch": 6.861685214626391, "grad_norm": 1.7722948444032445, "learning_rate": 4.5567789684974644e-05, "loss": 0.1188, "step": 4316 }, { "epoch": 6.863275039745628, "grad_norm": 3.590403468719313, "learning_rate": 4.557262133045105e-05, "loss": 0.1276, "step": 4317 }, { "epoch": 6.864864864864865, "grad_norm": 3.0243520817716476, "learning_rate": 4.5577451089796905e-05, "loss": 0.1848, "step": 4318 }, { "epoch": 6.866454689984102, "grad_norm": 2.9257651026889007, "learning_rate": 4.558227896137753e-05, "loss": 0.14, "step": 4319 }, { "epoch": 6.868044515103339, "grad_norm": 2.4600606204485387, "learning_rate": 4.558710494355886e-05, "loss": 0.1377, "step": 4320 }, { "epoch": 6.869634340222575, "grad_norm": 2.0427304241410353, "learning_rate": 4.5591929034707465e-05, "loss": 0.1713, "step": 4321 }, { "epoch": 6.871224165341812, "grad_norm": 4.692690811229895, "learning_rate": 4.5596751233190586e-05, "loss": 0.2372, "step": 4322 }, { "epoch": 6.872813990461049, "grad_norm": 2.409305668481138, "learning_rate": 4.560157153737607e-05, "loss": 0.1504, "step": 4323 }, { "epoch": 6.874403815580286, "grad_norm": 1.7477798440277965, "learning_rate": 4.560638994563242e-05, "loss": 0.1207, "step": 4324 }, { "epoch": 6.875993640699523, "grad_norm": 2.1574900068327767, "learning_rate": 4.561120645632878e-05, "loss": 0.1841, "step": 4325 }, { "epoch": 6.87758346581876, "grad_norm": 2.964160212310907, "learning_rate": 4.561602106783493e-05, "loss": 0.1716, "step": 4326 }, { "epoch": 6.879173290937997, "grad_norm": 2.627997605819045, "learning_rate": 4.56208337785213e-05, "loss": 0.1461, "step": 4327 }, { "epoch": 6.880763116057234, "grad_norm": 2.620367548782156, "learning_rate": 4.562564458675898e-05, "loss": 0.1811, "step": 4328 }, { "epoch": 6.882352941176471, "grad_norm": 1.804058886365865, "learning_rate": 4.563045349091967e-05, "loss": 0.1349, "step": 4329 }, { "epoch": 6.883942766295707, "grad_norm": 4.502258630834874, "learning_rate": 4.5635260489375715e-05, "loss": 0.155, "step": 4330 }, { "epoch": 6.885532591414944, "grad_norm": 1.795624252467874, "learning_rate": 4.564006558050015e-05, "loss": 0.1532, "step": 4331 }, { "epoch": 6.887122416534181, "grad_norm": 2.7049792818053437, "learning_rate": 4.56448687626666e-05, "loss": 0.1847, "step": 4332 }, { "epoch": 6.888712241653418, "grad_norm": 3.0312981410024817, "learning_rate": 4.564967003424938e-05, "loss": 0.1483, "step": 4333 }, { "epoch": 6.890302066772655, "grad_norm": 2.6143849248285536, "learning_rate": 4.565446939362343e-05, "loss": 0.1399, "step": 4334 }, { "epoch": 6.891891891891892, "grad_norm": 2.756393685075757, "learning_rate": 4.5659266839164335e-05, "loss": 0.2123, "step": 4335 }, { "epoch": 6.893481717011129, "grad_norm": 2.797721396692664, "learning_rate": 4.5664062369248324e-05, "loss": 0.2126, "step": 4336 }, { "epoch": 6.895071542130365, "grad_norm": 1.8284739094170694, "learning_rate": 4.5668855982252314e-05, "loss": 0.2238, "step": 4337 }, { "epoch": 6.896661367249602, "grad_norm": 1.6322523983765866, "learning_rate": 4.567364767655381e-05, "loss": 0.0908, "step": 4338 }, { "epoch": 6.898251192368839, "grad_norm": 3.8948783100518494, "learning_rate": 4.567843745053101e-05, "loss": 0.2126, "step": 4339 }, { "epoch": 6.899841017488076, "grad_norm": 2.308803879861451, "learning_rate": 4.5683225302562756e-05, "loss": 0.1385, "step": 4340 }, { "epoch": 6.901430842607313, "grad_norm": 1.9766678525387713, "learning_rate": 4.568801123102852e-05, "loss": 0.2095, "step": 4341 }, { "epoch": 6.90302066772655, "grad_norm": 3.126412347455305, "learning_rate": 4.569279523430844e-05, "loss": 0.1705, "step": 4342 }, { "epoch": 6.904610492845787, "grad_norm": 1.8700961476505393, "learning_rate": 4.569757731078332e-05, "loss": 0.158, "step": 4343 }, { "epoch": 6.906200317965024, "grad_norm": 2.0929624950407475, "learning_rate": 4.570235745883458e-05, "loss": 0.1595, "step": 4344 }, { "epoch": 6.907790143084261, "grad_norm": 2.4512566135424154, "learning_rate": 4.570713567684431e-05, "loss": 0.1365, "step": 4345 }, { "epoch": 6.909379968203497, "grad_norm": 2.7950146792060724, "learning_rate": 4.571191196319529e-05, "loss": 0.0956, "step": 4346 }, { "epoch": 6.910969793322734, "grad_norm": 2.3457808396877655, "learning_rate": 4.5716686316270885e-05, "loss": 0.161, "step": 4347 }, { "epoch": 6.912559618441971, "grad_norm": 2.0457646872665274, "learning_rate": 4.5721458734455165e-05, "loss": 0.1356, "step": 4348 }, { "epoch": 6.914149443561208, "grad_norm": 2.935246167895616, "learning_rate": 4.572622921613284e-05, "loss": 0.1651, "step": 4349 }, { "epoch": 6.915739268680445, "grad_norm": 3.3088219843452036, "learning_rate": 4.573099775968926e-05, "loss": 0.1215, "step": 4350 }, { "epoch": 6.917329093799682, "grad_norm": 3.051272148340767, "learning_rate": 4.573576436351046e-05, "loss": 0.2047, "step": 4351 }, { "epoch": 6.918918918918919, "grad_norm": 2.6660671100869835, "learning_rate": 4.574052902598312e-05, "loss": 0.1658, "step": 4352 }, { "epoch": 6.920508744038155, "grad_norm": 18.312242504997883, "learning_rate": 4.574529174549456e-05, "loss": 4.2673, "step": 4353 }, { "epoch": 6.922098569157392, "grad_norm": 2.006590439935249, "learning_rate": 4.575005252043279e-05, "loss": 0.1773, "step": 4354 }, { "epoch": 6.923688394276629, "grad_norm": 5.997622103161415, "learning_rate": 4.575481134918645e-05, "loss": 0.1614, "step": 4355 }, { "epoch": 6.925278219395866, "grad_norm": 2.060800157233172, "learning_rate": 4.5759568230144836e-05, "loss": 0.166, "step": 4356 }, { "epoch": 6.926868044515103, "grad_norm": 2.6705672162478544, "learning_rate": 4.5764323161697934e-05, "loss": 0.2583, "step": 4357 }, { "epoch": 6.92845786963434, "grad_norm": 5.309504896189564, "learning_rate": 4.576907614223637e-05, "loss": 0.2566, "step": 4358 }, { "epoch": 6.930047694753577, "grad_norm": 2.4965523809698427, "learning_rate": 4.577382717015143e-05, "loss": 0.1188, "step": 4359 }, { "epoch": 6.9316375198728135, "grad_norm": 4.635435485717884, "learning_rate": 4.577857624383506e-05, "loss": 0.173, "step": 4360 }, { "epoch": 6.9332273449920505, "grad_norm": 3.1016496451273863, "learning_rate": 4.5783323361679865e-05, "loss": 0.2129, "step": 4361 }, { "epoch": 6.9348171701112875, "grad_norm": 3.7841862035443903, "learning_rate": 4.5788068522079134e-05, "loss": 0.1225, "step": 4362 }, { "epoch": 6.9364069952305245, "grad_norm": 2.596130237009024, "learning_rate": 4.579281172342679e-05, "loss": 0.1778, "step": 4363 }, { "epoch": 6.9379968203497615, "grad_norm": 3.22668710238125, "learning_rate": 4.5797552964117436e-05, "loss": 0.1713, "step": 4364 }, { "epoch": 6.9395866454689985, "grad_norm": 5.741533631003023, "learning_rate": 4.580229224254633e-05, "loss": 0.1422, "step": 4365 }, { "epoch": 6.9411764705882355, "grad_norm": 2.6977819659551185, "learning_rate": 4.58070295571094e-05, "loss": 0.1603, "step": 4366 }, { "epoch": 6.9427662957074725, "grad_norm": 4.114255220532861, "learning_rate": 4.5811764906203236e-05, "loss": 0.1409, "step": 4367 }, { "epoch": 6.9443561208267095, "grad_norm": 3.6080354154907397, "learning_rate": 4.581649828822509e-05, "loss": 0.2079, "step": 4368 }, { "epoch": 6.945945945945946, "grad_norm": 2.4449513024466434, "learning_rate": 4.5821229701572894e-05, "loss": 0.1348, "step": 4369 }, { "epoch": 6.947535771065183, "grad_norm": 3.4226311384991783, "learning_rate": 4.5825959144645234e-05, "loss": 0.1838, "step": 4370 }, { "epoch": 6.94912559618442, "grad_norm": 3.4660953849956, "learning_rate": 4.583068661584135e-05, "loss": 0.1693, "step": 4371 }, { "epoch": 6.950715421303657, "grad_norm": 2.5245847713834575, "learning_rate": 4.5835412113561176e-05, "loss": 0.1629, "step": 4372 }, { "epoch": 6.952305246422894, "grad_norm": 1.7529822562961754, "learning_rate": 4.5840135636205306e-05, "loss": 0.1331, "step": 4373 }, { "epoch": 6.953895071542131, "grad_norm": 2.8279357368346245, "learning_rate": 4.584485718217499e-05, "loss": 0.1539, "step": 4374 }, { "epoch": 6.955484896661368, "grad_norm": 2.9353023050915423, "learning_rate": 4.584957674987215e-05, "loss": 0.1765, "step": 4375 }, { "epoch": 6.957074721780604, "grad_norm": 3.9166450703605307, "learning_rate": 4.585429433769941e-05, "loss": 0.1555, "step": 4376 }, { "epoch": 6.958664546899841, "grad_norm": 3.617644257049665, "learning_rate": 4.585900994406001e-05, "loss": 0.1482, "step": 4377 }, { "epoch": 6.960254372019078, "grad_norm": 1.334397987442756, "learning_rate": 4.58637235673579e-05, "loss": 0.1292, "step": 4378 }, { "epoch": 6.961844197138315, "grad_norm": 4.386647232528946, "learning_rate": 4.586843520599768e-05, "loss": 0.3152, "step": 4379 }, { "epoch": 6.963434022257552, "grad_norm": 2.4032855336139445, "learning_rate": 4.587314485838464e-05, "loss": 0.1583, "step": 4380 }, { "epoch": 6.965023847376789, "grad_norm": 2.418217534989904, "learning_rate": 4.587785252292473e-05, "loss": 0.121, "step": 4381 }, { "epoch": 6.966613672496026, "grad_norm": 2.844770569230615, "learning_rate": 4.588255819802458e-05, "loss": 0.226, "step": 4382 }, { "epoch": 6.968203497615263, "grad_norm": 3.4541924175132688, "learning_rate": 4.588726188209149e-05, "loss": 0.144, "step": 4383 }, { "epoch": 6.9697933227345, "grad_norm": 1.7675646516164376, "learning_rate": 4.589196357353343e-05, "loss": 0.816, "step": 4384 }, { "epoch": 6.971383147853736, "grad_norm": 30.255251619940378, "learning_rate": 4.589666327075904e-05, "loss": 1.8304, "step": 4385 }, { "epoch": 6.972972972972973, "grad_norm": 1.9616277655494896, "learning_rate": 4.5901360972177645e-05, "loss": 0.1718, "step": 4386 }, { "epoch": 6.97456279809221, "grad_norm": 2.8000848053725957, "learning_rate": 4.5906056676199253e-05, "loss": 0.1079, "step": 4387 }, { "epoch": 6.976152623211447, "grad_norm": 3.6697587363326387, "learning_rate": 4.591075038123454e-05, "loss": 0.1254, "step": 4388 }, { "epoch": 6.977742448330684, "grad_norm": 2.459458957843938, "learning_rate": 4.591544208569484e-05, "loss": 0.144, "step": 4389 }, { "epoch": 6.979332273449921, "grad_norm": 2.510459502298899, "learning_rate": 4.59201317879922e-05, "loss": 0.1521, "step": 4390 }, { "epoch": 6.980922098569158, "grad_norm": 2.804673072802171, "learning_rate": 4.5924819486539306e-05, "loss": 0.1493, "step": 4391 }, { "epoch": 6.982511923688394, "grad_norm": 4.787218101497528, "learning_rate": 4.592950517974956e-05, "loss": 0.2671, "step": 4392 }, { "epoch": 6.984101748807631, "grad_norm": 2.1654109483200656, "learning_rate": 4.593418886603702e-05, "loss": 0.1451, "step": 4393 }, { "epoch": 6.985691573926868, "grad_norm": 39.85436083876924, "learning_rate": 4.593887054381641e-05, "loss": 5.2202, "step": 4394 }, { "epoch": 6.987281399046105, "grad_norm": 3.0615244642623236, "learning_rate": 4.594355021150317e-05, "loss": 0.1731, "step": 4395 }, { "epoch": 6.988871224165342, "grad_norm": 2.473062325600342, "learning_rate": 4.594822786751341e-05, "loss": 0.1317, "step": 4396 }, { "epoch": 6.990461049284579, "grad_norm": 2.024385544797275, "learning_rate": 4.59529035102639e-05, "loss": 0.1622, "step": 4397 }, { "epoch": 6.992050874403816, "grad_norm": 1.3244823476591756, "learning_rate": 4.59575771381721e-05, "loss": 0.1428, "step": 4398 }, { "epoch": 6.993640699523052, "grad_norm": 3.2145959096281578, "learning_rate": 4.596224874965616e-05, "loss": 0.1296, "step": 4399 }, { "epoch": 6.995230524642289, "grad_norm": 3.106402634986056, "learning_rate": 4.596691834313491e-05, "loss": 0.1391, "step": 4400 }, { "epoch": 6.996820349761526, "grad_norm": 2.642970761003322, "learning_rate": 4.5971585917027865e-05, "loss": 0.1522, "step": 4401 }, { "epoch": 6.998410174880763, "grad_norm": 2.439585989951723, "learning_rate": 4.597625146975521e-05, "loss": 0.1956, "step": 4402 }, { "epoch": 7.0, "grad_norm": 2.2644959959799493, "learning_rate": 4.598091499973784e-05, "loss": 0.1246, "step": 4403 }, { "epoch": 7.001589825119237, "grad_norm": 2.8063127097032483, "learning_rate": 4.598557650539731e-05, "loss": 0.2117, "step": 4404 }, { "epoch": 7.003179650238474, "grad_norm": 3.262544497764086, "learning_rate": 4.599023598515586e-05, "loss": 0.1711, "step": 4405 }, { "epoch": 7.004769475357711, "grad_norm": 3.1992101213967628, "learning_rate": 4.599489343743644e-05, "loss": 0.139, "step": 4406 }, { "epoch": 7.006359300476947, "grad_norm": 2.9548266851897567, "learning_rate": 4.5999548860662666e-05, "loss": 0.1376, "step": 4407 }, { "epoch": 7.007949125596184, "grad_norm": 2.058194666306494, "learning_rate": 4.600420225325885e-05, "loss": 0.098, "step": 4408 }, { "epoch": 7.009538950715421, "grad_norm": 2.8322112027718696, "learning_rate": 4.600885361364997e-05, "loss": 0.1298, "step": 4409 }, { "epoch": 7.011128775834658, "grad_norm": 3.384376411773975, "learning_rate": 4.6013502940261725e-05, "loss": 0.1196, "step": 4410 }, { "epoch": 7.012718600953895, "grad_norm": 2.2573931578408577, "learning_rate": 4.601815023152049e-05, "loss": 0.1349, "step": 4411 }, { "epoch": 7.014308426073132, "grad_norm": 1.4201740950932924, "learning_rate": 4.602279548585331e-05, "loss": 0.1522, "step": 4412 }, { "epoch": 7.015898251192369, "grad_norm": 4.663154812206325, "learning_rate": 4.602743870168794e-05, "loss": 0.1891, "step": 4413 }, { "epoch": 7.017488076311606, "grad_norm": 1.8636234624581416, "learning_rate": 4.6032079877452826e-05, "loss": 0.1185, "step": 4414 }, { "epoch": 7.019077901430842, "grad_norm": 4.571617784434723, "learning_rate": 4.603671901157709e-05, "loss": 0.305, "step": 4415 }, { "epoch": 7.020667726550079, "grad_norm": 3.6193585557374814, "learning_rate": 4.604135610249057e-05, "loss": 0.1599, "step": 4416 }, { "epoch": 7.022257551669316, "grad_norm": 5.631572725751323, "learning_rate": 4.6045991148623757e-05, "loss": 0.2197, "step": 4417 }, { "epoch": 7.023847376788553, "grad_norm": 2.2466828085930466, "learning_rate": 4.605062414840786e-05, "loss": 0.1778, "step": 4418 }, { "epoch": 7.02543720190779, "grad_norm": 5.6730956783342625, "learning_rate": 4.6055255100274785e-05, "loss": 0.2279, "step": 4419 }, { "epoch": 7.027027027027027, "grad_norm": 2.732934122086105, "learning_rate": 4.6059884002657114e-05, "loss": 0.1328, "step": 4420 }, { "epoch": 7.028616852146264, "grad_norm": 3.498541347769334, "learning_rate": 4.606451085398814e-05, "loss": 0.2121, "step": 4421 }, { "epoch": 7.030206677265501, "grad_norm": 3.753298087283855, "learning_rate": 4.606913565270183e-05, "loss": 0.2023, "step": 4422 }, { "epoch": 7.031796502384737, "grad_norm": 2.759505137271759, "learning_rate": 4.607375839723287e-05, "loss": 0.1741, "step": 4423 }, { "epoch": 7.033386327503974, "grad_norm": 2.844621855127885, "learning_rate": 4.607837908601662e-05, "loss": 0.1697, "step": 4424 }, { "epoch": 7.034976152623211, "grad_norm": 3.286500935932206, "learning_rate": 4.608299771748915e-05, "loss": 0.1745, "step": 4425 }, { "epoch": 7.036565977742448, "grad_norm": 3.9546466249966064, "learning_rate": 4.608761429008721e-05, "loss": 0.1229, "step": 4426 }, { "epoch": 7.038155802861685, "grad_norm": 2.8562797807706612, "learning_rate": 4.609222880224827e-05, "loss": 0.1414, "step": 4427 }, { "epoch": 7.039745627980922, "grad_norm": 2.8704343978626445, "learning_rate": 4.609684125241047e-05, "loss": 0.2424, "step": 4428 }, { "epoch": 7.041335453100159, "grad_norm": 3.4505018146096313, "learning_rate": 4.610145163901268e-05, "loss": 0.1262, "step": 4429 }, { "epoch": 7.042925278219396, "grad_norm": 2.9065501601682526, "learning_rate": 4.610605996049444e-05, "loss": 0.202, "step": 4430 }, { "epoch": 7.044515103338632, "grad_norm": 4.27505240120418, "learning_rate": 4.6110666215296e-05, "loss": 0.2304, "step": 4431 }, { "epoch": 7.046104928457869, "grad_norm": 3.3345193428045206, "learning_rate": 4.6115270401858316e-05, "loss": 0.1183, "step": 4432 }, { "epoch": 7.047694753577106, "grad_norm": 3.5790686366209923, "learning_rate": 4.611987251862303e-05, "loss": 0.1733, "step": 4433 }, { "epoch": 7.049284578696343, "grad_norm": 2.7048849995532307, "learning_rate": 4.6124472564032496e-05, "loss": 0.1658, "step": 4434 }, { "epoch": 7.05087440381558, "grad_norm": 1.5311640051696793, "learning_rate": 4.612907053652977e-05, "loss": 0.2297, "step": 4435 }, { "epoch": 7.052464228934817, "grad_norm": 2.5752891669948843, "learning_rate": 4.6133666434558594e-05, "loss": 0.2322, "step": 4436 }, { "epoch": 7.054054054054054, "grad_norm": 2.0000478096881205, "learning_rate": 4.613826025656343e-05, "loss": 0.1277, "step": 4437 }, { "epoch": 7.0556438791732905, "grad_norm": 4.319689695264964, "learning_rate": 4.614285200098943e-05, "loss": 0.101, "step": 4438 }, { "epoch": 7.0572337042925275, "grad_norm": 2.6696702845722027, "learning_rate": 4.614744166628247e-05, "loss": 0.1587, "step": 4439 }, { "epoch": 7.0588235294117645, "grad_norm": 3.25174830051645, "learning_rate": 4.61520292508891e-05, "loss": 0.1781, "step": 4440 }, { "epoch": 7.0604133545310015, "grad_norm": 2.4414248427287144, "learning_rate": 4.6156614753256584e-05, "loss": 0.141, "step": 4441 }, { "epoch": 7.0620031796502385, "grad_norm": 3.4884001010139745, "learning_rate": 4.616119817183291e-05, "loss": 0.1188, "step": 4442 }, { "epoch": 7.0635930047694755, "grad_norm": 4.099324516951509, "learning_rate": 4.616577950506675e-05, "loss": 0.2052, "step": 4443 }, { "epoch": 7.0651828298887125, "grad_norm": 2.225131594211299, "learning_rate": 4.617035875140749e-05, "loss": 0.1428, "step": 4444 }, { "epoch": 7.0667726550079495, "grad_norm": 3.0379355326249655, "learning_rate": 4.6174935909305224e-05, "loss": 0.1969, "step": 4445 }, { "epoch": 7.068362480127186, "grad_norm": 4.443941278791309, "learning_rate": 4.617951097721073e-05, "loss": 0.2175, "step": 4446 }, { "epoch": 7.069952305246423, "grad_norm": 3.699733690276889, "learning_rate": 4.618408395357554e-05, "loss": 0.1024, "step": 4447 }, { "epoch": 7.07154213036566, "grad_norm": 2.2239882767532873, "learning_rate": 4.618865483685186e-05, "loss": 0.1843, "step": 4448 }, { "epoch": 7.073131955484897, "grad_norm": 3.947541433472577, "learning_rate": 4.61932236254926e-05, "loss": 0.1952, "step": 4449 }, { "epoch": 7.074721780604134, "grad_norm": 1.2906295939423535, "learning_rate": 4.61977903179514e-05, "loss": 0.1855, "step": 4450 }, { "epoch": 7.076311605723371, "grad_norm": 3.364772687511871, "learning_rate": 4.6202354912682606e-05, "loss": 0.2123, "step": 4451 }, { "epoch": 7.077901430842608, "grad_norm": 6.664777331963277, "learning_rate": 4.6206917408141246e-05, "loss": 0.2366, "step": 4452 }, { "epoch": 7.079491255961845, "grad_norm": 2.919927563316208, "learning_rate": 4.6211477802783106e-05, "loss": 0.1788, "step": 4453 }, { "epoch": 7.081081081081081, "grad_norm": 5.613842693189876, "learning_rate": 4.621603609506465e-05, "loss": 0.2068, "step": 4454 }, { "epoch": 7.082670906200318, "grad_norm": 4.076631500072799, "learning_rate": 4.622059228344304e-05, "loss": 0.1968, "step": 4455 }, { "epoch": 7.084260731319555, "grad_norm": 8.450672796717233, "learning_rate": 4.62251463663762e-05, "loss": 0.2146, "step": 4456 }, { "epoch": 7.085850556438792, "grad_norm": 4.506870293920581, "learning_rate": 4.6229698342322724e-05, "loss": 0.2742, "step": 4457 }, { "epoch": 7.087440381558029, "grad_norm": 2.498264932644508, "learning_rate": 4.623424820974193e-05, "loss": 0.151, "step": 4458 }, { "epoch": 7.089030206677266, "grad_norm": 5.699029986574716, "learning_rate": 4.623879596709386e-05, "loss": 0.3973, "step": 4459 }, { "epoch": 7.090620031796503, "grad_norm": 5.582698358253199, "learning_rate": 4.6243341612839264e-05, "loss": 0.1182, "step": 4460 }, { "epoch": 7.09220985691574, "grad_norm": 5.686112855405954, "learning_rate": 4.62478851454396e-05, "loss": 0.1874, "step": 4461 }, { "epoch": 7.093799682034976, "grad_norm": 1.7910649715417066, "learning_rate": 4.6252426563357055e-05, "loss": 0.1296, "step": 4462 }, { "epoch": 7.095389507154213, "grad_norm": 2.2442840128536274, "learning_rate": 4.6256965865054514e-05, "loss": 0.1852, "step": 4463 }, { "epoch": 7.09697933227345, "grad_norm": 4.7837453557363965, "learning_rate": 4.626150304899559e-05, "loss": 0.2075, "step": 4464 }, { "epoch": 7.098569157392687, "grad_norm": 5.090094857483868, "learning_rate": 4.62660381136446e-05, "loss": 0.1595, "step": 4465 }, { "epoch": 7.100158982511924, "grad_norm": 3.0695592364630557, "learning_rate": 4.627057105746662e-05, "loss": 0.1393, "step": 4466 }, { "epoch": 7.101748807631161, "grad_norm": 3.472145957957069, "learning_rate": 4.627510187892738e-05, "loss": 0.1652, "step": 4467 }, { "epoch": 7.103338632750398, "grad_norm": 3.1978564098920783, "learning_rate": 4.627963057649338e-05, "loss": 0.2059, "step": 4468 }, { "epoch": 7.104928457869635, "grad_norm": 4.277511868676584, "learning_rate": 4.6284157148631814e-05, "loss": 0.2404, "step": 4469 }, { "epoch": 7.106518282988871, "grad_norm": 4.054286437909406, "learning_rate": 4.6288681593810595e-05, "loss": 0.2484, "step": 4470 }, { "epoch": 7.108108108108108, "grad_norm": 3.366165667185676, "learning_rate": 4.6293203910498376e-05, "loss": 0.1928, "step": 4471 }, { "epoch": 7.109697933227345, "grad_norm": 2.115767120476511, "learning_rate": 4.6297724097164506e-05, "loss": 0.1959, "step": 4472 }, { "epoch": 7.111287758346582, "grad_norm": 2.838743539691398, "learning_rate": 4.630224215227907e-05, "loss": 0.1633, "step": 4473 }, { "epoch": 7.112877583465819, "grad_norm": 4.781675279326575, "learning_rate": 4.630675807431286e-05, "loss": 0.1667, "step": 4474 }, { "epoch": 7.114467408585056, "grad_norm": 2.631361946208207, "learning_rate": 4.631127186173742e-05, "loss": 0.1937, "step": 4475 }, { "epoch": 7.116057233704293, "grad_norm": 2.9041738967228303, "learning_rate": 4.6315783513024974e-05, "loss": 0.1734, "step": 4476 }, { "epoch": 7.117647058823529, "grad_norm": 2.614016178727657, "learning_rate": 4.6320293026648516e-05, "loss": 0.2086, "step": 4477 }, { "epoch": 7.119236883942766, "grad_norm": 2.6491194761109194, "learning_rate": 4.632480040108171e-05, "loss": 0.146, "step": 4478 }, { "epoch": 7.120826709062003, "grad_norm": 2.972856423711915, "learning_rate": 4.6329305634799e-05, "loss": 0.213, "step": 4479 }, { "epoch": 7.12241653418124, "grad_norm": 2.0786906037732327, "learning_rate": 4.63338087262755e-05, "loss": 0.125, "step": 4480 }, { "epoch": 7.124006359300477, "grad_norm": 3.142548082285499, "learning_rate": 4.6338309673987106e-05, "loss": 0.1819, "step": 4481 }, { "epoch": 7.125596184419714, "grad_norm": 2.1055150817651214, "learning_rate": 4.634280847641039e-05, "loss": 0.1843, "step": 4482 }, { "epoch": 7.127186009538951, "grad_norm": 2.056976264873237, "learning_rate": 4.634730513202268e-05, "loss": 0.1224, "step": 4483 }, { "epoch": 7.128775834658188, "grad_norm": 2.2468688521273537, "learning_rate": 4.635179963930201e-05, "loss": 0.1819, "step": 4484 }, { "epoch": 7.130365659777424, "grad_norm": 2.0236323052429412, "learning_rate": 4.6356291996727166e-05, "loss": 0.2021, "step": 4485 }, { "epoch": 7.131955484896661, "grad_norm": 1.742066523207537, "learning_rate": 4.6360782202777644e-05, "loss": 0.1904, "step": 4486 }, { "epoch": 7.133545310015898, "grad_norm": 2.6685780084474047, "learning_rate": 4.636527025593366e-05, "loss": 0.144, "step": 4487 }, { "epoch": 7.135135135135135, "grad_norm": 2.0984672572286844, "learning_rate": 4.636975615467618e-05, "loss": 0.2154, "step": 4488 }, { "epoch": 7.136724960254372, "grad_norm": 2.1125565860310225, "learning_rate": 4.63742398974869e-05, "loss": 0.1504, "step": 4489 }, { "epoch": 7.138314785373609, "grad_norm": 1.0770710762230538, "learning_rate": 4.637872148284821e-05, "loss": 0.1163, "step": 4490 }, { "epoch": 7.139904610492846, "grad_norm": 2.0214139607607136, "learning_rate": 4.638320090924328e-05, "loss": 0.1236, "step": 4491 }, { "epoch": 7.141494435612083, "grad_norm": 2.133888777348414, "learning_rate": 4.638767817515598e-05, "loss": 0.1889, "step": 4492 }, { "epoch": 7.143084260731319, "grad_norm": 2.2862250273158886, "learning_rate": 4.639215327907091e-05, "loss": 0.1889, "step": 4493 }, { "epoch": 7.144674085850556, "grad_norm": 1.6071193960483956, "learning_rate": 4.639662621947341e-05, "loss": 0.1834, "step": 4494 }, { "epoch": 7.146263910969793, "grad_norm": 2.48355387305992, "learning_rate": 4.6401096994849556e-05, "loss": 0.2011, "step": 4495 }, { "epoch": 7.14785373608903, "grad_norm": 2.757952979321054, "learning_rate": 4.6405565603686154e-05, "loss": 0.1253, "step": 4496 }, { "epoch": 7.149443561208267, "grad_norm": 2.0412122511600232, "learning_rate": 4.641003204447073e-05, "loss": 0.2129, "step": 4497 }, { "epoch": 7.151033386327504, "grad_norm": 2.3407493754177686, "learning_rate": 4.641449631569158e-05, "loss": 0.2297, "step": 4498 }, { "epoch": 7.152623211446741, "grad_norm": 2.243899694451982, "learning_rate": 4.641895841583769e-05, "loss": 0.154, "step": 4499 }, { "epoch": 7.154213036565977, "grad_norm": 2.0682966870310544, "learning_rate": 4.64234183433988e-05, "loss": 0.1617, "step": 4500 }, { "epoch": 7.155802861685214, "grad_norm": 2.875190177929625, "learning_rate": 4.64278760968654e-05, "loss": 0.2694, "step": 4501 }, { "epoch": 7.157392686804451, "grad_norm": 1.4901338481348805, "learning_rate": 4.643233167472868e-05, "loss": 0.1428, "step": 4502 }, { "epoch": 7.158982511923688, "grad_norm": 2.6961165905314775, "learning_rate": 4.6436785075480605e-05, "loss": 0.2041, "step": 4503 }, { "epoch": 7.160572337042925, "grad_norm": 1.6903660025169973, "learning_rate": 4.6441236297613866e-05, "loss": 0.2268, "step": 4504 }, { "epoch": 7.162162162162162, "grad_norm": 3.356744206964435, "learning_rate": 4.6445685339621865e-05, "loss": 0.178, "step": 4505 }, { "epoch": 7.163751987281399, "grad_norm": 1.752625616973835, "learning_rate": 4.645013219999878e-05, "loss": 0.1615, "step": 4506 }, { "epoch": 7.165341812400636, "grad_norm": 4.335386723596389, "learning_rate": 4.645457687723951e-05, "loss": 0.3305, "step": 4507 }, { "epoch": 7.166931637519872, "grad_norm": 3.124655167018783, "learning_rate": 4.645901936983968e-05, "loss": 0.1385, "step": 4508 }, { "epoch": 7.168521462639109, "grad_norm": 2.3954837278173526, "learning_rate": 4.646345967629567e-05, "loss": 0.2005, "step": 4509 }, { "epoch": 7.170111287758346, "grad_norm": 3.2038118335673436, "learning_rate": 4.64678977951046e-05, "loss": 0.1328, "step": 4510 }, { "epoch": 7.171701112877583, "grad_norm": 5.0895508949764645, "learning_rate": 4.647233372476433e-05, "loss": 0.228, "step": 4511 }, { "epoch": 7.17329093799682, "grad_norm": 1.5332138122199175, "learning_rate": 4.647676746377345e-05, "loss": 0.1768, "step": 4512 }, { "epoch": 7.174880763116057, "grad_norm": 5.624882137856937, "learning_rate": 4.648119901063131e-05, "loss": 0.1855, "step": 4513 }, { "epoch": 7.176470588235294, "grad_norm": 3.313121846398208, "learning_rate": 4.6485628363837986e-05, "loss": 0.1637, "step": 4514 }, { "epoch": 7.178060413354531, "grad_norm": 3.757883602821559, "learning_rate": 4.64900555218943e-05, "loss": 0.2054, "step": 4515 }, { "epoch": 7.1796502384737675, "grad_norm": 4.132407897837306, "learning_rate": 4.649448048330183e-05, "loss": 0.2097, "step": 4516 }, { "epoch": 7.1812400635930045, "grad_norm": 4.357569013118494, "learning_rate": 4.6498903246562886e-05, "loss": 0.1245, "step": 4517 }, { "epoch": 7.1828298887122415, "grad_norm": 3.976686184216191, "learning_rate": 4.650332381018051e-05, "loss": 0.2748, "step": 4518 }, { "epoch": 7.1844197138314785, "grad_norm": 4.293871509086424, "learning_rate": 4.650774217265851e-05, "loss": 0.1487, "step": 4519 }, { "epoch": 7.1860095389507155, "grad_norm": 3.5536632924492566, "learning_rate": 4.6512158332501425e-05, "loss": 0.1902, "step": 4520 }, { "epoch": 7.1875993640699525, "grad_norm": 1.7266418325776576, "learning_rate": 4.651657228821455e-05, "loss": 0.1493, "step": 4521 }, { "epoch": 7.1891891891891895, "grad_norm": 4.474490394279598, "learning_rate": 4.652098403830393e-05, "loss": 0.2612, "step": 4522 }, { "epoch": 7.1907790143084265, "grad_norm": 4.417410224420178, "learning_rate": 4.652539358127632e-05, "loss": 0.1635, "step": 4523 }, { "epoch": 7.192368839427663, "grad_norm": 2.4883423154002555, "learning_rate": 4.652980091563927e-05, "loss": 0.2763, "step": 4524 }, { "epoch": 7.1939586645469, "grad_norm": 29.478519091687346, "learning_rate": 4.653420603990106e-05, "loss": 7.5321, "step": 4525 }, { "epoch": 7.195548489666137, "grad_norm": 2.4686447863051715, "learning_rate": 4.65386089525707e-05, "loss": 0.2099, "step": 4526 }, { "epoch": 7.197138314785374, "grad_norm": 1.6642467885580376, "learning_rate": 4.654300965215797e-05, "loss": 0.0947, "step": 4527 }, { "epoch": 7.198728139904611, "grad_norm": 2.758781500507689, "learning_rate": 4.6547408137173404e-05, "loss": 0.258, "step": 4528 }, { "epoch": 7.200317965023848, "grad_norm": 2.781808904715662, "learning_rate": 4.655180440612825e-05, "loss": 0.1615, "step": 4529 }, { "epoch": 7.201907790143085, "grad_norm": 3.981821797076212, "learning_rate": 4.655619845753456e-05, "loss": 0.1963, "step": 4530 }, { "epoch": 7.203497615262322, "grad_norm": 2.0165405847180686, "learning_rate": 4.656059028990507e-05, "loss": 0.187, "step": 4531 }, { "epoch": 7.205087440381558, "grad_norm": 4.24364445646243, "learning_rate": 4.6564979901753344e-05, "loss": 0.157, "step": 4532 }, { "epoch": 7.206677265500795, "grad_norm": 2.904758145169337, "learning_rate": 4.6569367291593624e-05, "loss": 0.2314, "step": 4533 }, { "epoch": 7.208267090620032, "grad_norm": 2.565064631691079, "learning_rate": 4.657375245794096e-05, "loss": 0.1332, "step": 4534 }, { "epoch": 7.209856915739269, "grad_norm": 3.683076427285052, "learning_rate": 4.6578135399311114e-05, "loss": 0.2722, "step": 4535 }, { "epoch": 7.211446740858506, "grad_norm": 54.16715047219509, "learning_rate": 4.658251611422064e-05, "loss": 3.4737, "step": 4536 }, { "epoch": 7.213036565977743, "grad_norm": 3.10510466285117, "learning_rate": 4.658689460118681e-05, "loss": 0.154, "step": 4537 }, { "epoch": 7.21462639109698, "grad_norm": 2.5116271294917323, "learning_rate": 4.659127085872766e-05, "loss": 0.1641, "step": 4538 }, { "epoch": 7.216216216216216, "grad_norm": 48.16702850486449, "learning_rate": 4.6595644885362e-05, "loss": 5.3855, "step": 4539 }, { "epoch": 7.217806041335453, "grad_norm": 3.501931745885533, "learning_rate": 4.660001667960937e-05, "loss": 0.2109, "step": 4540 }, { "epoch": 7.21939586645469, "grad_norm": 2.3998930765295463, "learning_rate": 4.6604386239990074e-05, "loss": 0.2027, "step": 4541 }, { "epoch": 7.220985691573927, "grad_norm": 3.6958479719269293, "learning_rate": 4.660875356502519e-05, "loss": 0.2279, "step": 4542 }, { "epoch": 7.222575516693164, "grad_norm": 2.344503837369767, "learning_rate": 4.661311865323652e-05, "loss": 0.2033, "step": 4543 }, { "epoch": 7.224165341812401, "grad_norm": 1.5747479415327035, "learning_rate": 4.6617481503146644e-05, "loss": 0.1453, "step": 4544 }, { "epoch": 7.225755166931638, "grad_norm": 2.615810965808244, "learning_rate": 4.6621842113278896e-05, "loss": 0.1692, "step": 4545 }, { "epoch": 7.227344992050875, "grad_norm": 3.511344066573996, "learning_rate": 4.6626200482157375e-05, "loss": 0.1524, "step": 4546 }, { "epoch": 7.228934817170111, "grad_norm": 1.9898296669332898, "learning_rate": 4.663055660830692e-05, "loss": 0.1211, "step": 4547 }, { "epoch": 7.230524642289348, "grad_norm": 3.318262980739692, "learning_rate": 4.6634910490253146e-05, "loss": 0.1752, "step": 4548 }, { "epoch": 7.232114467408585, "grad_norm": 46.65507460527682, "learning_rate": 4.6639262126522425e-05, "loss": 2.5714, "step": 4549 }, { "epoch": 7.233704292527822, "grad_norm": 3.099797153453075, "learning_rate": 4.664361151564186e-05, "loss": 0.1549, "step": 4550 }, { "epoch": 7.235294117647059, "grad_norm": 1.9333214538393355, "learning_rate": 4.6647958656139385e-05, "loss": 0.1059, "step": 4551 }, { "epoch": 7.236883942766296, "grad_norm": 2.518635857167061, "learning_rate": 4.6652303546543614e-05, "loss": 0.1104, "step": 4552 }, { "epoch": 7.238473767885533, "grad_norm": 1.998129639297619, "learning_rate": 4.6656646185383964e-05, "loss": 0.1105, "step": 4553 }, { "epoch": 7.24006359300477, "grad_norm": 2.034320725694436, "learning_rate": 4.6660986571190625e-05, "loss": 0.165, "step": 4554 }, { "epoch": 7.241653418124006, "grad_norm": 2.525889972094502, "learning_rate": 4.666532470249453e-05, "loss": 0.2118, "step": 4555 }, { "epoch": 7.243243243243243, "grad_norm": 3.1451120278009235, "learning_rate": 4.666966057782736e-05, "loss": 0.2127, "step": 4556 }, { "epoch": 7.24483306836248, "grad_norm": 2.1200454228492838, "learning_rate": 4.66739941957216e-05, "loss": 0.177, "step": 4557 }, { "epoch": 7.246422893481717, "grad_norm": 2.1571405463157487, "learning_rate": 4.6678325554710464e-05, "loss": 0.2235, "step": 4558 }, { "epoch": 7.248012718600954, "grad_norm": 1.766101336864548, "learning_rate": 4.668265465332796e-05, "loss": 0.1308, "step": 4559 }, { "epoch": 7.249602543720191, "grad_norm": 1.4953758259828855, "learning_rate": 4.6686981490108825e-05, "loss": 0.171, "step": 4560 }, { "epoch": 7.251192368839428, "grad_norm": 2.3193787160007115, "learning_rate": 4.669130606358858e-05, "loss": 0.1828, "step": 4561 }, { "epoch": 7.252782193958664, "grad_norm": 1.7040094721142731, "learning_rate": 4.669562837230354e-05, "loss": 0.1829, "step": 4562 }, { "epoch": 7.254372019077901, "grad_norm": 2.4303965512409773, "learning_rate": 4.6699948414790734e-05, "loss": 0.1362, "step": 4563 }, { "epoch": 7.255961844197138, "grad_norm": 1.8243774351961926, "learning_rate": 4.670426618958799e-05, "loss": 0.1051, "step": 4564 }, { "epoch": 7.257551669316375, "grad_norm": 1.9669435625129836, "learning_rate": 4.670858169523391e-05, "loss": 0.156, "step": 4565 }, { "epoch": 7.259141494435612, "grad_norm": 3.0069495962431123, "learning_rate": 4.671289493026784e-05, "loss": 0.1728, "step": 4566 }, { "epoch": 7.260731319554849, "grad_norm": 2.1773407115075503, "learning_rate": 4.67172058932299e-05, "loss": 0.1501, "step": 4567 }, { "epoch": 7.262321144674086, "grad_norm": 1.9622327169783964, "learning_rate": 4.672151458266101e-05, "loss": 0.1545, "step": 4568 }, { "epoch": 7.263910969793323, "grad_norm": 2.1373689819462247, "learning_rate": 4.6725820997102805e-05, "loss": 0.2674, "step": 4569 }, { "epoch": 7.26550079491256, "grad_norm": 1.5441598754945203, "learning_rate": 4.6730125135097733e-05, "loss": 0.163, "step": 4570 }, { "epoch": 7.267090620031796, "grad_norm": 2.4432448792908215, "learning_rate": 4.673442699518901e-05, "loss": 0.1452, "step": 4571 }, { "epoch": 7.268680445151033, "grad_norm": 3.015236339153733, "learning_rate": 4.673872657592059e-05, "loss": 0.207, "step": 4572 }, { "epoch": 7.27027027027027, "grad_norm": 4.251574198317745, "learning_rate": 4.674302387583724e-05, "loss": 0.2539, "step": 4573 }, { "epoch": 7.271860095389507, "grad_norm": 3.0047200768079905, "learning_rate": 4.674731889348446e-05, "loss": 0.1247, "step": 4574 }, { "epoch": 7.273449920508744, "grad_norm": 3.418098328142938, "learning_rate": 4.6751611627408564e-05, "loss": 0.2613, "step": 4575 }, { "epoch": 7.275039745627981, "grad_norm": 1.5499181878844996, "learning_rate": 4.6755902076156606e-05, "loss": 0.1251, "step": 4576 }, { "epoch": 7.276629570747218, "grad_norm": 4.368871661218968, "learning_rate": 4.6760190238276425e-05, "loss": 0.1487, "step": 4577 }, { "epoch": 7.278219395866454, "grad_norm": 1.592053376494794, "learning_rate": 4.676447611231663e-05, "loss": 0.1436, "step": 4578 }, { "epoch": 7.279809220985691, "grad_norm": 3.0537116890969402, "learning_rate": 4.676875969682661e-05, "loss": 0.1261, "step": 4579 }, { "epoch": 7.281399046104928, "grad_norm": 4.222513577643903, "learning_rate": 4.677304099035653e-05, "loss": 0.2222, "step": 4580 }, { "epoch": 7.282988871224165, "grad_norm": 2.1954841909472114, "learning_rate": 4.6777319991457325e-05, "loss": 0.1677, "step": 4581 }, { "epoch": 7.284578696343402, "grad_norm": 3.9065208069911943, "learning_rate": 4.6781596698680705e-05, "loss": 0.1077, "step": 4582 }, { "epoch": 7.286168521462639, "grad_norm": 2.830472452397222, "learning_rate": 4.6785871110579165e-05, "loss": 0.1731, "step": 4583 }, { "epoch": 7.287758346581876, "grad_norm": 2.1980108570697316, "learning_rate": 4.679014322570597e-05, "loss": 0.183, "step": 4584 }, { "epoch": 7.289348171701113, "grad_norm": 2.4333176520245265, "learning_rate": 4.6794413042615165e-05, "loss": 0.1908, "step": 4585 }, { "epoch": 7.290937996820349, "grad_norm": 3.23592945826406, "learning_rate": 4.6798680559861566e-05, "loss": 0.1674, "step": 4586 }, { "epoch": 7.292527821939586, "grad_norm": 1.4252833125469218, "learning_rate": 4.680294577600078e-05, "loss": 0.1904, "step": 4587 }, { "epoch": 7.294117647058823, "grad_norm": 1.6806088353766642, "learning_rate": 4.680720868958918e-05, "loss": 0.1284, "step": 4588 }, { "epoch": 7.29570747217806, "grad_norm": 91.5272230814426, "learning_rate": 4.681146929918392e-05, "loss": 6.4066, "step": 4589 }, { "epoch": 7.297297297297297, "grad_norm": 2.4930198078774057, "learning_rate": 4.681572760334296e-05, "loss": 0.1557, "step": 4590 }, { "epoch": 7.298887122416534, "grad_norm": 2.914012020511383, "learning_rate": 4.681998360062499e-05, "loss": 0.1534, "step": 4591 }, { "epoch": 7.300476947535771, "grad_norm": 1.3041471798428454, "learning_rate": 4.6824237289589525e-05, "loss": 0.1732, "step": 4592 }, { "epoch": 7.302066772655008, "grad_norm": 2.209648591398421, "learning_rate": 4.682848866879683e-05, "loss": 0.1564, "step": 4593 }, { "epoch": 7.3036565977742445, "grad_norm": 2.7413448973005368, "learning_rate": 4.6832737736808e-05, "loss": 0.2145, "step": 4594 }, { "epoch": 7.3052464228934815, "grad_norm": 1.8186951921757886, "learning_rate": 4.683698449218484e-05, "loss": 0.1396, "step": 4595 }, { "epoch": 7.3068362480127185, "grad_norm": 63.057714270396716, "learning_rate": 4.6841228933490005e-05, "loss": 2.6173, "step": 4596 }, { "epoch": 7.3084260731319555, "grad_norm": 61.49505222251891, "learning_rate": 4.684547105928689e-05, "loss": 7.8309, "step": 4597 }, { "epoch": 7.3100158982511925, "grad_norm": 228.71112477038872, "learning_rate": 4.6849710868139694e-05, "loss": 19.6757, "step": 4598 }, { "epoch": 7.3116057233704295, "grad_norm": 4.250878005395424, "learning_rate": 4.6853948358613394e-05, "loss": 0.1567, "step": 4599 }, { "epoch": 7.3131955484896665, "grad_norm": 15.046421577595163, "learning_rate": 4.6858183529273767e-05, "loss": 1.7316, "step": 4600 }, { "epoch": 7.314785373608903, "grad_norm": 1.5836214299328233, "learning_rate": 4.686241637868734e-05, "loss": 0.1196, "step": 4601 }, { "epoch": 7.31637519872814, "grad_norm": 2.228720962393585, "learning_rate": 4.686664690542145e-05, "loss": 0.1423, "step": 4602 }, { "epoch": 7.317965023847377, "grad_norm": 1.7984626796609704, "learning_rate": 4.687087510804423e-05, "loss": 0.159, "step": 4603 }, { "epoch": 7.319554848966614, "grad_norm": 2.2236302397086654, "learning_rate": 4.687510098512458e-05, "loss": 0.1883, "step": 4604 }, { "epoch": 7.321144674085851, "grad_norm": 1.9213632995119332, "learning_rate": 4.687932453523219e-05, "loss": 0.171, "step": 4605 }, { "epoch": 7.322734499205088, "grad_norm": 2.66377980090811, "learning_rate": 4.6883545756937545e-05, "loss": 0.1901, "step": 4606 }, { "epoch": 7.324324324324325, "grad_norm": 1.68682227718761, "learning_rate": 4.6887764648811906e-05, "loss": 0.1156, "step": 4607 }, { "epoch": 7.325914149443562, "grad_norm": 1.5477636587500019, "learning_rate": 4.6891981209427343e-05, "loss": 0.1623, "step": 4608 }, { "epoch": 7.327503974562799, "grad_norm": 2.77993159213339, "learning_rate": 4.689619543735671e-05, "loss": 0.2013, "step": 4609 }, { "epoch": 7.329093799682035, "grad_norm": 2.2536050789889392, "learning_rate": 4.690040733117361e-05, "loss": 0.1772, "step": 4610 }, { "epoch": 7.330683624801272, "grad_norm": 1.6287876871319098, "learning_rate": 4.69046168894525e-05, "loss": 0.2, "step": 4611 }, { "epoch": 7.332273449920509, "grad_norm": 1.8028030178081373, "learning_rate": 4.6908824110768585e-05, "loss": 0.1228, "step": 4612 }, { "epoch": 7.333863275039746, "grad_norm": 1826.6907209016326, "learning_rate": 4.691302899369788e-05, "loss": 5.8109, "step": 4613 }, { "epoch": 7.335453100158983, "grad_norm": 834.6858736953902, "learning_rate": 4.6917231536817176e-05, "loss": 7.4766, "step": 4614 }, { "epoch": 7.33704292527822, "grad_norm": 2.200708696499558, "learning_rate": 4.692143173870407e-05, "loss": 0.1707, "step": 4615 }, { "epoch": 7.338632750397457, "grad_norm": 2.5438678070580107, "learning_rate": 4.692562959793694e-05, "loss": 0.1418, "step": 4616 }, { "epoch": 7.340222575516693, "grad_norm": 2.653523601142218, "learning_rate": 4.692982511309498e-05, "loss": 0.2261, "step": 4617 }, { "epoch": 7.34181240063593, "grad_norm": 1.9092335906747837, "learning_rate": 4.6934018282758135e-05, "loss": 0.1243, "step": 4618 }, { "epoch": 7.343402225755167, "grad_norm": 2.6153331513227958, "learning_rate": 4.6938209105507185e-05, "loss": 0.1641, "step": 4619 }, { "epoch": 7.344992050874404, "grad_norm": 3.342657781777808, "learning_rate": 4.694239757992368e-05, "loss": 0.2476, "step": 4620 }, { "epoch": 7.346581875993641, "grad_norm": 2.1995604934121284, "learning_rate": 4.6946583704589973e-05, "loss": 0.1753, "step": 4621 }, { "epoch": 7.348171701112878, "grad_norm": 3.7877139236335178, "learning_rate": 4.695076747808923e-05, "loss": 0.252, "step": 4622 }, { "epoch": 7.349761526232115, "grad_norm": 5.11658136382788, "learning_rate": 4.695494889900536e-05, "loss": 0.1697, "step": 4623 }, { "epoch": 7.351351351351352, "grad_norm": 3.333462480028068, "learning_rate": 4.6959127965923145e-05, "loss": 0.1513, "step": 4624 }, { "epoch": 7.352941176470588, "grad_norm": 3.442470478617867, "learning_rate": 4.69633046774281e-05, "loss": 0.2073, "step": 4625 }, { "epoch": 7.354531001589825, "grad_norm": 2.153235332540995, "learning_rate": 4.696747903210655e-05, "loss": 0.1435, "step": 4626 }, { "epoch": 7.356120826709062, "grad_norm": 3.7531558836767203, "learning_rate": 4.697165102854565e-05, "loss": 0.1764, "step": 4627 }, { "epoch": 7.357710651828299, "grad_norm": 1.905212809648395, "learning_rate": 4.6975820665333314e-05, "loss": 0.1972, "step": 4628 }, { "epoch": 7.359300476947536, "grad_norm": 3.1282838603787813, "learning_rate": 4.697998794105827e-05, "loss": 0.214, "step": 4629 }, { "epoch": 7.360890302066773, "grad_norm": 3.986941257135869, "learning_rate": 4.6984152854310057e-05, "loss": 0.1663, "step": 4630 }, { "epoch": 7.36248012718601, "grad_norm": 2.5789731102464764, "learning_rate": 4.6988315403679e-05, "loss": 0.2061, "step": 4631 }, { "epoch": 7.364069952305247, "grad_norm": 3.478378498736349, "learning_rate": 4.699247558775622e-05, "loss": 0.1679, "step": 4632 }, { "epoch": 7.365659777424483, "grad_norm": 3.1052506188008584, "learning_rate": 4.6996633405133657e-05, "loss": 0.1934, "step": 4633 }, { "epoch": 7.36724960254372, "grad_norm": 1.836535369394707, "learning_rate": 4.7000788854404024e-05, "loss": 0.1763, "step": 4634 }, { "epoch": 7.368839427662957, "grad_norm": 2.9097823684859665, "learning_rate": 4.700494193416087e-05, "loss": 0.2084, "step": 4635 }, { "epoch": 7.370429252782194, "grad_norm": 3.8700946065124473, "learning_rate": 4.7009092642998514e-05, "loss": 0.1972, "step": 4636 }, { "epoch": 7.372019077901431, "grad_norm": 4.103456867416233, "learning_rate": 4.7013240979512094e-05, "loss": 0.2149, "step": 4637 }, { "epoch": 7.373608903020668, "grad_norm": 2.7190520942942986, "learning_rate": 4.701738694229755e-05, "loss": 0.1682, "step": 4638 }, { "epoch": 7.375198728139905, "grad_norm": 3.456693748029552, "learning_rate": 4.702153052995163e-05, "loss": 0.1748, "step": 4639 }, { "epoch": 7.376788553259141, "grad_norm": 3.123671088805879, "learning_rate": 4.702567174107186e-05, "loss": 0.1895, "step": 4640 }, { "epoch": 7.378378378378378, "grad_norm": 3.06799377971188, "learning_rate": 4.702981057425662e-05, "loss": 0.161, "step": 4641 }, { "epoch": 7.379968203497615, "grad_norm": 3.1111318143528375, "learning_rate": 4.703394702810504e-05, "loss": 0.1696, "step": 4642 }, { "epoch": 7.381558028616852, "grad_norm": 3.938155534936393, "learning_rate": 4.703808110121709e-05, "loss": 0.1665, "step": 4643 }, { "epoch": 7.383147853736089, "grad_norm": 4.789806348871144, "learning_rate": 4.7042212792193535e-05, "loss": 0.2065, "step": 4644 }, { "epoch": 7.384737678855326, "grad_norm": 4.330853948569379, "learning_rate": 4.704634209963595e-05, "loss": 0.2924, "step": 4645 }, { "epoch": 7.386327503974563, "grad_norm": 4.276456178120317, "learning_rate": 4.705046902214671e-05, "loss": 0.2048, "step": 4646 }, { "epoch": 7.3879173290938, "grad_norm": 4.8027921699504255, "learning_rate": 4.705459355832899e-05, "loss": 0.2169, "step": 4647 }, { "epoch": 7.389507154213036, "grad_norm": 3.1552031406340415, "learning_rate": 4.705871570678681e-05, "loss": 0.1258, "step": 4648 }, { "epoch": 7.391096979332273, "grad_norm": 1.5820221026350458, "learning_rate": 4.706283546612496e-05, "loss": 0.1533, "step": 4649 }, { "epoch": 7.39268680445151, "grad_norm": 3.5949954804990085, "learning_rate": 4.7066952834949044e-05, "loss": 0.2635, "step": 4650 }, { "epoch": 7.394276629570747, "grad_norm": 2.435206769591396, "learning_rate": 4.7071067811865475e-05, "loss": 0.2243, "step": 4651 }, { "epoch": 7.395866454689984, "grad_norm": 2.0229113379514305, "learning_rate": 4.7075180395481504e-05, "loss": 0.1718, "step": 4652 }, { "epoch": 7.397456279809221, "grad_norm": 3.3185192368035623, "learning_rate": 4.707929058440516e-05, "loss": 0.1916, "step": 4653 }, { "epoch": 7.399046104928458, "grad_norm": 4.629987550376275, "learning_rate": 4.708339837724529e-05, "loss": 0.1829, "step": 4654 }, { "epoch": 7.400635930047695, "grad_norm": 2.625168033516052, "learning_rate": 4.7087503772611556e-05, "loss": 0.1833, "step": 4655 }, { "epoch": 7.402225755166931, "grad_norm": 2.6160258975796093, "learning_rate": 4.709160676911444e-05, "loss": 0.1739, "step": 4656 }, { "epoch": 7.403815580286168, "grad_norm": 2.010846845540932, "learning_rate": 4.709570736536521e-05, "loss": 0.1583, "step": 4657 }, { "epoch": 7.405405405405405, "grad_norm": 3.6716892066463918, "learning_rate": 4.7099805559975975e-05, "loss": 0.1493, "step": 4658 }, { "epoch": 7.406995230524642, "grad_norm": 2.993739993523424, "learning_rate": 4.710390135155964e-05, "loss": 0.145, "step": 4659 }, { "epoch": 7.408585055643879, "grad_norm": 3.5255149338371963, "learning_rate": 4.7107994738729926e-05, "loss": 0.1443, "step": 4660 }, { "epoch": 7.410174880763116, "grad_norm": 2.947498540360932, "learning_rate": 4.711208572010137e-05, "loss": 0.1519, "step": 4661 }, { "epoch": 7.411764705882353, "grad_norm": 1.4237328886242548, "learning_rate": 4.7116174294289336e-05, "loss": 0.1358, "step": 4662 }, { "epoch": 7.413354531001589, "grad_norm": 2.2217210986354923, "learning_rate": 4.712026045990997e-05, "loss": 0.1626, "step": 4663 }, { "epoch": 7.414944356120826, "grad_norm": 1.7180844920916618, "learning_rate": 4.712434421558026e-05, "loss": 0.1533, "step": 4664 }, { "epoch": 7.416534181240063, "grad_norm": 4.908859392498175, "learning_rate": 4.712842555991801e-05, "loss": 0.1947, "step": 4665 }, { "epoch": 7.4181240063593, "grad_norm": 4.269436482654356, "learning_rate": 4.713250449154181e-05, "loss": 0.157, "step": 4666 }, { "epoch": 7.419713831478537, "grad_norm": 3.124987618485747, "learning_rate": 4.7136581009071127e-05, "loss": 0.1687, "step": 4667 }, { "epoch": 7.421303656597774, "grad_norm": 1.514799967021865, "learning_rate": 4.714065511112618e-05, "loss": 0.129, "step": 4668 }, { "epoch": 7.422893481717011, "grad_norm": 3.3932292283602203, "learning_rate": 4.714472679632803e-05, "loss": 0.1591, "step": 4669 }, { "epoch": 7.424483306836248, "grad_norm": 2.6906369998068262, "learning_rate": 4.714879606329858e-05, "loss": 0.1978, "step": 4670 }, { "epoch": 7.426073131955485, "grad_norm": 2.934327035483828, "learning_rate": 4.7152862910660514e-05, "loss": 0.1479, "step": 4671 }, { "epoch": 7.4276629570747215, "grad_norm": 3.9816725066451153, "learning_rate": 4.715692733703736e-05, "loss": 0.2408, "step": 4672 }, { "epoch": 7.4292527821939585, "grad_norm": 2.3343490397692452, "learning_rate": 4.7160989341053453e-05, "loss": 0.1978, "step": 4673 }, { "epoch": 7.4308426073131955, "grad_norm": 2.227151952114795, "learning_rate": 4.716504892133394e-05, "loss": 0.1706, "step": 4674 }, { "epoch": 7.4324324324324325, "grad_norm": 2.682389220129188, "learning_rate": 4.716910607650483e-05, "loss": 0.1567, "step": 4675 }, { "epoch": 7.4340222575516695, "grad_norm": 3.175203996554223, "learning_rate": 4.7173160805192896e-05, "loss": 0.1393, "step": 4676 }, { "epoch": 7.4356120826709065, "grad_norm": 2.004516905179918, "learning_rate": 4.7177213106025765e-05, "loss": 0.1854, "step": 4677 }, { "epoch": 7.4372019077901435, "grad_norm": 3.7459321604305007, "learning_rate": 4.718126297763189e-05, "loss": 0.1993, "step": 4678 }, { "epoch": 7.43879173290938, "grad_norm": 1.2376547117061463, "learning_rate": 4.718531041864052e-05, "loss": 0.1592, "step": 4679 }, { "epoch": 7.440381558028617, "grad_norm": 2.280954771931182, "learning_rate": 4.7189355427681764e-05, "loss": 0.1661, "step": 4680 }, { "epoch": 7.441971383147854, "grad_norm": 1.5612257727041825, "learning_rate": 4.7193398003386515e-05, "loss": 0.1231, "step": 4681 }, { "epoch": 7.443561208267091, "grad_norm": 4.951074546996653, "learning_rate": 4.719743814438651e-05, "loss": 0.1471, "step": 4682 }, { "epoch": 7.4451510333863276, "grad_norm": 3.537055110729649, "learning_rate": 4.720147584931431e-05, "loss": 0.1275, "step": 4683 }, { "epoch": 7.4467408585055646, "grad_norm": 4.894256462561929, "learning_rate": 4.7205511116803306e-05, "loss": 0.1753, "step": 4684 }, { "epoch": 7.4483306836248016, "grad_norm": 1.8203135128724133, "learning_rate": 4.720954394548769e-05, "loss": 0.1668, "step": 4685 }, { "epoch": 7.4499205087440385, "grad_norm": 5.084558279767693, "learning_rate": 4.721357433400251e-05, "loss": 0.2477, "step": 4686 }, { "epoch": 7.451510333863275, "grad_norm": 5.075150722843725, "learning_rate": 4.721760228098362e-05, "loss": 0.113, "step": 4687 }, { "epoch": 7.453100158982512, "grad_norm": 2.0579198230509945, "learning_rate": 4.722162778506771e-05, "loss": 0.1378, "step": 4688 }, { "epoch": 7.454689984101749, "grad_norm": 5.912397767805827, "learning_rate": 4.7225650844892286e-05, "loss": 0.1791, "step": 4689 }, { "epoch": 7.456279809220986, "grad_norm": 5.157151610608219, "learning_rate": 4.7229671459095686e-05, "loss": 0.146, "step": 4690 }, { "epoch": 7.457869634340223, "grad_norm": 3.535675623894268, "learning_rate": 4.723368962631708e-05, "loss": 0.1324, "step": 4691 }, { "epoch": 7.45945945945946, "grad_norm": 6.338322565965297, "learning_rate": 4.723770534519647e-05, "loss": 0.3028, "step": 4692 }, { "epoch": 7.461049284578697, "grad_norm": 2.79837048647265, "learning_rate": 4.7241718614374675e-05, "loss": 0.3088, "step": 4693 }, { "epoch": 7.462639109697934, "grad_norm": 3.364948468503576, "learning_rate": 4.7245729432493356e-05, "loss": 0.1631, "step": 4694 }, { "epoch": 7.46422893481717, "grad_norm": 3.3913954708455063, "learning_rate": 4.7249737798194976e-05, "loss": 0.1876, "step": 4695 }, { "epoch": 7.465818759936407, "grad_norm": 5.8944704318496886, "learning_rate": 4.725374371012288e-05, "loss": 0.1803, "step": 4696 }, { "epoch": 7.467408585055644, "grad_norm": 3.095669997474016, "learning_rate": 4.7257747166921187e-05, "loss": 0.1588, "step": 4697 }, { "epoch": 7.468998410174881, "grad_norm": 4.7883787985428405, "learning_rate": 4.726174816723488e-05, "loss": 0.1656, "step": 4698 }, { "epoch": 7.470588235294118, "grad_norm": 3.965217845987796, "learning_rate": 4.726574670970976e-05, "loss": 0.1819, "step": 4699 }, { "epoch": 7.472178060413355, "grad_norm": 3.2236672212096815, "learning_rate": 4.7269742792992476e-05, "loss": 0.1332, "step": 4700 }, { "epoch": 7.473767885532592, "grad_norm": 2.2609844250944664, "learning_rate": 4.727373641573049e-05, "loss": 0.1736, "step": 4701 }, { "epoch": 7.475357710651828, "grad_norm": 4.769516491322302, "learning_rate": 4.7277727576572105e-05, "loss": 0.1199, "step": 4702 }, { "epoch": 7.476947535771065, "grad_norm": 3.337869102149578, "learning_rate": 4.728171627416647e-05, "loss": 0.1126, "step": 4703 }, { "epoch": 7.478537360890302, "grad_norm": 2.1821135727877246, "learning_rate": 4.728570250716353e-05, "loss": 0.1385, "step": 4704 }, { "epoch": 7.480127186009539, "grad_norm": 3.9755841187792282, "learning_rate": 4.728968627421412e-05, "loss": 0.1332, "step": 4705 }, { "epoch": 7.481717011128776, "grad_norm": 3.6358439431351597, "learning_rate": 4.729366757396986e-05, "loss": 0.1671, "step": 4706 }, { "epoch": 7.483306836248013, "grad_norm": 750.0141026126608, "learning_rate": 4.729764640508322e-05, "loss": 13.4396, "step": 4707 }, { "epoch": 7.48489666136725, "grad_norm": 2.5082252208984603, "learning_rate": 4.730162276620753e-05, "loss": 0.1515, "step": 4708 }, { "epoch": 7.486486486486487, "grad_norm": 3.849322461922561, "learning_rate": 4.7305596655996916e-05, "loss": 0.3028, "step": 4709 }, { "epoch": 7.488076311605723, "grad_norm": 3.2936206926161433, "learning_rate": 4.730956807310637e-05, "loss": 0.221, "step": 4710 }, { "epoch": 7.48966613672496, "grad_norm": 5.155836582657859, "learning_rate": 4.731353701619171e-05, "loss": 0.1725, "step": 4711 }, { "epoch": 7.491255961844197, "grad_norm": 2.4761484033994847, "learning_rate": 4.731750348390959e-05, "loss": 0.1319, "step": 4712 }, { "epoch": 7.492845786963434, "grad_norm": 7.997796151686728, "learning_rate": 4.732146747491751e-05, "loss": 0.2, "step": 4713 }, { "epoch": 7.494435612082671, "grad_norm": 3.039593478647834, "learning_rate": 4.732542898787379e-05, "loss": 0.1939, "step": 4714 }, { "epoch": 7.496025437201908, "grad_norm": 4.472770549551557, "learning_rate": 4.732938802143762e-05, "loss": 0.1672, "step": 4715 }, { "epoch": 7.497615262321145, "grad_norm": 3.1513952507067096, "learning_rate": 4.733334457426899e-05, "loss": 0.2079, "step": 4716 }, { "epoch": 7.499205087440382, "grad_norm": 5.458426153729237, "learning_rate": 4.733729864502877e-05, "loss": 0.2682, "step": 4717 }, { "epoch": 7.500794912559618, "grad_norm": 4.226231852706508, "learning_rate": 4.7341250232378634e-05, "loss": 0.252, "step": 4718 }, { "epoch": 7.502384737678855, "grad_norm": 1.7967875331770338, "learning_rate": 4.734519933498112e-05, "loss": 0.1878, "step": 4719 }, { "epoch": 7.503974562798092, "grad_norm": 2.4682409598013098, "learning_rate": 4.73491459514996e-05, "loss": 0.2136, "step": 4720 }, { "epoch": 7.505564387917329, "grad_norm": 2.16699606590182, "learning_rate": 4.735309008059829e-05, "loss": 0.1877, "step": 4721 }, { "epoch": 7.507154213036566, "grad_norm": 1.9773407372854714, "learning_rate": 4.735703172094223e-05, "loss": 0.2204, "step": 4722 }, { "epoch": 7.508744038155803, "grad_norm": 2.221847340094475, "learning_rate": 4.736097087119734e-05, "loss": 0.219, "step": 4723 }, { "epoch": 7.51033386327504, "grad_norm": 2.253443471611839, "learning_rate": 4.7364907530030355e-05, "loss": 0.1843, "step": 4724 }, { "epoch": 7.511923688394276, "grad_norm": 4.240762526070993, "learning_rate": 4.736884169610884e-05, "loss": 0.3385, "step": 4725 }, { "epoch": 7.513513513513513, "grad_norm": 2.0539090475674966, "learning_rate": 4.737277336810125e-05, "loss": 0.2133, "step": 4726 }, { "epoch": 7.51510333863275, "grad_norm": 1.4743084092590693, "learning_rate": 4.737670254467683e-05, "loss": 0.195, "step": 4727 }, { "epoch": 7.516693163751987, "grad_norm": 1.7562589578841121, "learning_rate": 4.738062922450571e-05, "loss": 0.1281, "step": 4728 }, { "epoch": 7.518282988871224, "grad_norm": 1.750779209530418, "learning_rate": 4.7384553406258847e-05, "loss": 0.2146, "step": 4729 }, { "epoch": 7.519872813990461, "grad_norm": 1.4592913766492661, "learning_rate": 4.738847508860804e-05, "loss": 0.1336, "step": 4730 }, { "epoch": 7.521462639109698, "grad_norm": 2.3438049638781915, "learning_rate": 4.739239427022597e-05, "loss": 0.1536, "step": 4731 }, { "epoch": 7.523052464228935, "grad_norm": 1.5234195825683736, "learning_rate": 4.73963109497861e-05, "loss": 0.1792, "step": 4732 }, { "epoch": 7.524642289348172, "grad_norm": 2.041928222211489, "learning_rate": 4.7400225125962794e-05, "loss": 0.1114, "step": 4733 }, { "epoch": 7.526232114467408, "grad_norm": 1.8280261962082247, "learning_rate": 4.7404136797431254e-05, "loss": 0.2361, "step": 4734 }, { "epoch": 7.527821939586645, "grad_norm": 3.36071105748803, "learning_rate": 4.74080459628675e-05, "loss": 0.1426, "step": 4735 }, { "epoch": 7.529411764705882, "grad_norm": 28.649114268005768, "learning_rate": 4.741195262094844e-05, "loss": 4.3817, "step": 4736 }, { "epoch": 7.531001589825119, "grad_norm": 1.601734718346111, "learning_rate": 4.7415856770351794e-05, "loss": 0.1504, "step": 4737 }, { "epoch": 7.532591414944356, "grad_norm": 1.5445450752056085, "learning_rate": 4.741975840975617e-05, "loss": 0.1077, "step": 4738 }, { "epoch": 7.534181240063593, "grad_norm": 1.345083505579829, "learning_rate": 4.742365753784098e-05, "loss": 0.1775, "step": 4739 }, { "epoch": 7.53577106518283, "grad_norm": 1.998266539143038, "learning_rate": 4.742755415328652e-05, "loss": 0.1731, "step": 4740 }, { "epoch": 7.537360890302066, "grad_norm": 1.7413664435036789, "learning_rate": 4.7431448254773944e-05, "loss": 0.1282, "step": 4741 }, { "epoch": 7.538950715421303, "grad_norm": 1.783407789000656, "learning_rate": 4.7435339840985216e-05, "loss": 0.1558, "step": 4742 }, { "epoch": 7.54054054054054, "grad_norm": 51.44892367431455, "learning_rate": 4.7439228910603185e-05, "loss": 10.1336, "step": 4743 }, { "epoch": 7.542130365659777, "grad_norm": 1.4433244049754699, "learning_rate": 4.7443115462311546e-05, "loss": 0.1565, "step": 4744 }, { "epoch": 7.543720190779014, "grad_norm": 1.6683684694506802, "learning_rate": 4.744699949479483e-05, "loss": 0.1369, "step": 4745 }, { "epoch": 7.545310015898251, "grad_norm": 2.0149915466008994, "learning_rate": 4.745088100673844e-05, "loss": 0.1406, "step": 4746 }, { "epoch": 7.546899841017488, "grad_norm": 1.4581996764056835, "learning_rate": 4.745475999682863e-05, "loss": 0.1327, "step": 4747 }, { "epoch": 7.548489666136725, "grad_norm": 1.9748463659862572, "learning_rate": 4.745863646375248e-05, "loss": 0.1573, "step": 4748 }, { "epoch": 7.550079491255962, "grad_norm": 1.30087943561732, "learning_rate": 4.7462510406197986e-05, "loss": 0.1129, "step": 4749 }, { "epoch": 7.5516693163751984, "grad_norm": 2.3818264865709424, "learning_rate": 4.7466381822853916e-05, "loss": 0.2294, "step": 4750 }, { "epoch": 7.5532591414944354, "grad_norm": 1.7245447384240924, "learning_rate": 4.7470250712409964e-05, "loss": 0.1465, "step": 4751 }, { "epoch": 7.5548489666136724, "grad_norm": 1.2101239806431119, "learning_rate": 4.747411707355664e-05, "loss": 0.1489, "step": 4752 }, { "epoch": 7.556438791732909, "grad_norm": 1.6527050953914617, "learning_rate": 4.7477980904985316e-05, "loss": 0.1537, "step": 4753 }, { "epoch": 7.558028616852146, "grad_norm": 2.477854866767092, "learning_rate": 4.748184220538824e-05, "loss": 0.1136, "step": 4754 }, { "epoch": 7.559618441971383, "grad_norm": 1.3041266116483894, "learning_rate": 4.7485700973458495e-05, "loss": 0.0911, "step": 4755 }, { "epoch": 7.56120826709062, "grad_norm": 2.2002470913846137, "learning_rate": 4.748955720789002e-05, "loss": 0.2142, "step": 4756 }, { "epoch": 7.5627980922098565, "grad_norm": 3.0736668768238786, "learning_rate": 4.749341090737763e-05, "loss": 0.1373, "step": 4757 }, { "epoch": 7.5643879173290935, "grad_norm": 1.2933614306714942, "learning_rate": 4.749726207061699e-05, "loss": 0.1718, "step": 4758 }, { "epoch": 7.5659777424483305, "grad_norm": 2.075661618005865, "learning_rate": 4.75011106963046e-05, "loss": 0.1537, "step": 4759 }, { "epoch": 7.5675675675675675, "grad_norm": 3.482582044362261, "learning_rate": 4.750495678313786e-05, "loss": 0.1651, "step": 4760 }, { "epoch": 7.5691573926868045, "grad_norm": 2.879845230783415, "learning_rate": 4.7508800329814994e-05, "loss": 0.1753, "step": 4761 }, { "epoch": 7.5707472178060415, "grad_norm": 3.1621032045861934, "learning_rate": 4.7512641335035116e-05, "loss": 0.1303, "step": 4762 }, { "epoch": 7.5723370429252785, "grad_norm": 1.5750307200935876, "learning_rate": 4.751647979749817e-05, "loss": 0.15, "step": 4763 }, { "epoch": 7.573926868044515, "grad_norm": 3.6782968902022515, "learning_rate": 4.752031571590499e-05, "loss": 0.2155, "step": 4764 }, { "epoch": 7.575516693163752, "grad_norm": 1.8957613870660304, "learning_rate": 4.7524149088957245e-05, "loss": 0.9014, "step": 4765 }, { "epoch": 7.577106518282989, "grad_norm": 3.09632919471064, "learning_rate": 4.752797991535748e-05, "loss": 0.1138, "step": 4766 }, { "epoch": 7.578696343402226, "grad_norm": 2.3797123359675783, "learning_rate": 4.753180819380911e-05, "loss": 0.1767, "step": 4767 }, { "epoch": 7.580286168521463, "grad_norm": 2.219168271269133, "learning_rate": 4.753563392301638e-05, "loss": 0.1215, "step": 4768 }, { "epoch": 7.5818759936407, "grad_norm": 4.128501093189743, "learning_rate": 4.753945710168444e-05, "loss": 0.1421, "step": 4769 }, { "epoch": 7.583465818759937, "grad_norm": 3.2379415637688593, "learning_rate": 4.754327772851926e-05, "loss": 0.199, "step": 4770 }, { "epoch": 7.585055643879174, "grad_norm": 3.323650024918312, "learning_rate": 4.754709580222773e-05, "loss": 0.1795, "step": 4771 }, { "epoch": 7.586645468998411, "grad_norm": 2.5169014239700003, "learning_rate": 4.755091132151753e-05, "loss": 0.196, "step": 4772 }, { "epoch": 7.588235294117647, "grad_norm": 2.279971932912635, "learning_rate": 4.755472428509727e-05, "loss": 0.1747, "step": 4773 }, { "epoch": 7.589825119236884, "grad_norm": 3.4233366033168884, "learning_rate": 4.75585346916764e-05, "loss": 0.1507, "step": 4774 }, { "epoch": 7.591414944356121, "grad_norm": 1.9389525539447425, "learning_rate": 4.756234253996523e-05, "loss": 0.1491, "step": 4775 }, { "epoch": 7.593004769475358, "grad_norm": 2.7711551617651535, "learning_rate": 4.756614782867493e-05, "loss": 0.155, "step": 4776 }, { "epoch": 7.594594594594595, "grad_norm": 1.4968108682548629, "learning_rate": 4.7569950556517563e-05, "loss": 0.0973, "step": 4777 }, { "epoch": 7.596184419713832, "grad_norm": 4.844031944015179, "learning_rate": 4.7573750722206046e-05, "loss": 0.1682, "step": 4778 }, { "epoch": 7.597774244833069, "grad_norm": 2.378610057443689, "learning_rate": 4.757754832445415e-05, "loss": 0.0925, "step": 4779 }, { "epoch": 7.599364069952305, "grad_norm": 2.1966235385502406, "learning_rate": 4.7581343361976524e-05, "loss": 0.2128, "step": 4780 }, { "epoch": 7.600953895071542, "grad_norm": 2.8210170727194317, "learning_rate": 4.7585135833488696e-05, "loss": 0.1792, "step": 4781 }, { "epoch": 7.602543720190779, "grad_norm": 4.554974174590489, "learning_rate": 4.758892573770703e-05, "loss": 0.1984, "step": 4782 }, { "epoch": 7.604133545310016, "grad_norm": 1.903886722329593, "learning_rate": 4.759271307334881e-05, "loss": 0.1721, "step": 4783 }, { "epoch": 7.605723370429253, "grad_norm": 4.481724633862183, "learning_rate": 4.759649783913214e-05, "loss": 0.1396, "step": 4784 }, { "epoch": 7.60731319554849, "grad_norm": 2.478999217334688, "learning_rate": 4.760028003377602e-05, "loss": 0.1472, "step": 4785 }, { "epoch": 7.608903020667727, "grad_norm": 3.007223060046777, "learning_rate": 4.7604059656000314e-05, "loss": 0.1397, "step": 4786 }, { "epoch": 7.610492845786963, "grad_norm": 8.730791661322813, "learning_rate": 4.760783670452575e-05, "loss": 0.5768, "step": 4787 }, { "epoch": 7.6120826709062, "grad_norm": 2.7379108947168147, "learning_rate": 4.7611611178073946e-05, "loss": 0.1851, "step": 4788 }, { "epoch": 7.613672496025437, "grad_norm": 1.9016436329395812, "learning_rate": 4.7615383075367364e-05, "loss": 0.1542, "step": 4789 }, { "epoch": 7.615262321144674, "grad_norm": 2.7668087263854746, "learning_rate": 4.761915239512937e-05, "loss": 0.1782, "step": 4790 }, { "epoch": 7.616852146263911, "grad_norm": 1.6599032105887652, "learning_rate": 4.7622919136084184e-05, "loss": 0.1182, "step": 4791 }, { "epoch": 7.618441971383148, "grad_norm": 27.751020804952294, "learning_rate": 4.762668329695688e-05, "loss": 2.2302, "step": 4792 }, { "epoch": 7.620031796502385, "grad_norm": 1.8951048054325008, "learning_rate": 4.763044487647345e-05, "loss": 0.1233, "step": 4793 }, { "epoch": 7.621621621621622, "grad_norm": 3.0287561561644285, "learning_rate": 4.7634203873360724e-05, "loss": 0.1684, "step": 4794 }, { "epoch": 7.623211446740859, "grad_norm": 1.5345062778426573, "learning_rate": 4.7637960286346424e-05, "loss": 0.1393, "step": 4795 }, { "epoch": 7.624801271860095, "grad_norm": 2.9106308824383476, "learning_rate": 4.7641714114159136e-05, "loss": 0.1832, "step": 4796 }, { "epoch": 7.626391096979332, "grad_norm": 1.996694487050703, "learning_rate": 4.7645465355528325e-05, "loss": 0.1294, "step": 4797 }, { "epoch": 7.627980922098569, "grad_norm": 1.7492567187586974, "learning_rate": 4.764921400918432e-05, "loss": 0.1447, "step": 4798 }, { "epoch": 7.629570747217806, "grad_norm": 1.1793220969434797, "learning_rate": 4.7652960073858356e-05, "loss": 0.1696, "step": 4799 }, { "epoch": 7.631160572337043, "grad_norm": 4.529096893733174, "learning_rate": 4.765670354828252e-05, "loss": 0.1801, "step": 4800 }, { "epoch": 7.63275039745628, "grad_norm": 1.7026156386878346, "learning_rate": 4.766044443118978e-05, "loss": 0.1598, "step": 4801 }, { "epoch": 7.634340222575517, "grad_norm": 3.5516136873338877, "learning_rate": 4.766418272131399e-05, "loss": 0.1969, "step": 4802 }, { "epoch": 7.635930047694753, "grad_norm": 2.2079967799734472, "learning_rate": 4.766791841738986e-05, "loss": 0.1486, "step": 4803 }, { "epoch": 7.63751987281399, "grad_norm": 2.124930609544282, "learning_rate": 4.7671651518153e-05, "loss": 0.1772, "step": 4804 }, { "epoch": 7.639109697933227, "grad_norm": 2.8473318060050126, "learning_rate": 4.767538202233989e-05, "loss": 0.2068, "step": 4805 }, { "epoch": 7.640699523052464, "grad_norm": 2.7634170060588494, "learning_rate": 4.7679109928687886e-05, "loss": 0.1475, "step": 4806 }, { "epoch": 7.642289348171701, "grad_norm": 2.044527922093527, "learning_rate": 4.768283523593523e-05, "loss": 0.1115, "step": 4807 }, { "epoch": 7.643879173290938, "grad_norm": 3.950386450273311, "learning_rate": 4.768655794282105e-05, "loss": 0.1468, "step": 4808 }, { "epoch": 7.645468998410175, "grad_norm": 3.2449097015331616, "learning_rate": 4.769027804808533e-05, "loss": 0.1711, "step": 4809 }, { "epoch": 7.647058823529412, "grad_norm": 2.0167432008104926, "learning_rate": 4.769399555046895e-05, "loss": 0.175, "step": 4810 }, { "epoch": 7.648648648648649, "grad_norm": 2.6208862533654074, "learning_rate": 4.769771044871368e-05, "loss": 0.158, "step": 4811 }, { "epoch": 7.650238473767885, "grad_norm": 2.3912918717986527, "learning_rate": 4.770142274156215e-05, "loss": 0.1708, "step": 4812 }, { "epoch": 7.651828298887122, "grad_norm": 1.5514775690177256, "learning_rate": 4.77051324277579e-05, "loss": 0.1384, "step": 4813 }, { "epoch": 7.653418124006359, "grad_norm": 2.814862550330019, "learning_rate": 4.770883950604531e-05, "loss": 0.134, "step": 4814 }, { "epoch": 7.655007949125596, "grad_norm": 43.67830904971946, "learning_rate": 4.771254397516969e-05, "loss": 5.1057, "step": 4815 }, { "epoch": 7.656597774244833, "grad_norm": 2.8814425819872307, "learning_rate": 4.77162458338772e-05, "loss": 0.1899, "step": 4816 }, { "epoch": 7.65818759936407, "grad_norm": 3.366461473686901, "learning_rate": 4.77199450809149e-05, "loss": 0.1844, "step": 4817 }, { "epoch": 7.659777424483307, "grad_norm": 4.924699834617854, "learning_rate": 4.7723641715030733e-05, "loss": 0.1821, "step": 4818 }, { "epoch": 7.661367249602543, "grad_norm": 2.349006993854446, "learning_rate": 4.772733573497352e-05, "loss": 0.1419, "step": 4819 }, { "epoch": 7.66295707472178, "grad_norm": 86.34540492585505, "learning_rate": 4.773102713949295e-05, "loss": 13.2542, "step": 4820 }, { "epoch": 7.664546899841017, "grad_norm": 4.178754122373659, "learning_rate": 4.7734715927339636e-05, "loss": 0.1568, "step": 4821 }, { "epoch": 7.666136724960254, "grad_norm": 1.8076730756080879, "learning_rate": 4.773840209726507e-05, "loss": 0.1511, "step": 4822 }, { "epoch": 7.667726550079491, "grad_norm": 6.010955414298864, "learning_rate": 4.774208564802158e-05, "loss": 0.1622, "step": 4823 }, { "epoch": 7.669316375198728, "grad_norm": 2.5799361163688626, "learning_rate": 4.7745766578362445e-05, "loss": 0.152, "step": 4824 }, { "epoch": 7.670906200317965, "grad_norm": 3.2691986827547828, "learning_rate": 4.77494448870418e-05, "loss": 0.1753, "step": 4825 }, { "epoch": 7.672496025437201, "grad_norm": 4.815672266979673, "learning_rate": 4.775312057281467e-05, "loss": 0.1878, "step": 4826 }, { "epoch": 7.674085850556438, "grad_norm": 2.9888513525560874, "learning_rate": 4.7756793634436945e-05, "loss": 0.1837, "step": 4827 }, { "epoch": 7.675675675675675, "grad_norm": 3.0308953550536937, "learning_rate": 4.7760464070665465e-05, "loss": 0.1197, "step": 4828 }, { "epoch": 7.677265500794912, "grad_norm": 8.587909077285017, "learning_rate": 4.776413188025789e-05, "loss": 28.0462, "step": 4829 }, { "epoch": 7.678855325914149, "grad_norm": 1.941815489594796, "learning_rate": 4.776779706197282e-05, "loss": 0.1404, "step": 4830 }, { "epoch": 7.680445151033386, "grad_norm": 8.638761538916617, "learning_rate": 4.777145961456971e-05, "loss": 0.1992, "step": 4831 }, { "epoch": 7.682034976152623, "grad_norm": 2.1899099388093406, "learning_rate": 4.777511953680893e-05, "loss": 0.1636, "step": 4832 }, { "epoch": 7.68362480127186, "grad_norm": 3.2966946054890993, "learning_rate": 4.777877682745171e-05, "loss": 0.1725, "step": 4833 }, { "epoch": 7.685214626391097, "grad_norm": 6.20409913863841, "learning_rate": 4.778243148526021e-05, "loss": 0.2071, "step": 4834 }, { "epoch": 7.6868044515103335, "grad_norm": 5.307366886487229, "learning_rate": 4.778608350899745e-05, "loss": 0.1797, "step": 4835 }, { "epoch": 7.6883942766295705, "grad_norm": 2.2138608774455952, "learning_rate": 4.778973289742736e-05, "loss": 0.2032, "step": 4836 }, { "epoch": 7.6899841017488075, "grad_norm": 6.247543299588143, "learning_rate": 4.779337964931475e-05, "loss": 0.2135, "step": 4837 }, { "epoch": 7.6915739268680445, "grad_norm": 5.703974704592912, "learning_rate": 4.779702376342531e-05, "loss": 0.152, "step": 4838 }, { "epoch": 7.6931637519872815, "grad_norm": 1.7663526446162068, "learning_rate": 4.7800665238525666e-05, "loss": 0.1344, "step": 4839 }, { "epoch": 7.6947535771065185, "grad_norm": 7.217796914777775, "learning_rate": 4.78043040733833e-05, "loss": 0.188, "step": 4840 }, { "epoch": 7.6963434022257555, "grad_norm": 3.2411011228412674, "learning_rate": 4.780794026676659e-05, "loss": 0.1779, "step": 4841 }, { "epoch": 7.697933227344992, "grad_norm": 3.705838019085251, "learning_rate": 4.7811573817444834e-05, "loss": 0.1848, "step": 4842 }, { "epoch": 7.699523052464229, "grad_norm": 4.244345211545069, "learning_rate": 4.781520472418819e-05, "loss": 0.1451, "step": 4843 }, { "epoch": 7.701112877583466, "grad_norm": 2.494870191204854, "learning_rate": 4.781883298576773e-05, "loss": 0.1693, "step": 4844 }, { "epoch": 7.702702702702703, "grad_norm": 3.7886826839151198, "learning_rate": 4.7822458600955426e-05, "loss": 0.1389, "step": 4845 }, { "epoch": 7.70429252782194, "grad_norm": 3.3716713277912156, "learning_rate": 4.7826081568524144e-05, "loss": 0.175, "step": 4846 }, { "epoch": 7.705882352941177, "grad_norm": 4.6290037961701875, "learning_rate": 4.782970188724762e-05, "loss": 0.1545, "step": 4847 }, { "epoch": 7.707472178060414, "grad_norm": 5.008468793844193, "learning_rate": 4.783331955590052e-05, "loss": 0.1613, "step": 4848 }, { "epoch": 7.709062003179651, "grad_norm": 3.2710190445831806, "learning_rate": 4.7836934573258396e-05, "loss": 0.1095, "step": 4849 }, { "epoch": 7.710651828298887, "grad_norm": 4.187683332824225, "learning_rate": 4.784054693809769e-05, "loss": 0.1716, "step": 4850 }, { "epoch": 7.712241653418124, "grad_norm": 3.9547069636363887, "learning_rate": 4.784415664919576e-05, "loss": 0.1689, "step": 4851 }, { "epoch": 7.713831478537361, "grad_norm": 3.024567056434687, "learning_rate": 4.784776370533084e-05, "loss": 0.2025, "step": 4852 }, { "epoch": 7.715421303656598, "grad_norm": 4.092607250776077, "learning_rate": 4.7851368105282055e-05, "loss": 0.1782, "step": 4853 }, { "epoch": 7.717011128775835, "grad_norm": 4.795757882785464, "learning_rate": 4.7854969847829474e-05, "loss": 0.1679, "step": 4854 }, { "epoch": 7.718600953895072, "grad_norm": 4.144649297081837, "learning_rate": 4.785856893175402e-05, "loss": 0.1671, "step": 4855 }, { "epoch": 7.720190779014309, "grad_norm": 6.301477941112075, "learning_rate": 4.786216535583754e-05, "loss": 0.1434, "step": 4856 }, { "epoch": 7.721780604133546, "grad_norm": 2.7178251560997504, "learning_rate": 4.7865759118862785e-05, "loss": 0.1548, "step": 4857 }, { "epoch": 7.723370429252782, "grad_norm": 2.95976233048554, "learning_rate": 4.786935021961337e-05, "loss": 0.1649, "step": 4858 }, { "epoch": 7.724960254372019, "grad_norm": 7.604426793717746, "learning_rate": 4.7872938656873865e-05, "loss": 0.2387, "step": 4859 }, { "epoch": 7.726550079491256, "grad_norm": 2.174276802192697, "learning_rate": 4.78765244294297e-05, "loss": 0.1113, "step": 4860 }, { "epoch": 7.728139904610493, "grad_norm": 3.8805292812049044, "learning_rate": 4.788010753606722e-05, "loss": 0.1593, "step": 4861 }, { "epoch": 7.72972972972973, "grad_norm": 3.555134815355614, "learning_rate": 4.788368797557368e-05, "loss": 0.1184, "step": 4862 }, { "epoch": 7.731319554848967, "grad_norm": 7.951382167906444, "learning_rate": 4.788726574673723e-05, "loss": 0.1805, "step": 4863 }, { "epoch": 7.732909379968204, "grad_norm": 2.5168882791677594, "learning_rate": 4.789084084834691e-05, "loss": 0.1687, "step": 4864 }, { "epoch": 7.73449920508744, "grad_norm": 4.5787408520337705, "learning_rate": 4.789441327919269e-05, "loss": 0.1918, "step": 4865 }, { "epoch": 7.736089030206677, "grad_norm": 5.901620815009177, "learning_rate": 4.789798303806544e-05, "loss": 0.1853, "step": 4866 }, { "epoch": 7.737678855325914, "grad_norm": 4.688257340758661, "learning_rate": 4.790155012375691e-05, "loss": 0.1701, "step": 4867 }, { "epoch": 7.739268680445151, "grad_norm": 5.670597067576088, "learning_rate": 4.790511453505977e-05, "loss": 0.1317, "step": 4868 }, { "epoch": 7.740858505564388, "grad_norm": 4.622546343518444, "learning_rate": 4.790867627076761e-05, "loss": 0.1734, "step": 4869 }, { "epoch": 7.742448330683625, "grad_norm": 5.1573734395383815, "learning_rate": 4.79122353296749e-05, "loss": 0.1894, "step": 4870 }, { "epoch": 7.744038155802862, "grad_norm": 4.2621720310157585, "learning_rate": 4.791579171057704e-05, "loss": 0.1793, "step": 4871 }, { "epoch": 7.745627980922099, "grad_norm": 4.435375773390268, "learning_rate": 4.7919345412270306e-05, "loss": 0.1696, "step": 4872 }, { "epoch": 7.747217806041336, "grad_norm": 2.3945174558290327, "learning_rate": 4.792289643355191e-05, "loss": 0.1638, "step": 4873 }, { "epoch": 7.748807631160572, "grad_norm": 1.9890454351364455, "learning_rate": 4.792644477321995e-05, "loss": 0.1568, "step": 4874 }, { "epoch": 7.750397456279809, "grad_norm": 3.1940890099101154, "learning_rate": 4.792999043007347e-05, "loss": 0.2603, "step": 4875 }, { "epoch": 7.751987281399046, "grad_norm": 17.32519917476074, "learning_rate": 4.793353340291235e-05, "loss": 2.2677, "step": 4876 }, { "epoch": 7.753577106518283, "grad_norm": 4.527711701871871, "learning_rate": 4.7937073690537456e-05, "loss": 0.1758, "step": 4877 }, { "epoch": 7.75516693163752, "grad_norm": 3.6039058619977555, "learning_rate": 4.7940611291750514e-05, "loss": 0.1502, "step": 4878 }, { "epoch": 7.756756756756757, "grad_norm": 4.331219215680899, "learning_rate": 4.7944146205354186e-05, "loss": 0.1947, "step": 4879 }, { "epoch": 7.758346581875994, "grad_norm": 4.982291651447193, "learning_rate": 4.7947678430152016e-05, "loss": 0.1933, "step": 4880 }, { "epoch": 7.75993640699523, "grad_norm": 4.167488160064972, "learning_rate": 4.795120796494849e-05, "loss": 0.1368, "step": 4881 }, { "epoch": 7.761526232114467, "grad_norm": 4.055252257055197, "learning_rate": 4.7954734808548964e-05, "loss": 0.149, "step": 4882 }, { "epoch": 7.763116057233704, "grad_norm": 8.49608409897939, "learning_rate": 4.7958258959759754e-05, "loss": 0.3281, "step": 4883 }, { "epoch": 7.764705882352941, "grad_norm": 3.4343683080686853, "learning_rate": 4.7961780417388045e-05, "loss": 0.1487, "step": 4884 }, { "epoch": 7.766295707472178, "grad_norm": 4.919509848077963, "learning_rate": 4.796529918024197e-05, "loss": 0.1434, "step": 4885 }, { "epoch": 7.767885532591415, "grad_norm": 5.521508602156541, "learning_rate": 4.796881524713053e-05, "loss": 0.2172, "step": 4886 }, { "epoch": 7.769475357710652, "grad_norm": 3.550189502190708, "learning_rate": 4.7972328616863695e-05, "loss": 0.2002, "step": 4887 }, { "epoch": 7.771065182829888, "grad_norm": 4.0170862647156635, "learning_rate": 4.797583928825228e-05, "loss": 0.214, "step": 4888 }, { "epoch": 7.772655007949125, "grad_norm": 4.737471834537013, "learning_rate": 4.797934726010809e-05, "loss": 0.1481, "step": 4889 }, { "epoch": 7.774244833068362, "grad_norm": 3.642311542246314, "learning_rate": 4.798285253124377e-05, "loss": 0.2271, "step": 4890 }, { "epoch": 7.775834658187599, "grad_norm": 2.59105675825928, "learning_rate": 4.798635510047293e-05, "loss": 0.1685, "step": 4891 }, { "epoch": 7.777424483306836, "grad_norm": 2.197181112600223, "learning_rate": 4.798985496661007e-05, "loss": 0.135, "step": 4892 }, { "epoch": 7.779014308426073, "grad_norm": 3.48471007541512, "learning_rate": 4.799335212847062e-05, "loss": 0.1306, "step": 4893 }, { "epoch": 7.78060413354531, "grad_norm": 3.056013578425701, "learning_rate": 4.799684658487091e-05, "loss": 0.1186, "step": 4894 }, { "epoch": 7.782193958664547, "grad_norm": 3.5535282736313762, "learning_rate": 4.800033833462819e-05, "loss": 0.1439, "step": 4895 }, { "epoch": 7.783783783783784, "grad_norm": 2.675849154433615, "learning_rate": 4.800382737656064e-05, "loss": 0.1791, "step": 4896 }, { "epoch": 7.78537360890302, "grad_norm": 4.58423152103168, "learning_rate": 4.800731370948734e-05, "loss": 0.1792, "step": 4897 }, { "epoch": 7.786963434022257, "grad_norm": 2.0729235708226637, "learning_rate": 4.8010797332228294e-05, "loss": 0.1527, "step": 4898 }, { "epoch": 7.788553259141494, "grad_norm": 5.468472774126725, "learning_rate": 4.801427824360441e-05, "loss": 0.1866, "step": 4899 }, { "epoch": 7.790143084260731, "grad_norm": 3.772952143291985, "learning_rate": 4.801775644243754e-05, "loss": 0.1975, "step": 4900 }, { "epoch": 7.791732909379968, "grad_norm": 4.326817190959473, "learning_rate": 4.802123192755044e-05, "loss": 0.1809, "step": 4901 }, { "epoch": 7.793322734499205, "grad_norm": 6.195245824248576, "learning_rate": 4.8024704697766774e-05, "loss": 0.1545, "step": 4902 }, { "epoch": 7.794912559618442, "grad_norm": 3.433346259110007, "learning_rate": 4.802817475191115e-05, "loss": 0.1766, "step": 4903 }, { "epoch": 7.796502384737678, "grad_norm": 5.628266352613258, "learning_rate": 4.8031642088809064e-05, "loss": 0.1576, "step": 4904 }, { "epoch": 7.798092209856915, "grad_norm": 3.692341727245951, "learning_rate": 4.803510670728695e-05, "loss": 0.1631, "step": 4905 }, { "epoch": 7.799682034976152, "grad_norm": 6.794840960498299, "learning_rate": 4.803856860617217e-05, "loss": 0.2067, "step": 4906 }, { "epoch": 7.801271860095389, "grad_norm": 6.397007367718696, "learning_rate": 4.8042027784293e-05, "loss": 0.244, "step": 4907 }, { "epoch": 7.802861685214626, "grad_norm": 3.8735856557977844, "learning_rate": 4.804548424047861e-05, "loss": 0.1543, "step": 4908 }, { "epoch": 7.804451510333863, "grad_norm": 4.531061281199375, "learning_rate": 4.804893797355914e-05, "loss": 0.1502, "step": 4909 }, { "epoch": 7.8060413354531, "grad_norm": 3.7018281863408946, "learning_rate": 4.805238898236562e-05, "loss": 0.199, "step": 4910 }, { "epoch": 7.807631160572337, "grad_norm": 3.1156121204912917, "learning_rate": 4.805583726573e-05, "loss": 0.1495, "step": 4911 }, { "epoch": 7.809220985691574, "grad_norm": 3.303473257271987, "learning_rate": 4.805928282248516e-05, "loss": 0.2053, "step": 4912 }, { "epoch": 7.8108108108108105, "grad_norm": 3.598543308071502, "learning_rate": 4.806272565146492e-05, "loss": 0.1852, "step": 4913 }, { "epoch": 7.8124006359300475, "grad_norm": 3.5785539744186483, "learning_rate": 4.8066165751503984e-05, "loss": 0.1204, "step": 4914 }, { "epoch": 7.8139904610492845, "grad_norm": 3.661590418799151, "learning_rate": 4.806960312143802e-05, "loss": 0.1953, "step": 4915 }, { "epoch": 7.8155802861685215, "grad_norm": 2.957558397410124, "learning_rate": 4.80730377601036e-05, "loss": 0.1853, "step": 4916 }, { "epoch": 7.8171701112877585, "grad_norm": 2.4502807742983346, "learning_rate": 4.807646966633822e-05, "loss": 0.1266, "step": 4917 }, { "epoch": 7.8187599364069955, "grad_norm": 4.311878999091106, "learning_rate": 4.807989883898031e-05, "loss": 0.1458, "step": 4918 }, { "epoch": 7.8203497615262325, "grad_norm": 4.083731894959133, "learning_rate": 4.808332527686921e-05, "loss": 0.1952, "step": 4919 }, { "epoch": 7.821939586645469, "grad_norm": 3.2306281618224553, "learning_rate": 4.80867489788452e-05, "loss": 0.135, "step": 4920 }, { "epoch": 7.823529411764706, "grad_norm": 3.226340466006041, "learning_rate": 4.809016994374947e-05, "loss": 0.2138, "step": 4921 }, { "epoch": 7.825119236883943, "grad_norm": 20.61238321127757, "learning_rate": 4.809358817042417e-05, "loss": 2.9132, "step": 4922 }, { "epoch": 7.82670906200318, "grad_norm": 3.8745391467908914, "learning_rate": 4.809700365771234e-05, "loss": 0.1781, "step": 4923 }, { "epoch": 7.828298887122417, "grad_norm": 3.509846113538995, "learning_rate": 4.810041640445796e-05, "loss": 0.1543, "step": 4924 }, { "epoch": 7.829888712241654, "grad_norm": 2.3764799870692426, "learning_rate": 4.810382640950595e-05, "loss": 0.115, "step": 4925 }, { "epoch": 7.831478537360891, "grad_norm": 2.652961818009014, "learning_rate": 4.8107233671702124e-05, "loss": 0.1746, "step": 4926 }, { "epoch": 7.833068362480127, "grad_norm": 2.686173328715501, "learning_rate": 4.811063818989327e-05, "loss": 0.1792, "step": 4927 }, { "epoch": 7.834658187599364, "grad_norm": 208.27179330863018, "learning_rate": 4.811403996292707e-05, "loss": 14.0542, "step": 4928 }, { "epoch": 7.836248012718601, "grad_norm": 3.3088274349735487, "learning_rate": 4.8117438989652145e-05, "loss": 0.1676, "step": 4929 }, { "epoch": 7.837837837837838, "grad_norm": 4.262856600110085, "learning_rate": 4.812083526891807e-05, "loss": 0.1949, "step": 4930 }, { "epoch": 7.839427662957075, "grad_norm": 2.1374844179255272, "learning_rate": 4.8124228799575296e-05, "loss": 0.124, "step": 4931 }, { "epoch": 7.841017488076312, "grad_norm": 2.681152960889788, "learning_rate": 4.812761958047525e-05, "loss": 0.1275, "step": 4932 }, { "epoch": 7.842607313195549, "grad_norm": 12.863216084517088, "learning_rate": 4.8131007610470275e-05, "loss": 0.7009, "step": 4933 }, { "epoch": 7.844197138314786, "grad_norm": 3.7485051764350734, "learning_rate": 4.8134392888413654e-05, "loss": 0.1792, "step": 4934 }, { "epoch": 7.845786963434023, "grad_norm": 3.026990433203727, "learning_rate": 4.8137775413159575e-05, "loss": 0.188, "step": 4935 }, { "epoch": 7.847376788553259, "grad_norm": 2.1007247296818132, "learning_rate": 4.8141155183563196e-05, "loss": 0.1603, "step": 4936 }, { "epoch": 7.848966613672496, "grad_norm": 1.999488471969087, "learning_rate": 4.8144532198480576e-05, "loss": 0.1537, "step": 4937 }, { "epoch": 7.850556438791733, "grad_norm": 3.128559512297237, "learning_rate": 4.814790645676871e-05, "loss": 0.1304, "step": 4938 }, { "epoch": 7.85214626391097, "grad_norm": 2.0926042102396143, "learning_rate": 4.8151277957285545e-05, "loss": 0.183, "step": 4939 }, { "epoch": 7.853736089030207, "grad_norm": 1.6195699862714954, "learning_rate": 4.815464669888995e-05, "loss": 0.1365, "step": 4940 }, { "epoch": 7.855325914149444, "grad_norm": 2.809229748432529, "learning_rate": 4.815801268044172e-05, "loss": 0.1787, "step": 4941 }, { "epoch": 7.856915739268681, "grad_norm": 2.0247053277342633, "learning_rate": 4.8161375900801604e-05, "loss": 0.1713, "step": 4942 }, { "epoch": 7.858505564387917, "grad_norm": 2.2090789953658363, "learning_rate": 4.8164736358831266e-05, "loss": 0.1527, "step": 4943 }, { "epoch": 7.860095389507154, "grad_norm": 1.9611068478568885, "learning_rate": 4.816809405339331e-05, "loss": 0.1993, "step": 4944 }, { "epoch": 7.861685214626391, "grad_norm": 1.1890423270793156, "learning_rate": 4.817144898335129e-05, "loss": 0.1809, "step": 4945 }, { "epoch": 7.863275039745628, "grad_norm": 3.5338773883974666, "learning_rate": 4.817480114756967e-05, "loss": 0.2079, "step": 4946 }, { "epoch": 7.864864864864865, "grad_norm": 1.6960537379975122, "learning_rate": 4.817815054491387e-05, "loss": 0.1478, "step": 4947 }, { "epoch": 7.866454689984102, "grad_norm": 2.9087269689041975, "learning_rate": 4.818149717425024e-05, "loss": 0.163, "step": 4948 }, { "epoch": 7.868044515103339, "grad_norm": 1.6404688098672655, "learning_rate": 4.8184841034446064e-05, "loss": 0.1332, "step": 4949 }, { "epoch": 7.869634340222575, "grad_norm": 2.553962142007059, "learning_rate": 4.818818212436957e-05, "loss": 0.2385, "step": 4950 }, { "epoch": 7.871224165341812, "grad_norm": 3.0910186405873987, "learning_rate": 4.819152044288992e-05, "loss": 0.1582, "step": 4951 }, { "epoch": 7.872813990461049, "grad_norm": 36.03967410521832, "learning_rate": 4.819485598887722e-05, "loss": 0.9207, "step": 4952 }, { "epoch": 7.874403815580286, "grad_norm": 2.0236553660488026, "learning_rate": 4.819818876120249e-05, "loss": 0.1926, "step": 4953 }, { "epoch": 7.875993640699523, "grad_norm": 4.184349843779772, "learning_rate": 4.820151875873772e-05, "loss": 0.222, "step": 4954 }, { "epoch": 7.87758346581876, "grad_norm": 2.210387507969358, "learning_rate": 4.820484598035584e-05, "loss": 0.1751, "step": 4955 }, { "epoch": 7.879173290937997, "grad_norm": 6297.770638160034, "learning_rate": 4.8208170424930675e-05, "loss": 10.4534, "step": 4956 }, { "epoch": 7.880763116057234, "grad_norm": 3.8524524340204884, "learning_rate": 4.821149209133705e-05, "loss": 0.1446, "step": 4957 }, { "epoch": 7.882352941176471, "grad_norm": 3.82605314730378, "learning_rate": 4.821481097845068e-05, "loss": 0.1254, "step": 4958 }, { "epoch": 7.883942766295707, "grad_norm": 2.2741560276795423, "learning_rate": 4.821812708514824e-05, "loss": 0.168, "step": 4959 }, { "epoch": 7.885532591414944, "grad_norm": 120.86045405974924, "learning_rate": 4.8221440410307376e-05, "loss": 1.3924, "step": 4960 }, { "epoch": 7.887122416534181, "grad_norm": 3.909459770430315, "learning_rate": 4.822475095280662e-05, "loss": 0.1521, "step": 4961 }, { "epoch": 7.888712241653418, "grad_norm": 5.3537000396522645, "learning_rate": 4.8228058711525496e-05, "loss": 0.2861, "step": 4962 }, { "epoch": 7.890302066772655, "grad_norm": 3.868391643518189, "learning_rate": 4.8231363685344426e-05, "loss": 0.227, "step": 4963 }, { "epoch": 7.891891891891892, "grad_norm": 4.908552289778511, "learning_rate": 4.82346658731448e-05, "loss": 0.1837, "step": 4964 }, { "epoch": 7.893481717011129, "grad_norm": 3.925555464974056, "learning_rate": 4.8237965273808956e-05, "loss": 1.4195, "step": 4965 }, { "epoch": 7.895071542130365, "grad_norm": 3.755552242890727, "learning_rate": 4.824126188622016e-05, "loss": 0.2346, "step": 4966 }, { "epoch": 7.896661367249602, "grad_norm": 5.485537256042443, "learning_rate": 4.824455570926263e-05, "loss": 0.2818, "step": 4967 }, { "epoch": 7.898251192368839, "grad_norm": 3.5866565661282523, "learning_rate": 4.824784674182152e-05, "loss": 0.1539, "step": 4968 }, { "epoch": 7.899841017488076, "grad_norm": 2.8630403838349814, "learning_rate": 4.8251134982782956e-05, "loss": 0.2061, "step": 4969 }, { "epoch": 7.901430842607313, "grad_norm": 2.8642125361057658, "learning_rate": 4.8254420431033964e-05, "loss": 0.1734, "step": 4970 }, { "epoch": 7.90302066772655, "grad_norm": 1.9940617647341292, "learning_rate": 4.8257703085462547e-05, "loss": 0.1343, "step": 4971 }, { "epoch": 7.904610492845787, "grad_norm": 2.8690220318202786, "learning_rate": 4.826098294495764e-05, "loss": 0.2251, "step": 4972 }, { "epoch": 7.906200317965024, "grad_norm": 2.4856491389265605, "learning_rate": 4.8264260008409135e-05, "loss": 0.1804, "step": 4973 }, { "epoch": 7.907790143084261, "grad_norm": 2.7832817978862643, "learning_rate": 4.8267534274707873e-05, "loss": 0.1704, "step": 4974 }, { "epoch": 7.909379968203497, "grad_norm": 2.485709787805316, "learning_rate": 4.8270805742745626e-05, "loss": 0.1077, "step": 4975 }, { "epoch": 7.910969793322734, "grad_norm": 2.041579058703087, "learning_rate": 4.827407441141511e-05, "loss": 0.1658, "step": 4976 }, { "epoch": 7.912559618441971, "grad_norm": 3.5393131999962026, "learning_rate": 4.827734027961001e-05, "loss": 0.1621, "step": 4977 }, { "epoch": 7.914149443561208, "grad_norm": 2.4866715331014344, "learning_rate": 4.828060334622495e-05, "loss": 0.3023, "step": 4978 }, { "epoch": 7.915739268680445, "grad_norm": 1.8636972870552049, "learning_rate": 4.828386361015549e-05, "loss": 0.1323, "step": 4979 }, { "epoch": 7.917329093799682, "grad_norm": 1.911153724512037, "learning_rate": 4.828712107029816e-05, "loss": 0.1595, "step": 4980 }, { "epoch": 7.918918918918919, "grad_norm": 1.9949316022347554, "learning_rate": 4.829037572555042e-05, "loss": 0.182, "step": 4981 }, { "epoch": 7.920508744038155, "grad_norm": 3.007769842892119, "learning_rate": 4.829362757481069e-05, "loss": 0.2447, "step": 4982 }, { "epoch": 7.922098569157392, "grad_norm": 2.267652205254621, "learning_rate": 4.829687661697834e-05, "loss": 0.1728, "step": 4983 }, { "epoch": 7.923688394276629, "grad_norm": 3.0160224837107217, "learning_rate": 4.8300122850953675e-05, "loss": 0.1868, "step": 4984 }, { "epoch": 7.925278219395866, "grad_norm": 3.974766516167264, "learning_rate": 4.8303366275637975e-05, "loss": 0.1843, "step": 4985 }, { "epoch": 7.926868044515103, "grad_norm": 3.510510741236876, "learning_rate": 4.830660688993346e-05, "loss": 0.1805, "step": 4986 }, { "epoch": 7.92845786963434, "grad_norm": 3.2329235420290963, "learning_rate": 4.8309844692743284e-05, "loss": 0.1473, "step": 4987 }, { "epoch": 7.930047694753577, "grad_norm": 1.8636818152672503, "learning_rate": 4.8313079682971575e-05, "loss": 0.2082, "step": 4988 }, { "epoch": 7.9316375198728135, "grad_norm": 4.150152789953504, "learning_rate": 4.8316311859523424e-05, "loss": 0.2378, "step": 4989 }, { "epoch": 7.9332273449920505, "grad_norm": 5.461450180775667, "learning_rate": 4.831954122130483e-05, "loss": 0.3557, "step": 4990 }, { "epoch": 7.9348171701112875, "grad_norm": 3.414994117779668, "learning_rate": 4.832276776722278e-05, "loss": 0.1343, "step": 4991 }, { "epoch": 7.9364069952305245, "grad_norm": 3.1187476501957017, "learning_rate": 4.832599149618521e-05, "loss": 0.165, "step": 4992 }, { "epoch": 7.9379968203497615, "grad_norm": 3.520146182749627, "learning_rate": 4.832921240710099e-05, "loss": 0.1717, "step": 4993 }, { "epoch": 7.9395866454689985, "grad_norm": 2.165721718800238, "learning_rate": 4.8332430498879984e-05, "loss": 0.1634, "step": 4994 }, { "epoch": 7.9411764705882355, "grad_norm": 2.972952135351359, "learning_rate": 4.833564577043297e-05, "loss": 0.121, "step": 4995 }, { "epoch": 7.9427662957074725, "grad_norm": 7.257678551190708, "learning_rate": 4.8338858220671684e-05, "loss": 0.3683, "step": 4996 }, { "epoch": 7.9443561208267095, "grad_norm": 2.633176318003089, "learning_rate": 4.834206784850885e-05, "loss": 0.1805, "step": 4997 }, { "epoch": 7.945945945945946, "grad_norm": 2.6103477711132403, "learning_rate": 4.83452746528581e-05, "loss": 0.2239, "step": 4998 }, { "epoch": 7.947535771065183, "grad_norm": 3.9649821498683635, "learning_rate": 4.834847863263407e-05, "loss": 0.1448, "step": 4999 }, { "epoch": 7.94912559618442, "grad_norm": 2.0525047929742555, "learning_rate": 4.835167978675231e-05, "loss": 0.1375, "step": 5000 }, { "epoch": 7.950715421303657, "grad_norm": 5.677274030606974, "learning_rate": 4.835487811412937e-05, "loss": 0.1743, "step": 5001 }, { "epoch": 7.952305246422894, "grad_norm": 2.8843080693596956, "learning_rate": 4.83580736136827e-05, "loss": 0.223, "step": 5002 }, { "epoch": 7.953895071542131, "grad_norm": 3.515493977270323, "learning_rate": 4.836126628433077e-05, "loss": 0.1867, "step": 5003 }, { "epoch": 7.955484896661368, "grad_norm": 4.489343337604896, "learning_rate": 4.836445612499296e-05, "loss": 0.1714, "step": 5004 }, { "epoch": 7.957074721780604, "grad_norm": 3.331081329995802, "learning_rate": 4.8367643134589624e-05, "loss": 0.186, "step": 5005 }, { "epoch": 7.958664546899841, "grad_norm": 4.881074649041259, "learning_rate": 4.837082731204207e-05, "loss": 0.1529, "step": 5006 }, { "epoch": 7.960254372019078, "grad_norm": 3.51735837917988, "learning_rate": 4.8374008656272586e-05, "loss": 0.1149, "step": 5007 }, { "epoch": 7.961844197138315, "grad_norm": 72.72641604663492, "learning_rate": 4.837718716620439e-05, "loss": 2.4093, "step": 5008 }, { "epoch": 7.963434022257552, "grad_norm": 3.656394431152972, "learning_rate": 4.8380362840761676e-05, "loss": 0.1938, "step": 5009 }, { "epoch": 7.965023847376789, "grad_norm": 4.53340132389491, "learning_rate": 4.838353567886959e-05, "loss": 0.1221, "step": 5010 }, { "epoch": 7.966613672496026, "grad_norm": 4.626590611320177, "learning_rate": 4.838670567945424e-05, "loss": 0.1367, "step": 5011 }, { "epoch": 7.968203497615263, "grad_norm": 21.287611978067964, "learning_rate": 4.8389872841442705e-05, "loss": 1.2258, "step": 5012 }, { "epoch": 7.9697933227345, "grad_norm": 4.63752469798867, "learning_rate": 4.8393037163763e-05, "loss": 0.181, "step": 5013 }, { "epoch": 7.971383147853736, "grad_norm": 2.333990262694119, "learning_rate": 4.8396198645344134e-05, "loss": 0.1516, "step": 5014 }, { "epoch": 7.972972972972973, "grad_norm": 5.377266394369033, "learning_rate": 4.8399357285116045e-05, "loss": 0.1836, "step": 5015 }, { "epoch": 7.97456279809221, "grad_norm": 4.711071915768296, "learning_rate": 4.840251308200966e-05, "loss": 0.1612, "step": 5016 }, { "epoch": 7.976152623211447, "grad_norm": 1.4702010273453119, "learning_rate": 4.840566603495684e-05, "loss": 0.1361, "step": 5017 }, { "epoch": 7.977742448330684, "grad_norm": 4.3272317999364525, "learning_rate": 4.840881614289045e-05, "loss": 0.182, "step": 5018 }, { "epoch": 7.979332273449921, "grad_norm": 2.8566255497366932, "learning_rate": 4.841196340474427e-05, "loss": 0.175, "step": 5019 }, { "epoch": 7.980922098569158, "grad_norm": 2.5530906168616774, "learning_rate": 4.841510781945306e-05, "loss": 0.1966, "step": 5020 }, { "epoch": 7.982511923688394, "grad_norm": 2.574887823804332, "learning_rate": 4.841824938595258e-05, "loss": 0.1303, "step": 5021 }, { "epoch": 7.984101748807631, "grad_norm": 2.7344968921442794, "learning_rate": 4.84213881031795e-05, "loss": 0.1467, "step": 5022 }, { "epoch": 7.985691573926868, "grad_norm": 1.7139668977011346, "learning_rate": 4.8424523970071476e-05, "loss": 0.1373, "step": 5023 }, { "epoch": 7.987281399046105, "grad_norm": 2.1863213701401603, "learning_rate": 4.842765698556715e-05, "loss": 0.1708, "step": 5024 }, { "epoch": 7.988871224165342, "grad_norm": 2.9409486271313074, "learning_rate": 4.8430787148606084e-05, "loss": 0.1747, "step": 5025 }, { "epoch": 7.990461049284579, "grad_norm": 2.7856634254795534, "learning_rate": 4.843391445812886e-05, "loss": 0.2679, "step": 5026 }, { "epoch": 7.992050874403816, "grad_norm": 2.356756428038399, "learning_rate": 4.8437038913076975e-05, "loss": 0.1242, "step": 5027 }, { "epoch": 7.993640699523052, "grad_norm": 1.6939474389965583, "learning_rate": 4.844016051239292e-05, "loss": 0.123, "step": 5028 }, { "epoch": 7.995230524642289, "grad_norm": 1.1415276109176946, "learning_rate": 4.8443279255020146e-05, "loss": 0.1174, "step": 5029 }, { "epoch": 7.996820349761526, "grad_norm": 2.7278707722831483, "learning_rate": 4.844639513990309e-05, "loss": 0.1782, "step": 5030 }, { "epoch": 7.998410174880763, "grad_norm": 1.8911915656045215, "learning_rate": 4.84495081659871e-05, "loss": 0.1361, "step": 5031 }, { "epoch": 8.0, "grad_norm": 0.925024960554119, "learning_rate": 4.845261833221856e-05, "loss": 0.1079, "step": 5032 }, { "epoch": 8.001589825119236, "grad_norm": 1.989107764401011, "learning_rate": 4.8455725637544785e-05, "loss": 0.1675, "step": 5033 }, { "epoch": 8.003179650238474, "grad_norm": 2.10204168035289, "learning_rate": 4.8458830080914055e-05, "loss": 0.1427, "step": 5034 }, { "epoch": 8.00476947535771, "grad_norm": 2.4918120409684743, "learning_rate": 4.846193166127564e-05, "loss": 0.104, "step": 5035 }, { "epoch": 8.006359300476948, "grad_norm": 2.5552538175072783, "learning_rate": 4.846503037757976e-05, "loss": 0.2282, "step": 5036 }, { "epoch": 8.007949125596184, "grad_norm": 17.708679381229274, "learning_rate": 4.846812622877762e-05, "loss": 0.6405, "step": 5037 }, { "epoch": 8.009538950715422, "grad_norm": 29.95816071195867, "learning_rate": 4.8471219213821375e-05, "loss": 0.9014, "step": 5038 }, { "epoch": 8.011128775834658, "grad_norm": 4.010814833868078, "learning_rate": 4.847430933166417e-05, "loss": 0.1559, "step": 5039 }, { "epoch": 8.012718600953894, "grad_norm": 1.3113955660256025, "learning_rate": 4.8477396581260104e-05, "loss": 0.159, "step": 5040 }, { "epoch": 8.014308426073132, "grad_norm": 3.3830668105294075, "learning_rate": 4.848048096156426e-05, "loss": 0.2008, "step": 5041 }, { "epoch": 8.015898251192368, "grad_norm": 1.6304903489434344, "learning_rate": 4.848356247153269e-05, "loss": 0.0997, "step": 5042 }, { "epoch": 8.017488076311606, "grad_norm": 2.053729717927404, "learning_rate": 4.848664111012241e-05, "loss": 0.1181, "step": 5043 }, { "epoch": 8.019077901430842, "grad_norm": 2.339705707559315, "learning_rate": 4.848971687629142e-05, "loss": 0.2172, "step": 5044 }, { "epoch": 8.02066772655008, "grad_norm": 1.9049209585492022, "learning_rate": 4.849278976899867e-05, "loss": 0.1698, "step": 5045 }, { "epoch": 8.022257551669316, "grad_norm": 1.7479877192775553, "learning_rate": 4.849585978720411e-05, "loss": 0.1744, "step": 5046 }, { "epoch": 8.023847376788554, "grad_norm": 2.601232710297744, "learning_rate": 4.8498926929868645e-05, "loss": 0.1153, "step": 5047 }, { "epoch": 8.02543720190779, "grad_norm": 1.6748542772064732, "learning_rate": 4.850199119595415e-05, "loss": 0.1239, "step": 5048 }, { "epoch": 8.027027027027026, "grad_norm": 1.9852019692223268, "learning_rate": 4.850505258442351e-05, "loss": 0.2041, "step": 5049 }, { "epoch": 8.028616852146264, "grad_norm": 1.5200058012211717, "learning_rate": 4.8508111094240514e-05, "loss": 0.1664, "step": 5050 }, { "epoch": 8.0302066772655, "grad_norm": 7.8813923493468945, "learning_rate": 4.851116672437e-05, "loss": 1.1759, "step": 5051 }, { "epoch": 8.031796502384738, "grad_norm": 2.149300783597332, "learning_rate": 4.851421947377773e-05, "loss": 0.1144, "step": 5052 }, { "epoch": 8.033386327503974, "grad_norm": 2.631064849511695, "learning_rate": 4.8517269341430476e-05, "loss": 0.1398, "step": 5053 }, { "epoch": 8.034976152623212, "grad_norm": 2.0362179453743634, "learning_rate": 4.852031632629596e-05, "loss": 0.2771, "step": 5054 }, { "epoch": 8.036565977742448, "grad_norm": 2.659735742624196, "learning_rate": 4.8523360427342875e-05, "loss": 0.1298, "step": 5055 }, { "epoch": 8.038155802861684, "grad_norm": 3.3699110046016423, "learning_rate": 4.8526401643540925e-05, "loss": 0.1455, "step": 5056 }, { "epoch": 8.039745627980922, "grad_norm": 2.4363078973284846, "learning_rate": 4.852943997386075e-05, "loss": 0.1363, "step": 5057 }, { "epoch": 8.041335453100158, "grad_norm": 2.357255205382854, "learning_rate": 4.8532475417274e-05, "loss": 0.1403, "step": 5058 }, { "epoch": 8.042925278219396, "grad_norm": 3.5152844141740114, "learning_rate": 4.853550797275328e-05, "loss": 0.1566, "step": 5059 }, { "epoch": 8.044515103338632, "grad_norm": 2.920398335448487, "learning_rate": 4.8538537639272175e-05, "loss": 0.1136, "step": 5060 }, { "epoch": 8.04610492845787, "grad_norm": 2.4211379317054504, "learning_rate": 4.854156441580526e-05, "loss": 0.2112, "step": 5061 }, { "epoch": 8.047694753577106, "grad_norm": 1.9405182915941486, "learning_rate": 4.854458830132808e-05, "loss": 0.1861, "step": 5062 }, { "epoch": 8.049284578696344, "grad_norm": 2.484882014963822, "learning_rate": 4.854760929481715e-05, "loss": 0.175, "step": 5063 }, { "epoch": 8.05087440381558, "grad_norm": 2.4016260627204793, "learning_rate": 4.855062739524999e-05, "loss": 0.154, "step": 5064 }, { "epoch": 8.052464228934817, "grad_norm": 1.6714071169990865, "learning_rate": 4.855364260160507e-05, "loss": 0.1855, "step": 5065 }, { "epoch": 8.054054054054054, "grad_norm": 3.3494337759288264, "learning_rate": 4.855665491286185e-05, "loss": 0.1848, "step": 5066 }, { "epoch": 8.05564387917329, "grad_norm": 3.2428780862555735, "learning_rate": 4.8559664328000787e-05, "loss": 0.1534, "step": 5067 }, { "epoch": 8.057233704292528, "grad_norm": 3.2648436834465295, "learning_rate": 4.856267084600328e-05, "loss": 0.1613, "step": 5068 }, { "epoch": 8.058823529411764, "grad_norm": 1.399458306156896, "learning_rate": 4.856567446585176e-05, "loss": 0.1258, "step": 5069 }, { "epoch": 8.060413354531002, "grad_norm": 1.8608149007419768, "learning_rate": 4.8568675186529584e-05, "loss": 0.1123, "step": 5070 }, { "epoch": 8.062003179650238, "grad_norm": 5.061552608209933, "learning_rate": 4.8571673007021125e-05, "loss": 0.1536, "step": 5071 }, { "epoch": 8.063593004769475, "grad_norm": 1.6434893161404394, "learning_rate": 4.857466792631173e-05, "loss": 0.1755, "step": 5072 }, { "epoch": 8.065182829888712, "grad_norm": 2.256192376170943, "learning_rate": 4.857765994338774e-05, "loss": 0.2697, "step": 5073 }, { "epoch": 8.066772655007949, "grad_norm": 3.682355376107431, "learning_rate": 4.858064905723645e-05, "loss": 0.1762, "step": 5074 }, { "epoch": 8.068362480127186, "grad_norm": 3.487010063237753, "learning_rate": 4.858363526684615e-05, "loss": 0.1418, "step": 5075 }, { "epoch": 8.069952305246423, "grad_norm": 2.2733395496727504, "learning_rate": 4.858661857120613e-05, "loss": 0.1462, "step": 5076 }, { "epoch": 8.07154213036566, "grad_norm": 1.5253731033270141, "learning_rate": 4.858959896930665e-05, "loss": 0.1606, "step": 5077 }, { "epoch": 8.073131955484897, "grad_norm": 2.573573349019112, "learning_rate": 4.859257646013893e-05, "loss": 0.1668, "step": 5078 }, { "epoch": 8.074721780604133, "grad_norm": 2.3639744649447834, "learning_rate": 4.859555104269522e-05, "loss": 0.1701, "step": 5079 }, { "epoch": 8.07631160572337, "grad_norm": 3.46895466841614, "learning_rate": 4.859852271596873e-05, "loss": 0.179, "step": 5080 }, { "epoch": 8.077901430842607, "grad_norm": 1.7324412639155373, "learning_rate": 4.860149147895365e-05, "loss": 0.1831, "step": 5081 }, { "epoch": 8.079491255961845, "grad_norm": 4.054462334402116, "learning_rate": 4.8604457330645173e-05, "loss": 0.172, "step": 5082 }, { "epoch": 8.08108108108108, "grad_norm": 1.8345069868168538, "learning_rate": 4.860742027003944e-05, "loss": 0.1516, "step": 5083 }, { "epoch": 8.082670906200319, "grad_norm": 2.9366653892170955, "learning_rate": 4.861038029613362e-05, "loss": 0.1255, "step": 5084 }, { "epoch": 8.084260731319555, "grad_norm": 2.1227697436929787, "learning_rate": 4.8613337407925855e-05, "loss": 0.1981, "step": 5085 }, { "epoch": 8.085850556438793, "grad_norm": 1.6663289171036149, "learning_rate": 4.861629160441527e-05, "loss": 0.1369, "step": 5086 }, { "epoch": 8.087440381558029, "grad_norm": 1.7742186171482957, "learning_rate": 4.8619242884601954e-05, "loss": 0.1729, "step": 5087 }, { "epoch": 8.089030206677265, "grad_norm": 2.505975295467478, "learning_rate": 4.862219124748703e-05, "loss": 0.1565, "step": 5088 }, { "epoch": 8.090620031796503, "grad_norm": 1.4969390085601693, "learning_rate": 4.8625136692072574e-05, "loss": 0.1329, "step": 5089 }, { "epoch": 8.092209856915739, "grad_norm": 1.9226946206321054, "learning_rate": 4.8628079217361663e-05, "loss": 0.1535, "step": 5090 }, { "epoch": 8.093799682034977, "grad_norm": 1.7962061657998751, "learning_rate": 4.863101882235837e-05, "loss": 0.1729, "step": 5091 }, { "epoch": 8.095389507154213, "grad_norm": 2.714155443758785, "learning_rate": 4.863395550606772e-05, "loss": 0.1423, "step": 5092 }, { "epoch": 8.09697933227345, "grad_norm": 3.220946056173669, "learning_rate": 4.863688926749577e-05, "loss": 0.1578, "step": 5093 }, { "epoch": 8.098569157392687, "grad_norm": 75.36177943614244, "learning_rate": 4.8639820105649535e-05, "loss": 10.5632, "step": 5094 }, { "epoch": 8.100158982511923, "grad_norm": 2.0920316126414784, "learning_rate": 4.864274801953705e-05, "loss": 0.1298, "step": 5095 }, { "epoch": 8.10174880763116, "grad_norm": 2.349893150327733, "learning_rate": 4.864567300816731e-05, "loss": 0.1386, "step": 5096 }, { "epoch": 8.103338632750397, "grad_norm": 1.6148199090849726, "learning_rate": 4.8648595070550316e-05, "loss": 0.1654, "step": 5097 }, { "epoch": 8.104928457869635, "grad_norm": 1.1855275938211247, "learning_rate": 4.865151420569705e-05, "loss": 0.1748, "step": 5098 }, { "epoch": 8.106518282988871, "grad_norm": 3.757814379494921, "learning_rate": 4.86544304126195e-05, "loss": 0.2652, "step": 5099 }, { "epoch": 8.108108108108109, "grad_norm": 2.0810210084667466, "learning_rate": 4.865734369033062e-05, "loss": 0.1435, "step": 5100 }, { "epoch": 8.109697933227345, "grad_norm": 2.8327125865112275, "learning_rate": 4.866025403784439e-05, "loss": 0.0961, "step": 5101 }, { "epoch": 8.111287758346581, "grad_norm": 2.186951042405222, "learning_rate": 4.866316145417575e-05, "loss": 0.0871, "step": 5102 }, { "epoch": 8.112877583465819, "grad_norm": 3.074450709300555, "learning_rate": 4.866606593834065e-05, "loss": 0.1334, "step": 5103 }, { "epoch": 8.114467408585055, "grad_norm": 2.5700805469993955, "learning_rate": 4.8668967489356034e-05, "loss": 0.1737, "step": 5104 }, { "epoch": 8.116057233704293, "grad_norm": 1.6084743916483626, "learning_rate": 4.867186610623981e-05, "loss": 0.1386, "step": 5105 }, { "epoch": 8.117647058823529, "grad_norm": 3.3816783243757107, "learning_rate": 4.867476178801093e-05, "loss": 0.1834, "step": 5106 }, { "epoch": 8.119236883942767, "grad_norm": 2.253987751977176, "learning_rate": 4.867765453368929e-05, "loss": 0.2127, "step": 5107 }, { "epoch": 8.120826709062003, "grad_norm": 2.025088584386359, "learning_rate": 4.86805443422958e-05, "loss": 0.1535, "step": 5108 }, { "epoch": 8.12241653418124, "grad_norm": 2.175575740413713, "learning_rate": 4.868343121285238e-05, "loss": 0.1717, "step": 5109 }, { "epoch": 8.124006359300477, "grad_norm": 3.068379900181183, "learning_rate": 4.868631514438191e-05, "loss": 0.1306, "step": 5110 }, { "epoch": 8.125596184419713, "grad_norm": 2.248948629379114, "learning_rate": 4.8689196135908304e-05, "loss": 0.2042, "step": 5111 }, { "epoch": 8.127186009538951, "grad_norm": 2.0092609175527305, "learning_rate": 4.869207418645643e-05, "loss": 0.1786, "step": 5112 }, { "epoch": 8.128775834658187, "grad_norm": 3.407706667274365, "learning_rate": 4.869494929505219e-05, "loss": 0.1982, "step": 5113 }, { "epoch": 8.130365659777425, "grad_norm": 3.0147510131993767, "learning_rate": 4.869782146072246e-05, "loss": 0.1854, "step": 5114 }, { "epoch": 8.131955484896661, "grad_norm": 1.5695947214518804, "learning_rate": 4.87006906824951e-05, "loss": 0.145, "step": 5115 }, { "epoch": 8.133545310015899, "grad_norm": 4.833169820705534, "learning_rate": 4.8703556959399e-05, "loss": 0.2007, "step": 5116 }, { "epoch": 8.135135135135135, "grad_norm": 3.9600644260696094, "learning_rate": 4.8706420290464016e-05, "loss": 0.1664, "step": 5117 }, { "epoch": 8.136724960254371, "grad_norm": 3.9825442762113346, "learning_rate": 4.870928067472103e-05, "loss": 0.2354, "step": 5118 }, { "epoch": 8.138314785373609, "grad_norm": 2.6415039032099052, "learning_rate": 4.8712138111201895e-05, "loss": 0.1143, "step": 5119 }, { "epoch": 8.139904610492845, "grad_norm": 2.407983251857994, "learning_rate": 4.871499259893947e-05, "loss": 0.185, "step": 5120 }, { "epoch": 8.141494435612083, "grad_norm": 3.023349539440283, "learning_rate": 4.871784413696762e-05, "loss": 0.1623, "step": 5121 }, { "epoch": 8.14308426073132, "grad_norm": 3.3805192455978337, "learning_rate": 4.872069272432121e-05, "loss": 0.1775, "step": 5122 }, { "epoch": 8.144674085850557, "grad_norm": 2.6870568389920764, "learning_rate": 4.8723538360036077e-05, "loss": 0.1519, "step": 5123 }, { "epoch": 8.146263910969793, "grad_norm": 2.9914578830004386, "learning_rate": 4.872638104314909e-05, "loss": 0.1667, "step": 5124 }, { "epoch": 8.147853736089031, "grad_norm": 1.6159279862744265, "learning_rate": 4.87292207726981e-05, "loss": 0.1182, "step": 5125 }, { "epoch": 8.149443561208267, "grad_norm": 2.2214822011658906, "learning_rate": 4.873205754772196e-05, "loss": 0.1308, "step": 5126 }, { "epoch": 8.151033386327503, "grad_norm": 3.5764076108523035, "learning_rate": 4.8734891367260525e-05, "loss": 0.2098, "step": 5127 }, { "epoch": 8.152623211446741, "grad_norm": 6.02222529069119, "learning_rate": 4.8737722230354655e-05, "loss": 0.2569, "step": 5128 }, { "epoch": 8.154213036565977, "grad_norm": 7.065109609755068, "learning_rate": 4.87405501360462e-05, "loss": 0.1658, "step": 5129 }, { "epoch": 8.155802861685215, "grad_norm": 39.89551919826817, "learning_rate": 4.874337508337801e-05, "loss": 3.2823, "step": 5130 }, { "epoch": 8.157392686804451, "grad_norm": 7.973781235316152, "learning_rate": 4.874619707139396e-05, "loss": 0.1853, "step": 5131 }, { "epoch": 8.15898251192369, "grad_norm": 6.329582590176935, "learning_rate": 4.87490160991389e-05, "loss": 0.1553, "step": 5132 }, { "epoch": 8.160572337042925, "grad_norm": 6.858028108699794, "learning_rate": 4.875183216565868e-05, "loss": 0.1701, "step": 5133 }, { "epoch": 8.162162162162161, "grad_norm": 5.758975850370503, "learning_rate": 4.875464527000018e-05, "loss": 0.1555, "step": 5134 }, { "epoch": 8.1637519872814, "grad_norm": 5.741301440435139, "learning_rate": 4.8757455411211266e-05, "loss": 0.2389, "step": 5135 }, { "epoch": 8.165341812400635, "grad_norm": 7.332041417984684, "learning_rate": 4.876026258834079e-05, "loss": 0.169, "step": 5136 }, { "epoch": 8.166931637519873, "grad_norm": 4.068308988177857, "learning_rate": 4.8763066800438635e-05, "loss": 0.1618, "step": 5137 }, { "epoch": 8.16852146263911, "grad_norm": 5.986395387747771, "learning_rate": 4.876586804655568e-05, "loss": 0.8942, "step": 5138 }, { "epoch": 8.170111287758347, "grad_norm": 6.346919712646774, "learning_rate": 4.8768666325743806e-05, "loss": 0.1728, "step": 5139 }, { "epoch": 8.171701112877583, "grad_norm": 5.21306762001224, "learning_rate": 4.877146163705589e-05, "loss": 0.1367, "step": 5140 }, { "epoch": 8.17329093799682, "grad_norm": 19.44706709532741, "learning_rate": 4.877425397954583e-05, "loss": 2.1659, "step": 5141 }, { "epoch": 8.174880763116057, "grad_norm": 4.163740128320601, "learning_rate": 4.8777043352268495e-05, "loss": 0.1362, "step": 5142 }, { "epoch": 8.176470588235293, "grad_norm": 5.3487394858397375, "learning_rate": 4.877982975427981e-05, "loss": 0.1857, "step": 5143 }, { "epoch": 8.178060413354531, "grad_norm": 8.437431986420682, "learning_rate": 4.878261318463667e-05, "loss": 0.1618, "step": 5144 }, { "epoch": 8.179650238473767, "grad_norm": 4.069851437525765, "learning_rate": 4.878539364239697e-05, "loss": 0.1722, "step": 5145 }, { "epoch": 8.181240063593005, "grad_norm": 3.259277439054276, "learning_rate": 4.878817112661966e-05, "loss": 0.1655, "step": 5146 }, { "epoch": 8.182829888712241, "grad_norm": 4.124615043932462, "learning_rate": 4.879094563636463e-05, "loss": 0.1451, "step": 5147 }, { "epoch": 8.18441971383148, "grad_norm": 7.654023684998529, "learning_rate": 4.879371717069282e-05, "loss": 0.162, "step": 5148 }, { "epoch": 8.186009538950715, "grad_norm": 3.6814200714536116, "learning_rate": 4.879648572866616e-05, "loss": 0.1129, "step": 5149 }, { "epoch": 8.187599364069952, "grad_norm": 2.6310844814604986, "learning_rate": 4.879925130934761e-05, "loss": 0.182, "step": 5150 }, { "epoch": 8.18918918918919, "grad_norm": 4.224656188466838, "learning_rate": 4.880201391180111e-05, "loss": 0.1939, "step": 5151 }, { "epoch": 8.190779014308426, "grad_norm": 8.199392219149974, "learning_rate": 4.880477353509162e-05, "loss": 0.2188, "step": 5152 }, { "epoch": 8.192368839427663, "grad_norm": 3.037337703526505, "learning_rate": 4.8807530178285106e-05, "loss": 0.1805, "step": 5153 }, { "epoch": 8.1939586645469, "grad_norm": 4.958204366187704, "learning_rate": 4.881028384044855e-05, "loss": 0.2067, "step": 5154 }, { "epoch": 8.195548489666137, "grad_norm": 2.94269733282517, "learning_rate": 4.8813034520649924e-05, "loss": 0.2347, "step": 5155 }, { "epoch": 8.197138314785374, "grad_norm": 4.692347114989652, "learning_rate": 4.881578221795823e-05, "loss": 0.2012, "step": 5156 }, { "epoch": 8.19872813990461, "grad_norm": 8.101816048744546, "learning_rate": 4.8818526931443485e-05, "loss": 0.191, "step": 5157 }, { "epoch": 8.200317965023848, "grad_norm": 5.770573230813346, "learning_rate": 4.882126866017668e-05, "loss": 0.1651, "step": 5158 }, { "epoch": 8.201907790143084, "grad_norm": 3.7366074780925067, "learning_rate": 4.8824007403229856e-05, "loss": 0.1512, "step": 5159 }, { "epoch": 8.203497615262322, "grad_norm": 6.734892775608437, "learning_rate": 4.882674315967604e-05, "loss": 0.1976, "step": 5160 }, { "epoch": 8.205087440381558, "grad_norm": 7.637269689687009, "learning_rate": 4.882947592858927e-05, "loss": 0.304, "step": 5161 }, { "epoch": 8.206677265500796, "grad_norm": 97.90059977319034, "learning_rate": 4.8832205709044616e-05, "loss": 4.642, "step": 5162 }, { "epoch": 8.208267090620032, "grad_norm": 7.6042165185828985, "learning_rate": 4.8834932500118145e-05, "loss": 1.398, "step": 5163 }, { "epoch": 8.20985691573927, "grad_norm": 3.558600878836077, "learning_rate": 4.8837656300886934e-05, "loss": 0.1384, "step": 5164 }, { "epoch": 8.211446740858506, "grad_norm": 2.722214540350977, "learning_rate": 4.884037711042907e-05, "loss": 0.1643, "step": 5165 }, { "epoch": 8.213036565977742, "grad_norm": 4.840202520019953, "learning_rate": 4.884309492782367e-05, "loss": 0.1969, "step": 5166 }, { "epoch": 8.21462639109698, "grad_norm": 6.370580431527253, "learning_rate": 4.884580975215084e-05, "loss": 0.207, "step": 5167 }, { "epoch": 8.216216216216216, "grad_norm": 5.000963481905595, "learning_rate": 4.884852158249171e-05, "loss": 0.1368, "step": 5168 }, { "epoch": 8.217806041335454, "grad_norm": 2.4877380733947105, "learning_rate": 4.8851230417928434e-05, "loss": 0.163, "step": 5169 }, { "epoch": 8.21939586645469, "grad_norm": 2.669421997818057, "learning_rate": 4.8853936257544164e-05, "loss": 0.1617, "step": 5170 }, { "epoch": 8.220985691573928, "grad_norm": 11.155057582548519, "learning_rate": 4.885663910042306e-05, "loss": 0.1643, "step": 5171 }, { "epoch": 8.222575516693164, "grad_norm": 5.970463845638016, "learning_rate": 4.885933894565032e-05, "loss": 0.1395, "step": 5172 }, { "epoch": 8.2241653418124, "grad_norm": 5.394771150309761, "learning_rate": 4.886203579231215e-05, "loss": 0.1382, "step": 5173 }, { "epoch": 8.225755166931638, "grad_norm": 4.188564426935115, "learning_rate": 4.886472963949575e-05, "loss": 0.2, "step": 5174 }, { "epoch": 8.227344992050874, "grad_norm": 8.931463146857507, "learning_rate": 4.8867420486289354e-05, "loss": 0.1801, "step": 5175 }, { "epoch": 8.228934817170112, "grad_norm": 8.833761808555167, "learning_rate": 4.887010833178222e-05, "loss": 0.1681, "step": 5176 }, { "epoch": 8.230524642289348, "grad_norm": 3.9647504007996597, "learning_rate": 4.88727931750646e-05, "loss": 0.1298, "step": 5177 }, { "epoch": 8.232114467408586, "grad_norm": 6.044575551671389, "learning_rate": 4.8875475015227757e-05, "loss": 0.2581, "step": 5178 }, { "epoch": 8.233704292527822, "grad_norm": 6.829881060934325, "learning_rate": 4.887815385136402e-05, "loss": 0.1676, "step": 5179 }, { "epoch": 8.235294117647058, "grad_norm": 6.391502807997331, "learning_rate": 4.888082968256666e-05, "loss": 0.1981, "step": 5180 }, { "epoch": 8.236883942766296, "grad_norm": 5.3806315264486635, "learning_rate": 4.888350250793004e-05, "loss": 0.1741, "step": 5181 }, { "epoch": 8.238473767885532, "grad_norm": 4.317727754970113, "learning_rate": 4.8886172326549487e-05, "loss": 0.1752, "step": 5182 }, { "epoch": 8.24006359300477, "grad_norm": 8.512217022928274, "learning_rate": 4.8888839137521374e-05, "loss": 0.2635, "step": 5183 }, { "epoch": 8.241653418124006, "grad_norm": 10.718361276680985, "learning_rate": 4.8891502939943066e-05, "loss": 0.258, "step": 5184 }, { "epoch": 8.243243243243244, "grad_norm": 4.736729915587201, "learning_rate": 4.889416373291298e-05, "loss": 0.2422, "step": 5185 }, { "epoch": 8.24483306836248, "grad_norm": 6.0482217978997435, "learning_rate": 4.889682151553052e-05, "loss": 0.1838, "step": 5186 }, { "epoch": 8.246422893481718, "grad_norm": 4.962444063929784, "learning_rate": 4.889947628689613e-05, "loss": 0.1829, "step": 5187 }, { "epoch": 8.248012718600954, "grad_norm": 8.520293487623997, "learning_rate": 4.8902128046111265e-05, "loss": 0.2805, "step": 5188 }, { "epoch": 8.24960254372019, "grad_norm": 3.8317577871902797, "learning_rate": 4.890477679227841e-05, "loss": 0.1693, "step": 5189 }, { "epoch": 8.251192368839428, "grad_norm": 4.882498688998951, "learning_rate": 4.8907422524501035e-05, "loss": 0.2435, "step": 5190 }, { "epoch": 8.252782193958664, "grad_norm": 3.945022711690706, "learning_rate": 4.891006524188368e-05, "loss": 0.1708, "step": 5191 }, { "epoch": 8.254372019077902, "grad_norm": 16.06430289263403, "learning_rate": 4.8912704943531875e-05, "loss": 1.4076, "step": 5192 }, { "epoch": 8.255961844197138, "grad_norm": 5.873966633531453, "learning_rate": 4.891534162855217e-05, "loss": 0.1456, "step": 5193 }, { "epoch": 8.257551669316376, "grad_norm": 6.866517041196331, "learning_rate": 4.8917975296052143e-05, "loss": 0.1769, "step": 5194 }, { "epoch": 8.259141494435612, "grad_norm": 6.650230467671032, "learning_rate": 4.89206059451404e-05, "loss": 0.1805, "step": 5195 }, { "epoch": 8.260731319554848, "grad_norm": 3.5341219459840283, "learning_rate": 4.892323357492656e-05, "loss": 0.1161, "step": 5196 }, { "epoch": 8.262321144674086, "grad_norm": 14.013213368948511, "learning_rate": 4.892585818452126e-05, "loss": 0.5632, "step": 5197 }, { "epoch": 8.263910969793322, "grad_norm": 6.1331426128261555, "learning_rate": 4.892847977303617e-05, "loss": 0.1288, "step": 5198 }, { "epoch": 8.26550079491256, "grad_norm": 3.9631569790210857, "learning_rate": 4.893109833958397e-05, "loss": 0.1559, "step": 5199 }, { "epoch": 8.267090620031796, "grad_norm": 5.164803340192376, "learning_rate": 4.893371388327838e-05, "loss": 0.1723, "step": 5200 }, { "epoch": 8.268680445151034, "grad_norm": 3.1909787563243364, "learning_rate": 4.893632640323412e-05, "loss": 0.1792, "step": 5201 }, { "epoch": 8.27027027027027, "grad_norm": 4.00174147756515, "learning_rate": 4.893893589856696e-05, "loss": 0.155, "step": 5202 }, { "epoch": 8.271860095389506, "grad_norm": 5.260586429382655, "learning_rate": 4.8941542368393684e-05, "loss": 0.1551, "step": 5203 }, { "epoch": 8.273449920508744, "grad_norm": 3.494809418478414, "learning_rate": 4.894414581183208e-05, "loss": 0.2013, "step": 5204 }, { "epoch": 8.27503974562798, "grad_norm": 5.838286594044415, "learning_rate": 4.8946746228000984e-05, "loss": 0.1372, "step": 5205 }, { "epoch": 8.276629570747218, "grad_norm": 5.196317897579684, "learning_rate": 4.894934361602025e-05, "loss": 0.1405, "step": 5206 }, { "epoch": 8.278219395866454, "grad_norm": 5.856442941594216, "learning_rate": 4.895193797501076e-05, "loss": 0.1788, "step": 5207 }, { "epoch": 8.279809220985692, "grad_norm": 5.5449694105474565, "learning_rate": 4.895452930409441e-05, "loss": 0.2566, "step": 5208 }, { "epoch": 8.281399046104928, "grad_norm": 87.86077550014348, "learning_rate": 4.895711760239414e-05, "loss": 4.4801, "step": 5209 }, { "epoch": 8.282988871224166, "grad_norm": 9.590915178282932, "learning_rate": 4.895970286903388e-05, "loss": 0.1611, "step": 5210 }, { "epoch": 8.284578696343402, "grad_norm": 3.128102702413848, "learning_rate": 4.896228510313864e-05, "loss": 0.1846, "step": 5211 }, { "epoch": 8.286168521462638, "grad_norm": 6.567415604785429, "learning_rate": 4.896486430383441e-05, "loss": 0.1966, "step": 5212 }, { "epoch": 8.287758346581876, "grad_norm": 7.721039401642896, "learning_rate": 4.896744047024823e-05, "loss": 0.2279, "step": 5213 }, { "epoch": 8.289348171701112, "grad_norm": 2.948921880715489, "learning_rate": 4.897001360150816e-05, "loss": 0.2146, "step": 5214 }, { "epoch": 8.29093799682035, "grad_norm": 5.659402234850209, "learning_rate": 4.897258369674329e-05, "loss": 0.27, "step": 5215 }, { "epoch": 8.292527821939586, "grad_norm": 4.559841348103803, "learning_rate": 4.897515075508373e-05, "loss": 0.1553, "step": 5216 }, { "epoch": 8.294117647058824, "grad_norm": 6.706544222161147, "learning_rate": 4.897771477566063e-05, "loss": 0.1886, "step": 5217 }, { "epoch": 8.29570747217806, "grad_norm": 4.548358254777338, "learning_rate": 4.898027575760616e-05, "loss": 0.1829, "step": 5218 }, { "epoch": 8.297297297297296, "grad_norm": 5.1247578637065825, "learning_rate": 4.898283370005352e-05, "loss": 0.2366, "step": 5219 }, { "epoch": 8.298887122416534, "grad_norm": 4.571321321419958, "learning_rate": 4.898538860213694e-05, "loss": 0.1594, "step": 5220 }, { "epoch": 8.30047694753577, "grad_norm": 4.537687256467799, "learning_rate": 4.8987940462991673e-05, "loss": 0.8997, "step": 5221 }, { "epoch": 8.302066772655008, "grad_norm": 2.6697843199493025, "learning_rate": 4.899048928175401e-05, "loss": 0.1543, "step": 5222 }, { "epoch": 8.303656597774244, "grad_norm": 2.756016651302308, "learning_rate": 4.8993035057561275e-05, "loss": 0.1856, "step": 5223 }, { "epoch": 8.305246422893482, "grad_norm": 3.253289504201642, "learning_rate": 4.899557778955181e-05, "loss": 0.1276, "step": 5224 }, { "epoch": 8.306836248012718, "grad_norm": 3.4433672875085204, "learning_rate": 4.899811747686498e-05, "loss": 0.1167, "step": 5225 }, { "epoch": 8.308426073131955, "grad_norm": 2.824304003167212, "learning_rate": 4.9000654118641216e-05, "loss": 0.1651, "step": 5226 }, { "epoch": 8.310015898251192, "grad_norm": 3.9514099250939707, "learning_rate": 4.900318771402194e-05, "loss": 0.1611, "step": 5227 }, { "epoch": 8.311605723370429, "grad_norm": 4.190345561846332, "learning_rate": 4.900571826214962e-05, "loss": 0.1512, "step": 5228 }, { "epoch": 8.313195548489666, "grad_norm": 2.9013314500611087, "learning_rate": 4.9008245762167774e-05, "loss": 0.1627, "step": 5229 }, { "epoch": 8.314785373608903, "grad_norm": 3.351260112020737, "learning_rate": 4.901077021322092e-05, "loss": 0.1923, "step": 5230 }, { "epoch": 8.31637519872814, "grad_norm": 3.9189466201487284, "learning_rate": 4.901329161445462e-05, "loss": 0.1853, "step": 5231 }, { "epoch": 8.317965023847377, "grad_norm": 3.0202256536078487, "learning_rate": 4.901580996501549e-05, "loss": 0.1247, "step": 5232 }, { "epoch": 8.319554848966614, "grad_norm": 2.579485755274775, "learning_rate": 4.901832526405114e-05, "loss": 0.1665, "step": 5233 }, { "epoch": 8.32114467408585, "grad_norm": 2.9397876030323453, "learning_rate": 4.902083751071024e-05, "loss": 0.1811, "step": 5234 }, { "epoch": 8.322734499205087, "grad_norm": 4.4254710250530875, "learning_rate": 4.902334670414249e-05, "loss": 0.1398, "step": 5235 }, { "epoch": 8.324324324324325, "grad_norm": 3.1932599757742643, "learning_rate": 4.902585284349861e-05, "loss": 0.1582, "step": 5236 }, { "epoch": 8.32591414944356, "grad_norm": 4.634639633651532, "learning_rate": 4.9028355927930364e-05, "loss": 0.1292, "step": 5237 }, { "epoch": 8.327503974562799, "grad_norm": 3.533237632194829, "learning_rate": 4.9030855956590556e-05, "loss": 0.1915, "step": 5238 }, { "epoch": 8.329093799682035, "grad_norm": 2.7767241728853045, "learning_rate": 4.903335292863301e-05, "loss": 0.1131, "step": 5239 }, { "epoch": 8.330683624801273, "grad_norm": 4.802771803266855, "learning_rate": 4.90358468432126e-05, "loss": 0.1367, "step": 5240 }, { "epoch": 8.332273449920509, "grad_norm": 5.098031567717378, "learning_rate": 4.9038337699485204e-05, "loss": 0.1937, "step": 5241 }, { "epoch": 8.333863275039745, "grad_norm": 4.737081570876735, "learning_rate": 4.904082549660779e-05, "loss": 0.2119, "step": 5242 }, { "epoch": 8.335453100158983, "grad_norm": 2.9478429925511667, "learning_rate": 4.90433102337383e-05, "loss": 0.196, "step": 5243 }, { "epoch": 8.337042925278219, "grad_norm": 3.8561220831821137, "learning_rate": 4.904579191003576e-05, "loss": 0.2372, "step": 5244 }, { "epoch": 8.338632750397457, "grad_norm": 3.0099693909976035, "learning_rate": 4.90482705246602e-05, "loss": 0.151, "step": 5245 }, { "epoch": 8.340222575516693, "grad_norm": 2.193109020057152, "learning_rate": 4.90507460767727e-05, "loss": 0.1978, "step": 5246 }, { "epoch": 8.34181240063593, "grad_norm": 6.39258534503877, "learning_rate": 4.905321856553539e-05, "loss": 0.1537, "step": 5247 }, { "epoch": 8.343402225755167, "grad_norm": 2.379677022393169, "learning_rate": 4.90556879901114e-05, "loss": 0.2022, "step": 5248 }, { "epoch": 8.344992050874405, "grad_norm": 2.79748801443883, "learning_rate": 4.905815434966493e-05, "loss": 0.1312, "step": 5249 }, { "epoch": 8.34658187599364, "grad_norm": 5.441808451721164, "learning_rate": 4.906061764336121e-05, "loss": 0.2791, "step": 5250 }, { "epoch": 8.348171701112877, "grad_norm": 5.305065736541831, "learning_rate": 4.90630778703665e-05, "loss": 0.1901, "step": 5251 }, { "epoch": 8.349761526232115, "grad_norm": 3.6061518577156493, "learning_rate": 4.906553502984811e-05, "loss": 0.1887, "step": 5252 }, { "epoch": 8.35135135135135, "grad_norm": 2.960120012606837, "learning_rate": 4.9067989120974365e-05, "loss": 0.1636, "step": 5253 }, { "epoch": 8.352941176470589, "grad_norm": 106.46024947762956, "learning_rate": 4.907044014291465e-05, "loss": 10.2831, "step": 5254 }, { "epoch": 8.354531001589825, "grad_norm": 2.3653963338019817, "learning_rate": 4.9072888094839395e-05, "loss": 0.1272, "step": 5255 }, { "epoch": 8.356120826709063, "grad_norm": 2.358956290349129, "learning_rate": 4.9075332975920044e-05, "loss": 0.1568, "step": 5256 }, { "epoch": 8.357710651828299, "grad_norm": 1.9740540950788474, "learning_rate": 4.9077774785329086e-05, "loss": 0.1723, "step": 5257 }, { "epoch": 8.359300476947535, "grad_norm": 2.5672380672896358, "learning_rate": 4.908021352224008e-05, "loss": 0.1723, "step": 5258 }, { "epoch": 8.360890302066773, "grad_norm": 4.492768352970576, "learning_rate": 4.908264918582759e-05, "loss": 0.2534, "step": 5259 }, { "epoch": 8.362480127186009, "grad_norm": 2.822903171506547, "learning_rate": 4.9085081775267224e-05, "loss": 0.2096, "step": 5260 }, { "epoch": 8.364069952305247, "grad_norm": 22.37692774602424, "learning_rate": 4.908751128973565e-05, "loss": 1.7843, "step": 5261 }, { "epoch": 8.365659777424483, "grad_norm": 2.0605519122002973, "learning_rate": 4.908993772841055e-05, "loss": 0.1181, "step": 5262 }, { "epoch": 8.36724960254372, "grad_norm": 2.4204997321029498, "learning_rate": 4.9092361090470686e-05, "loss": 0.1931, "step": 5263 }, { "epoch": 8.368839427662957, "grad_norm": 2.8684335333211433, "learning_rate": 4.9094781375095826e-05, "loss": 0.1759, "step": 5264 }, { "epoch": 8.370429252782195, "grad_norm": 1.8659848105796324, "learning_rate": 4.909719858146679e-05, "loss": 0.1252, "step": 5265 }, { "epoch": 8.372019077901431, "grad_norm": 5.849631051963039, "learning_rate": 4.9099612708765436e-05, "loss": 0.1978, "step": 5266 }, { "epoch": 8.373608903020667, "grad_norm": 3.9283184986611266, "learning_rate": 4.910202375617468e-05, "loss": 0.1411, "step": 5267 }, { "epoch": 8.375198728139905, "grad_norm": 3.497363741466084, "learning_rate": 4.9104431722878465e-05, "loss": 0.1743, "step": 5268 }, { "epoch": 8.376788553259141, "grad_norm": 1.9346962150505627, "learning_rate": 4.910683660806178e-05, "loss": 0.1209, "step": 5269 }, { "epoch": 8.378378378378379, "grad_norm": 3.7233080710675805, "learning_rate": 4.9109238410910656e-05, "loss": 0.226, "step": 5270 }, { "epoch": 8.379968203497615, "grad_norm": 3.449231595526045, "learning_rate": 4.911163713061217e-05, "loss": 0.1833, "step": 5271 }, { "epoch": 8.381558028616853, "grad_norm": 2.4162674074122474, "learning_rate": 4.911403276635446e-05, "loss": 0.188, "step": 5272 }, { "epoch": 8.383147853736089, "grad_norm": 2.731341131133504, "learning_rate": 4.911642531732667e-05, "loss": 0.1493, "step": 5273 }, { "epoch": 8.384737678855325, "grad_norm": 1.9543608791707343, "learning_rate": 4.9118814782719e-05, "loss": 0.1102, "step": 5274 }, { "epoch": 8.386327503974563, "grad_norm": 3.0925080207048676, "learning_rate": 4.9121201161722736e-05, "loss": 0.1456, "step": 5275 }, { "epoch": 8.3879173290938, "grad_norm": 2.8240128829409863, "learning_rate": 4.9123584453530146e-05, "loss": 0.2029, "step": 5276 }, { "epoch": 8.389507154213037, "grad_norm": 2.616092093590299, "learning_rate": 4.912596465733458e-05, "loss": 0.1793, "step": 5277 }, { "epoch": 8.391096979332273, "grad_norm": 3.059673638547035, "learning_rate": 4.912834177233043e-05, "loss": 0.2162, "step": 5278 }, { "epoch": 8.392686804451511, "grad_norm": 3.3904397680155536, "learning_rate": 4.913071579771313e-05, "loss": 0.1711, "step": 5279 }, { "epoch": 8.394276629570747, "grad_norm": 3.024704874453519, "learning_rate": 4.913308673267914e-05, "loss": 0.1548, "step": 5280 }, { "epoch": 8.395866454689983, "grad_norm": 14.650792106555212, "learning_rate": 4.913545457642601e-05, "loss": 0.6741, "step": 5281 }, { "epoch": 8.397456279809221, "grad_norm": 3.6219158360054813, "learning_rate": 4.9137819328152295e-05, "loss": 0.2102, "step": 5282 }, { "epoch": 8.399046104928457, "grad_norm": 2.671013271501224, "learning_rate": 4.914018098705762e-05, "loss": 0.1762, "step": 5283 }, { "epoch": 8.400635930047695, "grad_norm": 1.8485907469218965, "learning_rate": 4.914253955234264e-05, "loss": 0.1846, "step": 5284 }, { "epoch": 8.402225755166931, "grad_norm": 3.5859817072984863, "learning_rate": 4.914489502320907e-05, "loss": 0.2096, "step": 5285 }, { "epoch": 8.40381558028617, "grad_norm": 2.610210003210112, "learning_rate": 4.9147247398859674e-05, "loss": 0.1728, "step": 5286 }, { "epoch": 8.405405405405405, "grad_norm": 2.8423993764542432, "learning_rate": 4.914959667849825e-05, "loss": 0.1531, "step": 5287 }, { "epoch": 8.406995230524643, "grad_norm": 3.704948637808435, "learning_rate": 4.915194286132966e-05, "loss": 0.1463, "step": 5288 }, { "epoch": 8.40858505564388, "grad_norm": 2.430679407619619, "learning_rate": 4.9154285946559797e-05, "loss": 0.1291, "step": 5289 }, { "epoch": 8.410174880763115, "grad_norm": 2.22283240924879, "learning_rate": 4.915662593339561e-05, "loss": 0.1409, "step": 5290 }, { "epoch": 8.411764705882353, "grad_norm": 1.9433183171946475, "learning_rate": 4.915896282104511e-05, "loss": 0.1428, "step": 5291 }, { "epoch": 8.41335453100159, "grad_norm": 2.81413126835575, "learning_rate": 4.916129660871734e-05, "loss": 0.1233, "step": 5292 }, { "epoch": 8.414944356120827, "grad_norm": 2.978531311579743, "learning_rate": 4.9163627295622405e-05, "loss": 0.1682, "step": 5293 }, { "epoch": 8.416534181240063, "grad_norm": 2.1283200649221063, "learning_rate": 4.916595488097143e-05, "loss": 0.1523, "step": 5294 }, { "epoch": 8.418124006359301, "grad_norm": 2.1782933843273535, "learning_rate": 4.916827936397663e-05, "loss": 0.2392, "step": 5295 }, { "epoch": 8.419713831478537, "grad_norm": 3.2287932142068714, "learning_rate": 4.917060074385125e-05, "loss": 0.0957, "step": 5296 }, { "epoch": 8.421303656597773, "grad_norm": 1.8983814782553385, "learning_rate": 4.9172919019809577e-05, "loss": 0.1129, "step": 5297 }, { "epoch": 8.422893481717011, "grad_norm": 2.043028107481882, "learning_rate": 4.917523419106696e-05, "loss": 0.1512, "step": 5298 }, { "epoch": 8.424483306836247, "grad_norm": 1.6292407051523936, "learning_rate": 4.917754625683982e-05, "loss": 0.1166, "step": 5299 }, { "epoch": 8.426073131955485, "grad_norm": 3.2009921697792265, "learning_rate": 4.9179855216345574e-05, "loss": 0.1532, "step": 5300 }, { "epoch": 8.427662957074721, "grad_norm": 2.5046091260967382, "learning_rate": 4.918216106880274e-05, "loss": 0.162, "step": 5301 }, { "epoch": 8.42925278219396, "grad_norm": 1.4477122186233438, "learning_rate": 4.9184463813430874e-05, "loss": 0.1294, "step": 5302 }, { "epoch": 8.430842607313195, "grad_norm": 2.0808443001759307, "learning_rate": 4.918676344945057e-05, "loss": 0.1387, "step": 5303 }, { "epoch": 8.432432432432432, "grad_norm": 3.4722817572399736, "learning_rate": 4.918905997608349e-05, "loss": 0.1169, "step": 5304 }, { "epoch": 8.43402225755167, "grad_norm": 1.7834210535188175, "learning_rate": 4.919135339255235e-05, "loss": 0.1514, "step": 5305 }, { "epoch": 8.435612082670906, "grad_norm": 3.549258419295536, "learning_rate": 4.9193643698080896e-05, "loss": 0.1257, "step": 5306 }, { "epoch": 8.437201907790143, "grad_norm": 2.135335086993168, "learning_rate": 4.919593089189395e-05, "loss": 0.1251, "step": 5307 }, { "epoch": 8.43879173290938, "grad_norm": 2.7324989244674476, "learning_rate": 4.919821497321738e-05, "loss": 0.1355, "step": 5308 }, { "epoch": 8.440381558028617, "grad_norm": 3.108249698821315, "learning_rate": 4.920049594127811e-05, "loss": 0.0847, "step": 5309 }, { "epoch": 8.441971383147854, "grad_norm": 3.8324659117139523, "learning_rate": 4.9202773795304105e-05, "loss": 0.1442, "step": 5310 }, { "epoch": 8.443561208267091, "grad_norm": 2.9114297254820447, "learning_rate": 4.92050485345244e-05, "loss": 0.1605, "step": 5311 }, { "epoch": 8.445151033386328, "grad_norm": 2.2575344297921354, "learning_rate": 4.9207320158169085e-05, "loss": 0.1834, "step": 5312 }, { "epoch": 8.446740858505564, "grad_norm": 4.46271290209314, "learning_rate": 4.920958866546929e-05, "loss": 0.1266, "step": 5313 }, { "epoch": 8.448330683624802, "grad_norm": 2.632949746560885, "learning_rate": 4.921185405565721e-05, "loss": 0.1703, "step": 5314 }, { "epoch": 8.449920508744038, "grad_norm": 1.5292550860867795, "learning_rate": 4.92141163279661e-05, "loss": 0.1575, "step": 5315 }, { "epoch": 8.451510333863276, "grad_norm": 3.6924596501061937, "learning_rate": 4.9216375481630235e-05, "loss": 0.1751, "step": 5316 }, { "epoch": 8.453100158982512, "grad_norm": 2.8556549031307843, "learning_rate": 4.921863151588501e-05, "loss": 0.197, "step": 5317 }, { "epoch": 8.45468998410175, "grad_norm": 3.4569292758391654, "learning_rate": 4.922088442996681e-05, "loss": 0.1694, "step": 5318 }, { "epoch": 8.456279809220986, "grad_norm": 4.75826638250189, "learning_rate": 4.922313422311312e-05, "loss": 0.1698, "step": 5319 }, { "epoch": 8.457869634340222, "grad_norm": 3.0096040373583457, "learning_rate": 4.922538089456246e-05, "loss": 0.1436, "step": 5320 }, { "epoch": 8.45945945945946, "grad_norm": 4.06178144854372, "learning_rate": 4.922762444355443e-05, "loss": 0.2772, "step": 5321 }, { "epoch": 8.461049284578696, "grad_norm": 5.60862366425306, "learning_rate": 4.922986486932964e-05, "loss": 0.1543, "step": 5322 }, { "epoch": 8.462639109697934, "grad_norm": 3.9315973099604724, "learning_rate": 4.923210217112981e-05, "loss": 0.1645, "step": 5323 }, { "epoch": 8.46422893481717, "grad_norm": 5.064737565771541, "learning_rate": 4.923433634819769e-05, "loss": 0.1903, "step": 5324 }, { "epoch": 8.465818759936408, "grad_norm": 2.4867557842249366, "learning_rate": 4.9236567399777086e-05, "loss": 0.1671, "step": 5325 }, { "epoch": 8.467408585055644, "grad_norm": 3.2372233415935203, "learning_rate": 4.923879532511287e-05, "loss": 0.1632, "step": 5326 }, { "epoch": 8.46899841017488, "grad_norm": 3.1878006677521937, "learning_rate": 4.924102012345097e-05, "loss": 0.1411, "step": 5327 }, { "epoch": 8.470588235294118, "grad_norm": 1.7662965155768184, "learning_rate": 4.924324179403838e-05, "loss": 0.1179, "step": 5328 }, { "epoch": 8.472178060413354, "grad_norm": 3.3278216161135963, "learning_rate": 4.9245460336123136e-05, "loss": 0.123, "step": 5329 }, { "epoch": 8.473767885532592, "grad_norm": 4.8320227480986295, "learning_rate": 4.924767574895434e-05, "loss": 0.2014, "step": 5330 }, { "epoch": 8.475357710651828, "grad_norm": 3.1957688492147396, "learning_rate": 4.9249888031782165e-05, "loss": 0.1424, "step": 5331 }, { "epoch": 8.476947535771066, "grad_norm": 2.170324698454584, "learning_rate": 4.925209718385782e-05, "loss": 0.134, "step": 5332 }, { "epoch": 8.478537360890302, "grad_norm": 4.772899770568751, "learning_rate": 4.9254303204433606e-05, "loss": 0.1616, "step": 5333 }, { "epoch": 8.48012718600954, "grad_norm": 2.107926589327192, "learning_rate": 4.925650609276284e-05, "loss": 0.1931, "step": 5334 }, { "epoch": 8.481717011128776, "grad_norm": 5.4083502728477715, "learning_rate": 4.925870584809995e-05, "loss": 0.1252, "step": 5335 }, { "epoch": 8.483306836248012, "grad_norm": 2.563024889302822, "learning_rate": 4.926090246970038e-05, "loss": 0.1586, "step": 5336 }, { "epoch": 8.48489666136725, "grad_norm": 3.0801614223906215, "learning_rate": 4.926309595682066e-05, "loss": 0.1363, "step": 5337 }, { "epoch": 8.486486486486486, "grad_norm": 3.5371272781998018, "learning_rate": 4.9265286308718375e-05, "loss": 0.249, "step": 5338 }, { "epoch": 8.488076311605724, "grad_norm": 3.4570654302383876, "learning_rate": 4.926747352465217e-05, "loss": 0.1191, "step": 5339 }, { "epoch": 8.48966613672496, "grad_norm": 3.8388216274033065, "learning_rate": 4.926965760388175e-05, "loss": 0.1553, "step": 5340 }, { "epoch": 8.491255961844198, "grad_norm": 2.372808241279813, "learning_rate": 4.9271838545667876e-05, "loss": 0.1434, "step": 5341 }, { "epoch": 8.492845786963434, "grad_norm": 1.8989263663178342, "learning_rate": 4.9274016349272396e-05, "loss": 0.1775, "step": 5342 }, { "epoch": 8.49443561208267, "grad_norm": 3.247566368898396, "learning_rate": 4.927619101395818e-05, "loss": 0.1867, "step": 5343 }, { "epoch": 8.496025437201908, "grad_norm": 3.1951971134236548, "learning_rate": 4.92783625389892e-05, "loss": 0.1341, "step": 5344 }, { "epoch": 8.497615262321144, "grad_norm": 2.4354308287264854, "learning_rate": 4.928053092363047e-05, "loss": 0.1759, "step": 5345 }, { "epoch": 8.499205087440382, "grad_norm": 4.419238405218485, "learning_rate": 4.928269616714807e-05, "loss": 0.1728, "step": 5346 }, { "epoch": 8.500794912559618, "grad_norm": 3.7601825096158237, "learning_rate": 4.9284858268809136e-05, "loss": 0.1405, "step": 5347 }, { "epoch": 8.502384737678856, "grad_norm": 3.311300250270439, "learning_rate": 4.9287017227881886e-05, "loss": 0.1432, "step": 5348 }, { "epoch": 8.503974562798092, "grad_norm": 1.6282049387074289, "learning_rate": 4.928917304363558e-05, "loss": 0.1522, "step": 5349 }, { "epoch": 8.505564387917328, "grad_norm": 1.4853614179252492, "learning_rate": 4.929132571534057e-05, "loss": 0.126, "step": 5350 }, { "epoch": 8.507154213036566, "grad_norm": 3.136041163491659, "learning_rate": 4.9293475242268225e-05, "loss": 0.1503, "step": 5351 }, { "epoch": 8.508744038155802, "grad_norm": 1.6306641668795792, "learning_rate": 4.9295621623691034e-05, "loss": 0.1331, "step": 5352 }, { "epoch": 8.51033386327504, "grad_norm": 2.4338045323218376, "learning_rate": 4.9297764858882514e-05, "loss": 0.1244, "step": 5353 }, { "epoch": 8.511923688394276, "grad_norm": 3.102727274921048, "learning_rate": 4.9299904947117266e-05, "loss": 0.1602, "step": 5354 }, { "epoch": 8.513513513513514, "grad_norm": 1.6165132152331934, "learning_rate": 4.9302041887670934e-05, "loss": 0.2094, "step": 5355 }, { "epoch": 8.51510333863275, "grad_norm": 4.329370257804166, "learning_rate": 4.930417567982025e-05, "loss": 0.2149, "step": 5356 }, { "epoch": 8.516693163751988, "grad_norm": 1.778164170316947, "learning_rate": 4.9306306322843e-05, "loss": 0.1159, "step": 5357 }, { "epoch": 8.518282988871224, "grad_norm": 1.516082035650747, "learning_rate": 4.930843381601804e-05, "loss": 0.1533, "step": 5358 }, { "epoch": 8.51987281399046, "grad_norm": 3.4319479040109218, "learning_rate": 4.931055815862528e-05, "loss": 0.1578, "step": 5359 }, { "epoch": 8.521462639109698, "grad_norm": 3.0463594935694016, "learning_rate": 4.931267934994573e-05, "loss": 0.1202, "step": 5360 }, { "epoch": 8.523052464228934, "grad_norm": 5.256050008214594, "learning_rate": 4.931479738926143e-05, "loss": 0.1299, "step": 5361 }, { "epoch": 8.524642289348172, "grad_norm": 4.2834923668719105, "learning_rate": 4.931691227585549e-05, "loss": 0.2076, "step": 5362 }, { "epoch": 8.526232114467408, "grad_norm": 7.108533742635488, "learning_rate": 4.931902400901212e-05, "loss": 0.1595, "step": 5363 }, { "epoch": 8.527821939586646, "grad_norm": 4.41908977854237, "learning_rate": 4.932113258801655e-05, "loss": 0.1426, "step": 5364 }, { "epoch": 8.529411764705882, "grad_norm": 4.699906809807721, "learning_rate": 4.9323238012155126e-05, "loss": 0.229, "step": 5365 }, { "epoch": 8.53100158982512, "grad_norm": 5.885687841813832, "learning_rate": 4.9325340280715226e-05, "loss": 0.2433, "step": 5366 }, { "epoch": 8.532591414944356, "grad_norm": 2.6055410330467166, "learning_rate": 4.93274393929853e-05, "loss": 0.1362, "step": 5367 }, { "epoch": 8.534181240063592, "grad_norm": 5.986865662664183, "learning_rate": 4.932953534825489e-05, "loss": 0.1654, "step": 5368 }, { "epoch": 8.53577106518283, "grad_norm": 2.49943389831294, "learning_rate": 4.9331628145814584e-05, "loss": 0.1833, "step": 5369 }, { "epoch": 8.537360890302066, "grad_norm": 3.197348559398834, "learning_rate": 4.9333717784956056e-05, "loss": 0.158, "step": 5370 }, { "epoch": 8.538950715421304, "grad_norm": 3.802283353641469, "learning_rate": 4.933580426497202e-05, "loss": 0.1982, "step": 5371 }, { "epoch": 8.54054054054054, "grad_norm": 4.478837507858739, "learning_rate": 4.933788758515629e-05, "loss": 0.1532, "step": 5372 }, { "epoch": 8.542130365659778, "grad_norm": 3.4600801385739004, "learning_rate": 4.9339967744803736e-05, "loss": 0.1335, "step": 5373 }, { "epoch": 8.543720190779014, "grad_norm": 3.281763449744702, "learning_rate": 4.93420447432103e-05, "loss": 0.1904, "step": 5374 }, { "epoch": 8.54531001589825, "grad_norm": 3.3804908270241145, "learning_rate": 4.934411857967299e-05, "loss": 0.1527, "step": 5375 }, { "epoch": 8.546899841017488, "grad_norm": 3.5102358313016526, "learning_rate": 4.9346189253489885e-05, "loss": 0.2371, "step": 5376 }, { "epoch": 8.548489666136724, "grad_norm": 2.8051459592225276, "learning_rate": 4.9348256763960146e-05, "loss": 0.1509, "step": 5377 }, { "epoch": 8.550079491255962, "grad_norm": 2.3954601618164895, "learning_rate": 4.935032111038399e-05, "loss": 0.192, "step": 5378 }, { "epoch": 8.551669316375198, "grad_norm": 3.537551720490227, "learning_rate": 4.9352382292062716e-05, "loss": 0.1905, "step": 5379 }, { "epoch": 8.553259141494436, "grad_norm": 2.7329384958045218, "learning_rate": 4.9354440308298674e-05, "loss": 0.177, "step": 5380 }, { "epoch": 8.554848966613672, "grad_norm": 2.734468682161299, "learning_rate": 4.935649515839531e-05, "loss": 0.1263, "step": 5381 }, { "epoch": 8.556438791732909, "grad_norm": 2.6681682091860965, "learning_rate": 4.9358546841657145e-05, "loss": 0.2348, "step": 5382 }, { "epoch": 8.558028616852146, "grad_norm": 2.5217726125561604, "learning_rate": 4.936059535738973e-05, "loss": 0.1502, "step": 5383 }, { "epoch": 8.559618441971383, "grad_norm": 3.6494322035699645, "learning_rate": 4.9362640704899745e-05, "loss": 0.1303, "step": 5384 }, { "epoch": 8.56120826709062, "grad_norm": 1.6539480199151637, "learning_rate": 4.936468288349489e-05, "loss": 0.1424, "step": 5385 }, { "epoch": 8.562798092209857, "grad_norm": 3.1993332730859563, "learning_rate": 4.9366721892483977e-05, "loss": 0.3289, "step": 5386 }, { "epoch": 8.564387917329094, "grad_norm": 2.570409973559718, "learning_rate": 4.936875773117687e-05, "loss": 0.1679, "step": 5387 }, { "epoch": 8.56597774244833, "grad_norm": 2.3152773956564157, "learning_rate": 4.9370790398884516e-05, "loss": 0.1601, "step": 5388 }, { "epoch": 8.567567567567568, "grad_norm": 2.5771613542912806, "learning_rate": 4.937281989491892e-05, "loss": 0.181, "step": 5389 }, { "epoch": 8.569157392686805, "grad_norm": 7.237714807085615, "learning_rate": 4.9374846218593176e-05, "loss": 1.567, "step": 5390 }, { "epoch": 8.57074721780604, "grad_norm": 1.7606775407458635, "learning_rate": 4.937686936922145e-05, "loss": 0.1653, "step": 5391 }, { "epoch": 8.572337042925279, "grad_norm": 1.7126827827193933, "learning_rate": 4.937888934611898e-05, "loss": 0.1818, "step": 5392 }, { "epoch": 8.573926868044515, "grad_norm": 3.268731511640149, "learning_rate": 4.9380906148602074e-05, "loss": 0.2183, "step": 5393 }, { "epoch": 8.575516693163753, "grad_norm": 2.54086252678503, "learning_rate": 4.938291977598811e-05, "loss": 0.1719, "step": 5394 }, { "epoch": 8.577106518282989, "grad_norm": 1.9363334553375875, "learning_rate": 4.938493022759556e-05, "loss": 0.1576, "step": 5395 }, { "epoch": 8.578696343402227, "grad_norm": 3.5559670509670283, "learning_rate": 4.938693750274395e-05, "loss": 0.1122, "step": 5396 }, { "epoch": 8.580286168521463, "grad_norm": 2.199565049027433, "learning_rate": 4.9388941600753906e-05, "loss": 0.158, "step": 5397 }, { "epoch": 8.581875993640699, "grad_norm": 3.274174558580387, "learning_rate": 4.939094252094709e-05, "loss": 0.1429, "step": 5398 }, { "epoch": 8.583465818759937, "grad_norm": 3.6001994584842247, "learning_rate": 4.939294026264628e-05, "loss": 0.175, "step": 5399 }, { "epoch": 8.585055643879173, "grad_norm": 3.727261711661235, "learning_rate": 4.9394934825175306e-05, "loss": 0.3545, "step": 5400 } ], "logging_steps": 1, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1145460372226048.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }