diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,37834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.585055643879173, + "eval_steps": 500, + "global_step": 5400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001589825119236884, + "grad_norm": 3.795235013851999, + "learning_rate": 0.0, + "loss": 0.6801, + "step": 1 + }, + { + "epoch": 0.003179650238473768, + "grad_norm": 6.028367183562747, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.6741, + "step": 2 + }, + { + "epoch": 0.0047694753577106515, + "grad_norm": 3.6732651811382016, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.5347, + "step": 3 + }, + { + "epoch": 0.006359300476947536, + "grad_norm": 2.473833847570131, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.578, + "step": 4 + }, + { + "epoch": 0.00794912559618442, + "grad_norm": 6.859032436531032, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.698, + "step": 5 + }, + { + "epoch": 0.009538950715421303, + "grad_norm": 2.548152524845228, + "learning_rate": 2.5e-07, + "loss": 0.5583, + "step": 6 + }, + { + "epoch": 0.011128775834658187, + "grad_norm": 3.7082336084796736, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.8574, + "step": 7 + }, + { + "epoch": 0.012718600953895072, + "grad_norm": 5.6502460829957615, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.3824, + "step": 8 + }, + { + "epoch": 0.014308426073131956, + "grad_norm": 5.838557402656807, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.5955, + "step": 9 + }, + { + "epoch": 0.01589825119236884, + "grad_norm": 4.405485478830656, + "learning_rate": 4.5e-07, + "loss": 0.6537, + "step": 10 + }, + { + "epoch": 0.017488076311605722, + "grad_norm": 4.352230388503188, + "learning_rate": 5e-07, + "loss": 0.7808, + "step": 11 + }, + { + "epoch": 0.019077901430842606, + "grad_norm": 4.566091731189592, + "learning_rate": 5.5e-07, + "loss": 0.5023, + "step": 12 + }, + { + "epoch": 0.02066772655007949, + "grad_norm": 4.693754198892459, + "learning_rate": 6.000000000000001e-07, + "loss": 0.6848, + "step": 13 + }, + { + "epoch": 0.022257551669316374, + "grad_norm": 2.183453710717913, + "learning_rate": 6.5e-07, + "loss": 0.5395, + "step": 14 + }, + { + "epoch": 0.02384737678855326, + "grad_norm": 3.478226082868467, + "learning_rate": 7.000000000000001e-07, + "loss": 0.569, + "step": 15 + }, + { + "epoch": 0.025437201907790145, + "grad_norm": 3.9488669302935406, + "learning_rate": 7.5e-07, + "loss": 0.744, + "step": 16 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 3.4256019814432914, + "learning_rate": 8.000000000000001e-07, + "loss": 0.4897, + "step": 17 + }, + { + "epoch": 0.028616852146263912, + "grad_norm": 3.4582194594942677, + "learning_rate": 8.5e-07, + "loss": 0.7124, + "step": 18 + }, + { + "epoch": 0.030206677265500796, + "grad_norm": 3.2847288868842215, + "learning_rate": 9e-07, + "loss": 0.5599, + "step": 19 + }, + { + "epoch": 0.03179650238473768, + "grad_norm": 4.5365052729153375, + "learning_rate": 9.500000000000001e-07, + "loss": 0.6273, + "step": 20 + }, + { + "epoch": 0.033386327503974564, + "grad_norm": 2.3454914931107202, + "learning_rate": 1e-06, + "loss": 0.654, + "step": 21 + }, + { + "epoch": 0.034976152623211444, + "grad_norm": 13.017990148845971, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.7806, + "step": 22 + }, + { + "epoch": 0.03656597774244833, + "grad_norm": 3.0735310695295537, + "learning_rate": 1.1e-06, + "loss": 0.4908, + "step": 23 + }, + { + "epoch": 0.03815580286168521, + "grad_norm": 4.4854562533514795, + "learning_rate": 1.15e-06, + "loss": 0.6324, + "step": 24 + }, + { + "epoch": 0.0397456279809221, + "grad_norm": 3.724914090813323, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.6541, + "step": 25 + }, + { + "epoch": 0.04133545310015898, + "grad_norm": 2.864902725530838, + "learning_rate": 1.2499999999999999e-06, + "loss": 0.5107, + "step": 26 + }, + { + "epoch": 0.04292527821939587, + "grad_norm": 3.661800421624066, + "learning_rate": 1.3e-06, + "loss": 0.7261, + "step": 27 + }, + { + "epoch": 0.04451510333863275, + "grad_norm": 2.488482275129523, + "learning_rate": 1.35e-06, + "loss": 0.5254, + "step": 28 + }, + { + "epoch": 0.046104928457869634, + "grad_norm": 3.91195440532004, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.5203, + "step": 29 + }, + { + "epoch": 0.04769475357710652, + "grad_norm": 3.33721026092274, + "learning_rate": 1.45e-06, + "loss": 0.5819, + "step": 30 + }, + { + "epoch": 0.0492845786963434, + "grad_norm": 3.0107615084105688, + "learning_rate": 1.5e-06, + "loss": 0.5539, + "step": 31 + }, + { + "epoch": 0.05087440381558029, + "grad_norm": 3.057329129212838, + "learning_rate": 1.55e-06, + "loss": 0.6725, + "step": 32 + }, + { + "epoch": 0.05246422893481717, + "grad_norm": 3.138243445029786, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.5199, + "step": 33 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 3.1558323687632837, + "learning_rate": 1.65e-06, + "loss": 0.4547, + "step": 34 + }, + { + "epoch": 0.05564387917329094, + "grad_norm": 3.6418286297368483, + "learning_rate": 1.7e-06, + "loss": 0.663, + "step": 35 + }, + { + "epoch": 0.057233704292527825, + "grad_norm": 1.972435978752039, + "learning_rate": 1.75e-06, + "loss": 0.3549, + "step": 36 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 10.456881966513196, + "learning_rate": 1.8e-06, + "loss": 0.6311, + "step": 37 + }, + { + "epoch": 0.06041335453100159, + "grad_norm": 2.5858978442042484, + "learning_rate": 1.85e-06, + "loss": 0.4097, + "step": 38 + }, + { + "epoch": 0.06200317965023847, + "grad_norm": 3.340287874361837, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.636, + "step": 39 + }, + { + "epoch": 0.06359300476947535, + "grad_norm": 3.88574573447017, + "learning_rate": 1.95e-06, + "loss": 0.5211, + "step": 40 + }, + { + "epoch": 0.06518282988871224, + "grad_norm": 3.4345469234953385, + "learning_rate": 2e-06, + "loss": 0.4854, + "step": 41 + }, + { + "epoch": 0.06677265500794913, + "grad_norm": 8.753581492987225, + "learning_rate": 2.05e-06, + "loss": 0.3323, + "step": 42 + }, + { + "epoch": 0.06836248012718601, + "grad_norm": 1.8743545519752527, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.3416, + "step": 43 + }, + { + "epoch": 0.06995230524642289, + "grad_norm": 2.7848000736439076, + "learning_rate": 2.15e-06, + "loss": 0.4476, + "step": 44 + }, + { + "epoch": 0.07154213036565978, + "grad_norm": 4.1767808962119615, + "learning_rate": 2.2e-06, + "loss": 0.6096, + "step": 45 + }, + { + "epoch": 0.07313195548489666, + "grad_norm": 2.732515763981667, + "learning_rate": 2.25e-06, + "loss": 0.6896, + "step": 46 + }, + { + "epoch": 0.07472178060413355, + "grad_norm": 2.4892134977113045, + "learning_rate": 2.3e-06, + "loss": 0.692, + "step": 47 + }, + { + "epoch": 0.07631160572337042, + "grad_norm": 3.5722737710584833, + "learning_rate": 2.3500000000000004e-06, + "loss": 1.2525, + "step": 48 + }, + { + "epoch": 0.07790143084260731, + "grad_norm": 2.58716321667536, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.6116, + "step": 49 + }, + { + "epoch": 0.0794912559618442, + "grad_norm": 2.0753360795189346, + "learning_rate": 2.45e-06, + "loss": 0.3557, + "step": 50 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 2.4516850889788424, + "learning_rate": 2.4999999999999998e-06, + "loss": 0.5785, + "step": 51 + }, + { + "epoch": 0.08267090620031796, + "grad_norm": 3.489855211431725, + "learning_rate": 2.55e-06, + "loss": 0.5671, + "step": 52 + }, + { + "epoch": 0.08426073131955485, + "grad_norm": 2.576899528411801, + "learning_rate": 2.6e-06, + "loss": 0.3073, + "step": 53 + }, + { + "epoch": 0.08585055643879173, + "grad_norm": 2.4708493861674907, + "learning_rate": 2.65e-06, + "loss": 0.3626, + "step": 54 + }, + { + "epoch": 0.08744038155802862, + "grad_norm": 4.632028043287789, + "learning_rate": 2.7e-06, + "loss": 0.5277, + "step": 55 + }, + { + "epoch": 0.0890302066772655, + "grad_norm": 3.7424543322608157, + "learning_rate": 2.75e-06, + "loss": 0.5012, + "step": 56 + }, + { + "epoch": 0.09062003179650238, + "grad_norm": 2.0483528283718373, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.394, + "step": 57 + }, + { + "epoch": 0.09220985691573927, + "grad_norm": 1.9792144750603868, + "learning_rate": 2.8500000000000002e-06, + "loss": 0.3581, + "step": 58 + }, + { + "epoch": 0.09379968203497616, + "grad_norm": 3.8982763422672018, + "learning_rate": 2.9e-06, + "loss": 0.4495, + "step": 59 + }, + { + "epoch": 0.09538950715421304, + "grad_norm": 2.450485499301386, + "learning_rate": 2.9499999999999997e-06, + "loss": 0.3947, + "step": 60 + }, + { + "epoch": 0.09697933227344992, + "grad_norm": 2.9115830161776977, + "learning_rate": 3e-06, + "loss": 0.5786, + "step": 61 + }, + { + "epoch": 0.0985691573926868, + "grad_norm": 1.9944897401992767, + "learning_rate": 3.05e-06, + "loss": 0.3494, + "step": 62 + }, + { + "epoch": 0.10015898251192369, + "grad_norm": 1.8572833024485675, + "learning_rate": 3.1e-06, + "loss": 0.4715, + "step": 63 + }, + { + "epoch": 0.10174880763116058, + "grad_norm": 1.6569837912278225, + "learning_rate": 3.15e-06, + "loss": 0.3315, + "step": 64 + }, + { + "epoch": 0.10333863275039745, + "grad_norm": 2.1197045009140187, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.4436, + "step": 65 + }, + { + "epoch": 0.10492845786963434, + "grad_norm": 1.7210866836040277, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.3754, + "step": 66 + }, + { + "epoch": 0.10651828298887123, + "grad_norm": 2.7349500613318862, + "learning_rate": 3.3e-06, + "loss": 0.4898, + "step": 67 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 1.5380087808606269, + "learning_rate": 3.35e-06, + "loss": 0.3176, + "step": 68 + }, + { + "epoch": 0.10969793322734499, + "grad_norm": 2.348187789302059, + "learning_rate": 3.4e-06, + "loss": 0.4243, + "step": 69 + }, + { + "epoch": 0.11128775834658187, + "grad_norm": 2.3685160258091185, + "learning_rate": 3.4500000000000004e-06, + "loss": 0.4655, + "step": 70 + }, + { + "epoch": 0.11287758346581876, + "grad_norm": 1.762841631604455, + "learning_rate": 3.5e-06, + "loss": 0.3365, + "step": 71 + }, + { + "epoch": 0.11446740858505565, + "grad_norm": 1.9734055325732067, + "learning_rate": 3.55e-06, + "loss": 0.2708, + "step": 72 + }, + { + "epoch": 0.11605723370429252, + "grad_norm": 2.3424700315917164, + "learning_rate": 3.6e-06, + "loss": 0.3157, + "step": 73 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 4.29268894749181, + "learning_rate": 3.65e-06, + "loss": 0.4066, + "step": 74 + }, + { + "epoch": 0.1192368839427663, + "grad_norm": 2.100104227775165, + "learning_rate": 3.7e-06, + "loss": 0.4176, + "step": 75 + }, + { + "epoch": 0.12082670906200318, + "grad_norm": 2.772703843174882, + "learning_rate": 3.75e-06, + "loss": 0.4132, + "step": 76 + }, + { + "epoch": 0.12241653418124006, + "grad_norm": 1.9051357535961146, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.3785, + "step": 77 + }, + { + "epoch": 0.12400635930047695, + "grad_norm": 4.015127912491486, + "learning_rate": 3.8499999999999996e-06, + "loss": 0.4117, + "step": 78 + }, + { + "epoch": 0.12559618441971382, + "grad_norm": 1.533700791686892, + "learning_rate": 3.9e-06, + "loss": 0.2292, + "step": 79 + }, + { + "epoch": 0.1271860095389507, + "grad_norm": 2.0713583752923928, + "learning_rate": 3.9499999999999995e-06, + "loss": 0.2906, + "step": 80 + }, + { + "epoch": 0.1287758346581876, + "grad_norm": 4.150836854647218, + "learning_rate": 4e-06, + "loss": 0.5251, + "step": 81 + }, + { + "epoch": 0.13036565977742448, + "grad_norm": 2.0329564716601447, + "learning_rate": 4.05e-06, + "loss": 0.3036, + "step": 82 + }, + { + "epoch": 0.13195548489666137, + "grad_norm": 1.8454617656131709, + "learning_rate": 4.1e-06, + "loss": 0.4063, + "step": 83 + }, + { + "epoch": 0.13354531001589826, + "grad_norm": 3.498503236315121, + "learning_rate": 4.15e-06, + "loss": 0.4475, + "step": 84 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 2.3428010147959886, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.2985, + "step": 85 + }, + { + "epoch": 0.13672496025437203, + "grad_norm": 3.246869837216726, + "learning_rate": 4.25e-06, + "loss": 0.3859, + "step": 86 + }, + { + "epoch": 0.1383147853736089, + "grad_norm": 6.907869958180637, + "learning_rate": 4.3e-06, + "loss": 0.5418, + "step": 87 + }, + { + "epoch": 0.13990461049284578, + "grad_norm": 2.417911796670967, + "learning_rate": 4.35e-06, + "loss": 0.2827, + "step": 88 + }, + { + "epoch": 0.14149443561208266, + "grad_norm": 3.614348948612056, + "learning_rate": 4.4e-06, + "loss": 0.3201, + "step": 89 + }, + { + "epoch": 0.14308426073131955, + "grad_norm": 1.7733989273861601, + "learning_rate": 4.450000000000001e-06, + "loss": 0.2974, + "step": 90 + }, + { + "epoch": 0.14467408585055644, + "grad_norm": 2.1336073944163494, + "learning_rate": 4.5e-06, + "loss": 0.2637, + "step": 91 + }, + { + "epoch": 0.14626391096979333, + "grad_norm": 3.487206083713662, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.3126, + "step": 92 + }, + { + "epoch": 0.1478537360890302, + "grad_norm": 2.7020670510152143, + "learning_rate": 4.6e-06, + "loss": 0.3527, + "step": 93 + }, + { + "epoch": 0.1494435612082671, + "grad_norm": 3.6301491114326776, + "learning_rate": 4.65e-06, + "loss": 0.3443, + "step": 94 + }, + { + "epoch": 0.151033386327504, + "grad_norm": 9.1371951560765, + "learning_rate": 4.700000000000001e-06, + "loss": 0.4076, + "step": 95 + }, + { + "epoch": 0.15262321144674085, + "grad_norm": 3.7007232647417037, + "learning_rate": 4.75e-06, + "loss": 1.3226, + "step": 96 + }, + { + "epoch": 0.15421303656597773, + "grad_norm": 1.4436397128358596, + "learning_rate": 4.800000000000001e-06, + "loss": 0.2498, + "step": 97 + }, + { + "epoch": 0.15580286168521462, + "grad_norm": 2.121937515278619, + "learning_rate": 4.849999999999999e-06, + "loss": 0.3089, + "step": 98 + }, + { + "epoch": 0.1573926868044515, + "grad_norm": 21.430379825129187, + "learning_rate": 4.9e-06, + "loss": 57.7393, + "step": 99 + }, + { + "epoch": 0.1589825119236884, + "grad_norm": 2.1455680295612796, + "learning_rate": 4.95e-06, + "loss": 0.3399, + "step": 100 + }, + { + "epoch": 0.16057233704292528, + "grad_norm": 2.141545468935083, + "learning_rate": 4.9999999999999996e-06, + "loss": 0.335, + "step": 101 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 1.6598816898454043, + "learning_rate": 5.05e-06, + "loss": 0.2577, + "step": 102 + }, + { + "epoch": 0.16375198728139906, + "grad_norm": 2.224124896314827, + "learning_rate": 5.1e-06, + "loss": 0.3286, + "step": 103 + }, + { + "epoch": 0.16534181240063592, + "grad_norm": 1.5340621081005676, + "learning_rate": 5.15e-06, + "loss": 0.2321, + "step": 104 + }, + { + "epoch": 0.1669316375198728, + "grad_norm": 1.3964454966985334, + "learning_rate": 5.2e-06, + "loss": 0.2918, + "step": 105 + }, + { + "epoch": 0.1685214626391097, + "grad_norm": 2.3488960783500676, + "learning_rate": 5.25e-06, + "loss": 0.3884, + "step": 106 + }, + { + "epoch": 0.17011128775834658, + "grad_norm": 1.4748270908180765, + "learning_rate": 5.3e-06, + "loss": 0.303, + "step": 107 + }, + { + "epoch": 0.17170111287758347, + "grad_norm": 4.359653887439957, + "learning_rate": 5.3500000000000004e-06, + "loss": 0.4091, + "step": 108 + }, + { + "epoch": 0.17329093799682035, + "grad_norm": 1.8363152211718876, + "learning_rate": 5.4e-06, + "loss": 0.3519, + "step": 109 + }, + { + "epoch": 0.17488076311605724, + "grad_norm": 2.21210172687297, + "learning_rate": 5.45e-06, + "loss": 0.3841, + "step": 110 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 1.3700568484943283, + "learning_rate": 5.5e-06, + "loss": 0.2965, + "step": 111 + }, + { + "epoch": 0.178060413354531, + "grad_norm": 1.8351097591888834, + "learning_rate": 5.55e-06, + "loss": 0.3233, + "step": 112 + }, + { + "epoch": 0.17965023847376788, + "grad_norm": 1.873418352257156, + "learning_rate": 5.600000000000001e-06, + "loss": 0.3046, + "step": 113 + }, + { + "epoch": 0.18124006359300476, + "grad_norm": 1.5683857367827994, + "learning_rate": 5.65e-06, + "loss": 0.3191, + "step": 114 + }, + { + "epoch": 0.18282988871224165, + "grad_norm": 1.9229291386905674, + "learning_rate": 5.7000000000000005e-06, + "loss": 0.3689, + "step": 115 + }, + { + "epoch": 0.18441971383147854, + "grad_norm": 2.287181355620036, + "learning_rate": 5.750000000000001e-06, + "loss": 0.488, + "step": 116 + }, + { + "epoch": 0.18600953895071543, + "grad_norm": 2.56292831164503, + "learning_rate": 5.8e-06, + "loss": 0.2834, + "step": 117 + }, + { + "epoch": 0.1875993640699523, + "grad_norm": 1.8255763312328974, + "learning_rate": 5.850000000000001e-06, + "loss": 0.3151, + "step": 118 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 1.69002084197253, + "learning_rate": 5.899999999999999e-06, + "loss": 0.3033, + "step": 119 + }, + { + "epoch": 0.1907790143084261, + "grad_norm": 2.092237912197873, + "learning_rate": 5.95e-06, + "loss": 0.2884, + "step": 120 + }, + { + "epoch": 0.19236883942766295, + "grad_norm": 19.46802098072797, + "learning_rate": 6e-06, + "loss": 42.7243, + "step": 121 + }, + { + "epoch": 0.19395866454689983, + "grad_norm": 34.59585080980778, + "learning_rate": 6.05e-06, + "loss": 21.425, + "step": 122 + }, + { + "epoch": 0.19554848966613672, + "grad_norm": 4.888550520806797, + "learning_rate": 6.1e-06, + "loss": 0.5551, + "step": 123 + }, + { + "epoch": 0.1971383147853736, + "grad_norm": 2.7459983682638973, + "learning_rate": 6.1499999999999996e-06, + "loss": 0.3278, + "step": 124 + }, + { + "epoch": 0.1987281399046105, + "grad_norm": 3.4195016424214812, + "learning_rate": 6.2e-06, + "loss": 0.3194, + "step": 125 + }, + { + "epoch": 0.20031796502384738, + "grad_norm": 2.5311546520857315, + "learning_rate": 6.25e-06, + "loss": 0.3153, + "step": 126 + }, + { + "epoch": 0.20190779014308427, + "grad_norm": 2.405135633031996, + "learning_rate": 6.3e-06, + "loss": 0.3361, + "step": 127 + }, + { + "epoch": 0.20349761526232116, + "grad_norm": 3.450737385082818, + "learning_rate": 6.35e-06, + "loss": 0.2145, + "step": 128 + }, + { + "epoch": 0.20508744038155802, + "grad_norm": 4.421353614121937, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.2826, + "step": 129 + }, + { + "epoch": 0.2066772655007949, + "grad_norm": 2.9407818450750174, + "learning_rate": 6.45e-06, + "loss": 0.3814, + "step": 130 + }, + { + "epoch": 0.2082670906200318, + "grad_norm": 1.928727797916912, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.2184, + "step": 131 + }, + { + "epoch": 0.20985691573926868, + "grad_norm": 1.643137005529309, + "learning_rate": 6.55e-06, + "loss": 0.3137, + "step": 132 + }, + { + "epoch": 0.21144674085850557, + "grad_norm": 5.036813239131316, + "learning_rate": 6.6e-06, + "loss": 0.4503, + "step": 133 + }, + { + "epoch": 0.21303656597774245, + "grad_norm": 2.99279631609939, + "learning_rate": 6.650000000000001e-06, + "loss": 0.4495, + "step": 134 + }, + { + "epoch": 0.21462639109697934, + "grad_norm": 1.9404581617893732, + "learning_rate": 6.7e-06, + "loss": 0.3792, + "step": 135 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 1.5484886426898032, + "learning_rate": 6.750000000000001e-06, + "loss": 0.3009, + "step": 136 + }, + { + "epoch": 0.2178060413354531, + "grad_norm": 9.607796410170126, + "learning_rate": 6.8e-06, + "loss": 22.7958, + "step": 137 + }, + { + "epoch": 0.21939586645468998, + "grad_norm": 1.5829929844883943, + "learning_rate": 6.8500000000000005e-06, + "loss": 0.3367, + "step": 138 + }, + { + "epoch": 0.22098569157392686, + "grad_norm": 2.2091365254610102, + "learning_rate": 6.900000000000001e-06, + "loss": 0.3167, + "step": 139 + }, + { + "epoch": 0.22257551669316375, + "grad_norm": 6.1395924485133735, + "learning_rate": 6.95e-06, + "loss": 0.55, + "step": 140 + }, + { + "epoch": 0.22416534181240064, + "grad_norm": 1.5672514551703651, + "learning_rate": 7e-06, + "loss": 0.3542, + "step": 141 + }, + { + "epoch": 0.22575516693163752, + "grad_norm": 1.690540673835988, + "learning_rate": 7.049999999999999e-06, + "loss": 0.2882, + "step": 142 + }, + { + "epoch": 0.2273449920508744, + "grad_norm": 2.5064738996434257, + "learning_rate": 7.1e-06, + "loss": 1.1543, + "step": 143 + }, + { + "epoch": 0.2289348171701113, + "grad_norm": 2.082095601128218, + "learning_rate": 7.15e-06, + "loss": 0.3266, + "step": 144 + }, + { + "epoch": 0.23052464228934816, + "grad_norm": 1.321504688457293, + "learning_rate": 7.2e-06, + "loss": 0.2466, + "step": 145 + }, + { + "epoch": 0.23211446740858505, + "grad_norm": 1.9986308918387374, + "learning_rate": 7.25e-06, + "loss": 0.3142, + "step": 146 + }, + { + "epoch": 0.23370429252782193, + "grad_norm": 1.557083659456137, + "learning_rate": 7.3e-06, + "loss": 0.3395, + "step": 147 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 1.3208461362234136, + "learning_rate": 7.35e-06, + "loss": 0.2838, + "step": 148 + }, + { + "epoch": 0.2368839427662957, + "grad_norm": 5.695105395530786, + "learning_rate": 7.4e-06, + "loss": 0.7558, + "step": 149 + }, + { + "epoch": 0.2384737678855326, + "grad_norm": 2.254505991514579, + "learning_rate": 7.45e-06, + "loss": 0.2436, + "step": 150 + }, + { + "epoch": 0.24006359300476948, + "grad_norm": 2.421778755954633, + "learning_rate": 7.5e-06, + "loss": 0.2663, + "step": 151 + }, + { + "epoch": 0.24165341812400637, + "grad_norm": 1.3210898288169435, + "learning_rate": 7.55e-06, + "loss": 0.2112, + "step": 152 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 1.492282992694592, + "learning_rate": 7.600000000000001e-06, + "loss": 0.275, + "step": 153 + }, + { + "epoch": 0.24483306836248012, + "grad_norm": 2.963536638326518, + "learning_rate": 7.65e-06, + "loss": 0.278, + "step": 154 + }, + { + "epoch": 0.246422893481717, + "grad_norm": 1.5176140678622898, + "learning_rate": 7.699999999999999e-06, + "loss": 0.2454, + "step": 155 + }, + { + "epoch": 0.2480127186009539, + "grad_norm": 1.3768462513595667, + "learning_rate": 7.75e-06, + "loss": 0.2503, + "step": 156 + }, + { + "epoch": 0.24960254372019078, + "grad_norm": 1.7215217704863925, + "learning_rate": 7.8e-06, + "loss": 0.3222, + "step": 157 + }, + { + "epoch": 0.25119236883942764, + "grad_norm": 1.8496999629756794, + "learning_rate": 7.85e-06, + "loss": 0.352, + "step": 158 + }, + { + "epoch": 0.2527821939586645, + "grad_norm": 1.6785722194284984, + "learning_rate": 7.899999999999999e-06, + "loss": 0.3066, + "step": 159 + }, + { + "epoch": 0.2543720190779014, + "grad_norm": 2.035739688925869, + "learning_rate": 7.95e-06, + "loss": 0.3625, + "step": 160 + }, + { + "epoch": 0.2559618441971383, + "grad_norm": 3.354163500253444, + "learning_rate": 8e-06, + "loss": 0.4428, + "step": 161 + }, + { + "epoch": 0.2575516693163752, + "grad_norm": 1.6169523176618321, + "learning_rate": 8.05e-06, + "loss": 0.2023, + "step": 162 + }, + { + "epoch": 0.2591414944356121, + "grad_norm": 1.2185647428224278, + "learning_rate": 8.1e-06, + "loss": 0.2255, + "step": 163 + }, + { + "epoch": 0.26073131955484896, + "grad_norm": 1.4568855672643708, + "learning_rate": 8.15e-06, + "loss": 0.2997, + "step": 164 + }, + { + "epoch": 0.26232114467408585, + "grad_norm": 2.394779371193609, + "learning_rate": 8.2e-06, + "loss": 0.3152, + "step": 165 + }, + { + "epoch": 0.26391096979332274, + "grad_norm": 1.1579327561012442, + "learning_rate": 8.25e-06, + "loss": 0.2538, + "step": 166 + }, + { + "epoch": 0.2655007949125596, + "grad_norm": 2.7570689862592466, + "learning_rate": 8.3e-06, + "loss": 0.263, + "step": 167 + }, + { + "epoch": 0.2670906200317965, + "grad_norm": 1.5076129219881738, + "learning_rate": 8.35e-06, + "loss": 0.2457, + "step": 168 + }, + { + "epoch": 0.2686804451510334, + "grad_norm": 9.94150493674604, + "learning_rate": 8.400000000000001e-06, + "loss": 28.1962, + "step": 169 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 2.8058614569139664, + "learning_rate": 8.45e-06, + "loss": 0.368, + "step": 170 + }, + { + "epoch": 0.2718600953895072, + "grad_norm": 2.309486088012813, + "learning_rate": 8.5e-06, + "loss": 0.3274, + "step": 171 + }, + { + "epoch": 0.27344992050874406, + "grad_norm": 1.2696733059164054, + "learning_rate": 8.55e-06, + "loss": 0.2359, + "step": 172 + }, + { + "epoch": 0.27503974562798095, + "grad_norm": 3.5596992784562165, + "learning_rate": 8.6e-06, + "loss": 0.324, + "step": 173 + }, + { + "epoch": 0.2766295707472178, + "grad_norm": 2.6517365152645884, + "learning_rate": 8.65e-06, + "loss": 0.3599, + "step": 174 + }, + { + "epoch": 0.27821939586645467, + "grad_norm": 1.398768431561282, + "learning_rate": 8.7e-06, + "loss": 0.2918, + "step": 175 + }, + { + "epoch": 0.27980922098569155, + "grad_norm": 1.340141571239029, + "learning_rate": 8.750000000000001e-06, + "loss": 0.2428, + "step": 176 + }, + { + "epoch": 0.28139904610492844, + "grad_norm": 2.2922241474115523, + "learning_rate": 8.8e-06, + "loss": 0.3045, + "step": 177 + }, + { + "epoch": 0.28298887122416533, + "grad_norm": 3.7452420641326976, + "learning_rate": 8.85e-06, + "loss": 0.3966, + "step": 178 + }, + { + "epoch": 0.2845786963434022, + "grad_norm": 4.291839023163834, + "learning_rate": 8.900000000000001e-06, + "loss": 0.4295, + "step": 179 + }, + { + "epoch": 0.2861685214626391, + "grad_norm": 2.0585872904954705, + "learning_rate": 8.95e-06, + "loss": 0.2536, + "step": 180 + }, + { + "epoch": 0.287758346581876, + "grad_norm": 2.216551894401291, + "learning_rate": 9e-06, + "loss": 0.2114, + "step": 181 + }, + { + "epoch": 0.2893481717011129, + "grad_norm": 1.6394566545921296, + "learning_rate": 9.050000000000001e-06, + "loss": 0.2591, + "step": 182 + }, + { + "epoch": 0.29093799682034976, + "grad_norm": 0.8015361465635777, + "learning_rate": 9.100000000000001e-06, + "loss": 0.2162, + "step": 183 + }, + { + "epoch": 0.29252782193958665, + "grad_norm": 1.589354708478782, + "learning_rate": 9.15e-06, + "loss": 0.2602, + "step": 184 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 1.0050721548621755, + "learning_rate": 9.2e-06, + "loss": 0.2641, + "step": 185 + }, + { + "epoch": 0.2957074721780604, + "grad_norm": 1.2645804731964436, + "learning_rate": 9.250000000000001e-06, + "loss": 0.3015, + "step": 186 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 1.0007065259328647, + "learning_rate": 9.3e-06, + "loss": 0.441, + "step": 187 + }, + { + "epoch": 0.2988871224165342, + "grad_norm": 2.5886877542770286, + "learning_rate": 9.35e-06, + "loss": 0.259, + "step": 188 + }, + { + "epoch": 0.3004769475357711, + "grad_norm": 1.1340307925651814, + "learning_rate": 9.400000000000001e-06, + "loss": 0.2049, + "step": 189 + }, + { + "epoch": 0.302066772655008, + "grad_norm": 3.299680279210414, + "learning_rate": 9.450000000000001e-06, + "loss": 0.4893, + "step": 190 + }, + { + "epoch": 0.3036565977742448, + "grad_norm": 1.8654852143992833, + "learning_rate": 9.5e-06, + "loss": 0.3392, + "step": 191 + }, + { + "epoch": 0.3052464228934817, + "grad_norm": 2.1534063524174036, + "learning_rate": 9.550000000000002e-06, + "loss": 0.4219, + "step": 192 + }, + { + "epoch": 0.3068362480127186, + "grad_norm": 1.206918562135429, + "learning_rate": 9.600000000000001e-06, + "loss": 0.2479, + "step": 193 + }, + { + "epoch": 0.30842607313195547, + "grad_norm": 1.2185136445901936, + "learning_rate": 9.649999999999999e-06, + "loss": 0.2553, + "step": 194 + }, + { + "epoch": 0.31001589825119236, + "grad_norm": 2.8504487907491605, + "learning_rate": 9.699999999999999e-06, + "loss": 0.4003, + "step": 195 + }, + { + "epoch": 0.31160572337042924, + "grad_norm": 1.2910035087675946, + "learning_rate": 9.75e-06, + "loss": 0.2609, + "step": 196 + }, + { + "epoch": 0.31319554848966613, + "grad_norm": 1.5035623115464205, + "learning_rate": 9.8e-06, + "loss": 0.384, + "step": 197 + }, + { + "epoch": 0.314785373608903, + "grad_norm": 1.1597723452634396, + "learning_rate": 9.849999999999999e-06, + "loss": 0.2552, + "step": 198 + }, + { + "epoch": 0.3163751987281399, + "grad_norm": 1.5846388381977266, + "learning_rate": 9.9e-06, + "loss": 0.3061, + "step": 199 + }, + { + "epoch": 0.3179650238473768, + "grad_norm": 4.08839026091095, + "learning_rate": 9.95e-06, + "loss": 0.2834, + "step": 200 + }, + { + "epoch": 0.3195548489666137, + "grad_norm": 12.28117840075908, + "learning_rate": 9.999999999999999e-06, + "loss": 38.6443, + "step": 201 + }, + { + "epoch": 0.32114467408585057, + "grad_norm": 12.093634795709107, + "learning_rate": 1.005e-05, + "loss": 41.111, + "step": 202 + }, + { + "epoch": 0.32273449920508746, + "grad_norm": 1.6921477329907502, + "learning_rate": 1.01e-05, + "loss": 0.2379, + "step": 203 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 1.3309852079860687, + "learning_rate": 1.015e-05, + "loss": 0.2413, + "step": 204 + }, + { + "epoch": 0.32591414944356123, + "grad_norm": 1.7479271514906554, + "learning_rate": 1.02e-05, + "loss": 0.279, + "step": 205 + }, + { + "epoch": 0.3275039745627981, + "grad_norm": 2.674379392952364, + "learning_rate": 1.025e-05, + "loss": 0.3523, + "step": 206 + }, + { + "epoch": 0.32909379968203495, + "grad_norm": 1.3580620916865656, + "learning_rate": 1.03e-05, + "loss": 0.2742, + "step": 207 + }, + { + "epoch": 0.33068362480127184, + "grad_norm": 2.641514512119881, + "learning_rate": 1.035e-05, + "loss": 0.2324, + "step": 208 + }, + { + "epoch": 0.3322734499205087, + "grad_norm": 1.6188361619701048, + "learning_rate": 1.04e-05, + "loss": 0.2613, + "step": 209 + }, + { + "epoch": 0.3338632750397456, + "grad_norm": 1.076352196378147, + "learning_rate": 1.045e-05, + "loss": 0.215, + "step": 210 + }, + { + "epoch": 0.3354531001589825, + "grad_norm": 0.8052076039512558, + "learning_rate": 1.05e-05, + "loss": 0.1973, + "step": 211 + }, + { + "epoch": 0.3370429252782194, + "grad_norm": 5.067719529518202, + "learning_rate": 1.055e-05, + "loss": 0.8736, + "step": 212 + }, + { + "epoch": 0.3386327503974563, + "grad_norm": 1.590474697799204, + "learning_rate": 1.06e-05, + "loss": 0.2879, + "step": 213 + }, + { + "epoch": 0.34022257551669316, + "grad_norm": 0.9272548961019641, + "learning_rate": 1.065e-05, + "loss": 0.3491, + "step": 214 + }, + { + "epoch": 0.34181240063593005, + "grad_norm": 1.5710282915503249, + "learning_rate": 1.0700000000000001e-05, + "loss": 0.3012, + "step": 215 + }, + { + "epoch": 0.34340222575516693, + "grad_norm": 2.071716564644239, + "learning_rate": 1.075e-05, + "loss": 0.291, + "step": 216 + }, + { + "epoch": 0.3449920508744038, + "grad_norm": 3.584964691228448, + "learning_rate": 1.08e-05, + "loss": 0.482, + "step": 217 + }, + { + "epoch": 0.3465818759936407, + "grad_norm": 1.7350891684332745, + "learning_rate": 1.0850000000000001e-05, + "loss": 0.2093, + "step": 218 + }, + { + "epoch": 0.3481717011128776, + "grad_norm": 3.2872999006781134, + "learning_rate": 1.09e-05, + "loss": 0.2299, + "step": 219 + }, + { + "epoch": 0.3497615262321145, + "grad_norm": 0.8435806277760939, + "learning_rate": 1.095e-05, + "loss": 0.1958, + "step": 220 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 1.5890354065982326, + "learning_rate": 1.1e-05, + "loss": 0.3099, + "step": 221 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 1.4852859519718327, + "learning_rate": 1.1050000000000001e-05, + "loss": 0.2526, + "step": 222 + }, + { + "epoch": 0.35453100158982515, + "grad_norm": 2.0393514138937987, + "learning_rate": 1.11e-05, + "loss": 0.3452, + "step": 223 + }, + { + "epoch": 0.356120826709062, + "grad_norm": 1.557939102190742, + "learning_rate": 1.115e-05, + "loss": 0.257, + "step": 224 + }, + { + "epoch": 0.35771065182829886, + "grad_norm": 3.6990638368839748, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.3914, + "step": 225 + }, + { + "epoch": 0.35930047694753575, + "grad_norm": 1.9782098847151628, + "learning_rate": 1.125e-05, + "loss": 0.3182, + "step": 226 + }, + { + "epoch": 0.36089030206677264, + "grad_norm": 1.0700909683464617, + "learning_rate": 1.13e-05, + "loss": 0.2279, + "step": 227 + }, + { + "epoch": 0.3624801271860095, + "grad_norm": 3.339744311250999, + "learning_rate": 1.1350000000000001e-05, + "loss": 0.3349, + "step": 228 + }, + { + "epoch": 0.3640699523052464, + "grad_norm": 1.439250645657682, + "learning_rate": 1.1400000000000001e-05, + "loss": 0.2959, + "step": 229 + }, + { + "epoch": 0.3656597774244833, + "grad_norm": 1.200827090589856, + "learning_rate": 1.145e-05, + "loss": 0.2294, + "step": 230 + }, + { + "epoch": 0.3672496025437202, + "grad_norm": 1.1228652930090741, + "learning_rate": 1.1500000000000002e-05, + "loss": 0.2227, + "step": 231 + }, + { + "epoch": 0.3688394276629571, + "grad_norm": 1.009665522432123, + "learning_rate": 1.1550000000000001e-05, + "loss": 0.2475, + "step": 232 + }, + { + "epoch": 0.37042925278219396, + "grad_norm": 2.45055375131345, + "learning_rate": 1.16e-05, + "loss": 0.3104, + "step": 233 + }, + { + "epoch": 0.37201907790143085, + "grad_norm": 1.334482583147438, + "learning_rate": 1.165e-05, + "loss": 0.2407, + "step": 234 + }, + { + "epoch": 0.37360890302066774, + "grad_norm": 1.1373994317607716, + "learning_rate": 1.1700000000000001e-05, + "loss": 0.2097, + "step": 235 + }, + { + "epoch": 0.3751987281399046, + "grad_norm": 2.2888577314528393, + "learning_rate": 1.1750000000000001e-05, + "loss": 0.2828, + "step": 236 + }, + { + "epoch": 0.3767885532591415, + "grad_norm": 1.772901914915241, + "learning_rate": 1.1799999999999999e-05, + "loss": 0.4346, + "step": 237 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 2.7663283691164637, + "learning_rate": 1.185e-05, + "loss": 0.2589, + "step": 238 + }, + { + "epoch": 0.3799682034976153, + "grad_norm": 3.473546176045371, + "learning_rate": 1.19e-05, + "loss": 0.3963, + "step": 239 + }, + { + "epoch": 0.3815580286168522, + "grad_norm": 2.1803729758718626, + "learning_rate": 1.1949999999999999e-05, + "loss": 0.2586, + "step": 240 + }, + { + "epoch": 0.383147853736089, + "grad_norm": 1.2768054161456535, + "learning_rate": 1.2e-05, + "loss": 0.2981, + "step": 241 + }, + { + "epoch": 0.3847376788553259, + "grad_norm": 1.2511560170472653, + "learning_rate": 1.205e-05, + "loss": 0.2804, + "step": 242 + }, + { + "epoch": 0.3863275039745628, + "grad_norm": 1.292697354269098, + "learning_rate": 1.21e-05, + "loss": 0.1988, + "step": 243 + }, + { + "epoch": 0.38791732909379967, + "grad_norm": 2.6874218269000596, + "learning_rate": 1.215e-05, + "loss": 0.3822, + "step": 244 + }, + { + "epoch": 0.38950715421303655, + "grad_norm": 1.237471635990396, + "learning_rate": 1.22e-05, + "loss": 0.2254, + "step": 245 + }, + { + "epoch": 0.39109697933227344, + "grad_norm": 1.0959191038863583, + "learning_rate": 1.225e-05, + "loss": 0.2208, + "step": 246 + }, + { + "epoch": 0.39268680445151033, + "grad_norm": 1.1912097074904957, + "learning_rate": 1.2299999999999999e-05, + "loss": 0.2608, + "step": 247 + }, + { + "epoch": 0.3942766295707472, + "grad_norm": 0.8813251812994075, + "learning_rate": 1.235e-05, + "loss": 0.2455, + "step": 248 + }, + { + "epoch": 0.3958664546899841, + "grad_norm": 1.5931295945709578, + "learning_rate": 1.24e-05, + "loss": 0.236, + "step": 249 + }, + { + "epoch": 0.397456279809221, + "grad_norm": 3.4614521260692035, + "learning_rate": 1.245e-05, + "loss": 0.3172, + "step": 250 + }, + { + "epoch": 0.3990461049284579, + "grad_norm": 1.3051934108896754, + "learning_rate": 1.25e-05, + "loss": 0.2421, + "step": 251 + }, + { + "epoch": 0.40063593004769477, + "grad_norm": 1.0621933834025052, + "learning_rate": 1.255e-05, + "loss": 0.2352, + "step": 252 + }, + { + "epoch": 0.40222575516693165, + "grad_norm": 1.1621702738797568, + "learning_rate": 1.26e-05, + "loss": 0.2364, + "step": 253 + }, + { + "epoch": 0.40381558028616854, + "grad_norm": 1.1667314450255237, + "learning_rate": 1.2650000000000001e-05, + "loss": 0.201, + "step": 254 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 1.664746076597334, + "learning_rate": 1.27e-05, + "loss": 0.2139, + "step": 255 + }, + { + "epoch": 0.4069952305246423, + "grad_norm": 1.6102209733895332, + "learning_rate": 1.275e-05, + "loss": 0.2064, + "step": 256 + }, + { + "epoch": 0.40858505564387915, + "grad_norm": 1.1731897391111512, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.232, + "step": 257 + }, + { + "epoch": 0.41017488076311603, + "grad_norm": 1.1462826577116, + "learning_rate": 1.285e-05, + "loss": 0.2035, + "step": 258 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 1.6985771415219733, + "learning_rate": 1.29e-05, + "loss": 0.283, + "step": 259 + }, + { + "epoch": 0.4133545310015898, + "grad_norm": 1.2855505146722028, + "learning_rate": 1.295e-05, + "loss": 0.2478, + "step": 260 + }, + { + "epoch": 0.4149443561208267, + "grad_norm": 1.9269782260284674, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.2638, + "step": 261 + }, + { + "epoch": 0.4165341812400636, + "grad_norm": 1.8250106165671103, + "learning_rate": 1.305e-05, + "loss": 0.3383, + "step": 262 + }, + { + "epoch": 0.41812400635930047, + "grad_norm": 1.4613961756046159, + "learning_rate": 1.31e-05, + "loss": 0.2502, + "step": 263 + }, + { + "epoch": 0.41971383147853736, + "grad_norm": 0.7493017588435321, + "learning_rate": 1.3150000000000001e-05, + "loss": 0.1383, + "step": 264 + }, + { + "epoch": 0.42130365659777425, + "grad_norm": 1.8114713425161009, + "learning_rate": 1.32e-05, + "loss": 0.2281, + "step": 265 + }, + { + "epoch": 0.42289348171701113, + "grad_norm": 1.33626984581196, + "learning_rate": 1.325e-05, + "loss": 0.2491, + "step": 266 + }, + { + "epoch": 0.424483306836248, + "grad_norm": 18.412830529366733, + "learning_rate": 1.3300000000000001e-05, + "loss": 40.2664, + "step": 267 + }, + { + "epoch": 0.4260731319554849, + "grad_norm": 1.635465849878839, + "learning_rate": 1.3350000000000001e-05, + "loss": 0.3017, + "step": 268 + }, + { + "epoch": 0.4276629570747218, + "grad_norm": 1.2856599380909932, + "learning_rate": 1.34e-05, + "loss": 0.2055, + "step": 269 + }, + { + "epoch": 0.4292527821939587, + "grad_norm": 1.206983913274647, + "learning_rate": 1.345e-05, + "loss": 0.2019, + "step": 270 + }, + { + "epoch": 0.43084260731319557, + "grad_norm": 0.8173874713790353, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.166, + "step": 271 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 2.5308695241674055, + "learning_rate": 1.355e-05, + "loss": 0.2845, + "step": 272 + }, + { + "epoch": 0.43402225755166934, + "grad_norm": 6.919223253912198, + "learning_rate": 1.36e-05, + "loss": 40.099, + "step": 273 + }, + { + "epoch": 0.4356120826709062, + "grad_norm": 1.2644081127298414, + "learning_rate": 1.3650000000000001e-05, + "loss": 0.2289, + "step": 274 + }, + { + "epoch": 0.43720190779014306, + "grad_norm": 1.69828801543595, + "learning_rate": 1.3700000000000001e-05, + "loss": 0.2224, + "step": 275 + }, + { + "epoch": 0.43879173290937995, + "grad_norm": 1.4628722050279872, + "learning_rate": 1.375e-05, + "loss": 0.239, + "step": 276 + }, + { + "epoch": 0.44038155802861684, + "grad_norm": 1.3452217657175023, + "learning_rate": 1.3800000000000002e-05, + "loss": 0.2152, + "step": 277 + }, + { + "epoch": 0.4419713831478537, + "grad_norm": 1.4384259177353511, + "learning_rate": 1.3850000000000001e-05, + "loss": 0.2345, + "step": 278 + }, + { + "epoch": 0.4435612082670906, + "grad_norm": 1.8228572507259209, + "learning_rate": 1.39e-05, + "loss": 0.3404, + "step": 279 + }, + { + "epoch": 0.4451510333863275, + "grad_norm": 2.8411570719014745, + "learning_rate": 1.395e-05, + "loss": 0.2896, + "step": 280 + }, + { + "epoch": 0.4467408585055644, + "grad_norm": 2.2520469464811033, + "learning_rate": 1.4e-05, + "loss": 0.3273, + "step": 281 + }, + { + "epoch": 0.4483306836248013, + "grad_norm": 2.3028172021890003, + "learning_rate": 1.405e-05, + "loss": 0.2578, + "step": 282 + }, + { + "epoch": 0.44992050874403816, + "grad_norm": 1.5174515611200659, + "learning_rate": 1.4099999999999999e-05, + "loss": 0.2476, + "step": 283 + }, + { + "epoch": 0.45151033386327505, + "grad_norm": 12.27447768285051, + "learning_rate": 1.415e-05, + "loss": 40.0011, + "step": 284 + }, + { + "epoch": 0.45310015898251194, + "grad_norm": 1.2507424712141602, + "learning_rate": 1.42e-05, + "loss": 0.2684, + "step": 285 + }, + { + "epoch": 0.4546899841017488, + "grad_norm": 1.2818280436191316, + "learning_rate": 1.4249999999999999e-05, + "loss": 0.9527, + "step": 286 + }, + { + "epoch": 0.4562798092209857, + "grad_norm": 1.6381485523684878, + "learning_rate": 1.43e-05, + "loss": 0.2345, + "step": 287 + }, + { + "epoch": 0.4578696343402226, + "grad_norm": 3.9510954847417374, + "learning_rate": 1.435e-05, + "loss": 0.2735, + "step": 288 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 1.788610426204811, + "learning_rate": 1.44e-05, + "loss": 0.2634, + "step": 289 + }, + { + "epoch": 0.4610492845786963, + "grad_norm": 1.3846110201600816, + "learning_rate": 1.445e-05, + "loss": 0.2811, + "step": 290 + }, + { + "epoch": 0.4626391096979332, + "grad_norm": 1.4826623126769016, + "learning_rate": 1.45e-05, + "loss": 0.2982, + "step": 291 + }, + { + "epoch": 0.4642289348171701, + "grad_norm": 2.843407437636561, + "learning_rate": 1.455e-05, + "loss": 0.3564, + "step": 292 + }, + { + "epoch": 0.465818759936407, + "grad_norm": 1.1862451460629464, + "learning_rate": 1.46e-05, + "loss": 0.2242, + "step": 293 + }, + { + "epoch": 0.46740858505564387, + "grad_norm": 2.4456443514585056, + "learning_rate": 1.465e-05, + "loss": 0.3022, + "step": 294 + }, + { + "epoch": 0.46899841017488075, + "grad_norm": 1.8245130065155648, + "learning_rate": 1.47e-05, + "loss": 0.2805, + "step": 295 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 1.985608797685802, + "learning_rate": 1.475e-05, + "loss": 0.278, + "step": 296 + }, + { + "epoch": 0.47217806041335453, + "grad_norm": 10.478212933183018, + "learning_rate": 1.48e-05, + "loss": 37.0519, + "step": 297 + }, + { + "epoch": 0.4737678855325914, + "grad_norm": 1.176374921671176, + "learning_rate": 1.485e-05, + "loss": 0.2445, + "step": 298 + }, + { + "epoch": 0.4753577106518283, + "grad_norm": 1.606927869759438, + "learning_rate": 1.49e-05, + "loss": 0.3542, + "step": 299 + }, + { + "epoch": 0.4769475357710652, + "grad_norm": 1.289245434892135, + "learning_rate": 1.4950000000000001e-05, + "loss": 0.2708, + "step": 300 + }, + { + "epoch": 0.4785373608903021, + "grad_norm": 0.9702469573159896, + "learning_rate": 1.5e-05, + "loss": 0.1889, + "step": 301 + }, + { + "epoch": 0.48012718600953896, + "grad_norm": 2.041437140050196, + "learning_rate": 1.5050000000000002e-05, + "loss": 0.2941, + "step": 302 + }, + { + "epoch": 0.48171701112877585, + "grad_norm": 1.55368993673939, + "learning_rate": 1.51e-05, + "loss": 0.348, + "step": 303 + }, + { + "epoch": 0.48330683624801274, + "grad_norm": 1.635769831633422, + "learning_rate": 1.515e-05, + "loss": 0.3151, + "step": 304 + }, + { + "epoch": 0.4848966613672496, + "grad_norm": 4.174578657280089, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.3855, + "step": 305 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 1.5434445887723258, + "learning_rate": 1.525e-05, + "loss": 0.3291, + "step": 306 + }, + { + "epoch": 0.48807631160572335, + "grad_norm": 1.0480145496409916, + "learning_rate": 1.53e-05, + "loss": 0.2576, + "step": 307 + }, + { + "epoch": 0.48966613672496023, + "grad_norm": 1.0149642923234097, + "learning_rate": 1.535e-05, + "loss": 0.2072, + "step": 308 + }, + { + "epoch": 0.4912559618441971, + "grad_norm": 1.0188810509015556, + "learning_rate": 1.5399999999999998e-05, + "loss": 0.2282, + "step": 309 + }, + { + "epoch": 0.492845786963434, + "grad_norm": 0.9601687889794551, + "learning_rate": 1.545e-05, + "loss": 0.2184, + "step": 310 + }, + { + "epoch": 0.4944356120826709, + "grad_norm": 1.2040954041203409, + "learning_rate": 1.55e-05, + "loss": 0.2678, + "step": 311 + }, + { + "epoch": 0.4960254372019078, + "grad_norm": 2.6554655421596114, + "learning_rate": 1.555e-05, + "loss": 0.3025, + "step": 312 + }, + { + "epoch": 0.49761526232114467, + "grad_norm": 1.8483336847424112, + "learning_rate": 1.56e-05, + "loss": 0.3405, + "step": 313 + }, + { + "epoch": 0.49920508744038156, + "grad_norm": 0.8564175783463368, + "learning_rate": 1.5649999999999998e-05, + "loss": 0.2299, + "step": 314 + }, + { + "epoch": 0.5007949125596184, + "grad_norm": 2.227540542263857, + "learning_rate": 1.57e-05, + "loss": 0.2332, + "step": 315 + }, + { + "epoch": 0.5023847376788553, + "grad_norm": 1.3924747956345411, + "learning_rate": 1.575e-05, + "loss": 0.2275, + "step": 316 + }, + { + "epoch": 0.5039745627980922, + "grad_norm": 1.4055398814767508, + "learning_rate": 1.5799999999999998e-05, + "loss": 0.2293, + "step": 317 + }, + { + "epoch": 0.505564387917329, + "grad_norm": 0.9260122709335098, + "learning_rate": 1.585e-05, + "loss": 0.243, + "step": 318 + }, + { + "epoch": 0.5071542130365659, + "grad_norm": 1.7893005705909428, + "learning_rate": 1.59e-05, + "loss": 0.4086, + "step": 319 + }, + { + "epoch": 0.5087440381558028, + "grad_norm": 4.614275825676911, + "learning_rate": 1.5949999999999998e-05, + "loss": 0.3321, + "step": 320 + }, + { + "epoch": 0.5103338632750397, + "grad_norm": 1.0813017953068882, + "learning_rate": 1.6e-05, + "loss": 0.2685, + "step": 321 + }, + { + "epoch": 0.5119236883942766, + "grad_norm": 2.7191285958069114, + "learning_rate": 1.605e-05, + "loss": 0.3528, + "step": 322 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 1.313854062762596, + "learning_rate": 1.61e-05, + "loss": 0.2346, + "step": 323 + }, + { + "epoch": 0.5151033386327504, + "grad_norm": 2.355938867409812, + "learning_rate": 1.615e-05, + "loss": 0.2756, + "step": 324 + }, + { + "epoch": 0.5166931637519873, + "grad_norm": 1.6983055392926085, + "learning_rate": 1.62e-05, + "loss": 0.9052, + "step": 325 + }, + { + "epoch": 0.5182829888712241, + "grad_norm": 2.770798759344969, + "learning_rate": 1.625e-05, + "loss": 0.3467, + "step": 326 + }, + { + "epoch": 0.519872813990461, + "grad_norm": 1.7815817430137466, + "learning_rate": 1.63e-05, + "loss": 0.2842, + "step": 327 + }, + { + "epoch": 0.5214626391096979, + "grad_norm": 1.3687909737800006, + "learning_rate": 1.635e-05, + "loss": 0.2808, + "step": 328 + }, + { + "epoch": 0.5230524642289348, + "grad_norm": 1.998063474228559, + "learning_rate": 1.64e-05, + "loss": 0.7284, + "step": 329 + }, + { + "epoch": 0.5246422893481717, + "grad_norm": 3.3701653810155316, + "learning_rate": 1.645e-05, + "loss": 0.4206, + "step": 330 + }, + { + "epoch": 0.5262321144674086, + "grad_norm": 1.6453175559425075, + "learning_rate": 1.65e-05, + "loss": 0.3053, + "step": 331 + }, + { + "epoch": 0.5278219395866455, + "grad_norm": 1.399331730481635, + "learning_rate": 1.655e-05, + "loss": 0.272, + "step": 332 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 1.3972015370518003, + "learning_rate": 1.66e-05, + "loss": 0.2385, + "step": 333 + }, + { + "epoch": 0.5310015898251192, + "grad_norm": 0.9854314948197552, + "learning_rate": 1.665e-05, + "loss": 0.2144, + "step": 334 + }, + { + "epoch": 0.5325914149443561, + "grad_norm": 1.7976241069817438, + "learning_rate": 1.67e-05, + "loss": 0.2104, + "step": 335 + }, + { + "epoch": 0.534181240063593, + "grad_norm": 1.6843837020774817, + "learning_rate": 1.675e-05, + "loss": 0.3518, + "step": 336 + }, + { + "epoch": 0.5357710651828299, + "grad_norm": 1.4188177658561945, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.2281, + "step": 337 + }, + { + "epoch": 0.5373608903020668, + "grad_norm": 8.172265802117579, + "learning_rate": 1.685e-05, + "loss": 20.671, + "step": 338 + }, + { + "epoch": 0.5389507154213037, + "grad_norm": 2.2787759559989467, + "learning_rate": 1.69e-05, + "loss": 0.8958, + "step": 339 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.9690356902895194, + "learning_rate": 1.695e-05, + "loss": 0.2454, + "step": 340 + }, + { + "epoch": 0.5421303656597775, + "grad_norm": 1.6265479273463743, + "learning_rate": 1.7e-05, + "loss": 0.2856, + "step": 341 + }, + { + "epoch": 0.5437201907790143, + "grad_norm": 0.8409714376322419, + "learning_rate": 1.705e-05, + "loss": 0.187, + "step": 342 + }, + { + "epoch": 0.5453100158982512, + "grad_norm": 2.6584611131450795, + "learning_rate": 1.71e-05, + "loss": 0.2437, + "step": 343 + }, + { + "epoch": 0.5468998410174881, + "grad_norm": 1.3497061068913465, + "learning_rate": 1.715e-05, + "loss": 0.2392, + "step": 344 + }, + { + "epoch": 0.548489666136725, + "grad_norm": 2.111399252812515, + "learning_rate": 1.72e-05, + "loss": 0.2222, + "step": 345 + }, + { + "epoch": 0.5500794912559619, + "grad_norm": 1.0677951045315937, + "learning_rate": 1.725e-05, + "loss": 0.2527, + "step": 346 + }, + { + "epoch": 0.5516693163751988, + "grad_norm": 1.665499951069647, + "learning_rate": 1.73e-05, + "loss": 0.2599, + "step": 347 + }, + { + "epoch": 0.5532591414944356, + "grad_norm": 0.5919484742703548, + "learning_rate": 1.735e-05, + "loss": 0.1868, + "step": 348 + }, + { + "epoch": 0.5548489666136724, + "grad_norm": 1.1436781965715417, + "learning_rate": 1.74e-05, + "loss": 0.2061, + "step": 349 + }, + { + "epoch": 0.5564387917329093, + "grad_norm": 1.1570378907839431, + "learning_rate": 1.745e-05, + "loss": 0.2838, + "step": 350 + }, + { + "epoch": 0.5580286168521462, + "grad_norm": 1.916505258736185, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.2067, + "step": 351 + }, + { + "epoch": 0.5596184419713831, + "grad_norm": 1.8068534886869236, + "learning_rate": 1.755e-05, + "loss": 0.2316, + "step": 352 + }, + { + "epoch": 0.56120826709062, + "grad_norm": 0.9095040989529973, + "learning_rate": 1.76e-05, + "loss": 0.2487, + "step": 353 + }, + { + "epoch": 0.5627980922098569, + "grad_norm": 1.0516198347736088, + "learning_rate": 1.7650000000000002e-05, + "loss": 0.2239, + "step": 354 + }, + { + "epoch": 0.5643879173290938, + "grad_norm": 1.1849062890281172, + "learning_rate": 1.77e-05, + "loss": 0.2688, + "step": 355 + }, + { + "epoch": 0.5659777424483307, + "grad_norm": 1.4908541457604094, + "learning_rate": 1.775e-05, + "loss": 0.2216, + "step": 356 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 1.5677962505337695, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.2254, + "step": 357 + }, + { + "epoch": 0.5691573926868044, + "grad_norm": 1.5461002045936105, + "learning_rate": 1.785e-05, + "loss": 0.265, + "step": 358 + }, + { + "epoch": 0.5707472178060413, + "grad_norm": 1.3077581300287706, + "learning_rate": 1.79e-05, + "loss": 0.2088, + "step": 359 + }, + { + "epoch": 0.5723370429252782, + "grad_norm": 0.6775878895493114, + "learning_rate": 1.7950000000000003e-05, + "loss": 0.2175, + "step": 360 + }, + { + "epoch": 0.5739268680445151, + "grad_norm": 11.67457637130232, + "learning_rate": 1.8e-05, + "loss": 36.2782, + "step": 361 + }, + { + "epoch": 0.575516693163752, + "grad_norm": 0.895049467191542, + "learning_rate": 1.805e-05, + "loss": 0.252, + "step": 362 + }, + { + "epoch": 0.5771065182829889, + "grad_norm": 10.408834793780843, + "learning_rate": 1.8100000000000003e-05, + "loss": 35.4284, + "step": 363 + }, + { + "epoch": 0.5786963434022258, + "grad_norm": 0.9706059689029187, + "learning_rate": 1.815e-05, + "loss": 0.2175, + "step": 364 + }, + { + "epoch": 0.5802861685214626, + "grad_norm": 1.8131627061748241, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.2869, + "step": 365 + }, + { + "epoch": 0.5818759936406995, + "grad_norm": 1.5442152349703204, + "learning_rate": 1.825e-05, + "loss": 0.2494, + "step": 366 + }, + { + "epoch": 0.5834658187599364, + "grad_norm": 0.6149179291206693, + "learning_rate": 1.83e-05, + "loss": 0.1741, + "step": 367 + }, + { + "epoch": 0.5850556438791733, + "grad_norm": 1.6557239810530067, + "learning_rate": 1.8350000000000002e-05, + "loss": 0.3211, + "step": 368 + }, + { + "epoch": 0.5866454689984102, + "grad_norm": 21.93905647991169, + "learning_rate": 1.84e-05, + "loss": 35.3367, + "step": 369 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.3943284447943827, + "learning_rate": 1.845e-05, + "loss": 0.2035, + "step": 370 + }, + { + "epoch": 0.589825119236884, + "grad_norm": 4.420744802527324, + "learning_rate": 1.8500000000000002e-05, + "loss": 0.3177, + "step": 371 + }, + { + "epoch": 0.5914149443561209, + "grad_norm": 0.9558316868817092, + "learning_rate": 1.855e-05, + "loss": 0.1871, + "step": 372 + }, + { + "epoch": 0.5930047694753577, + "grad_norm": 2.6049479406132643, + "learning_rate": 1.86e-05, + "loss": 0.2902, + "step": 373 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 1.2232004510214258, + "learning_rate": 1.8650000000000003e-05, + "loss": 0.2659, + "step": 374 + }, + { + "epoch": 0.5961844197138315, + "grad_norm": 0.972050177325397, + "learning_rate": 1.87e-05, + "loss": 0.1993, + "step": 375 + }, + { + "epoch": 0.5977742448330684, + "grad_norm": 1.1990823518469167, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.2043, + "step": 376 + }, + { + "epoch": 0.5993640699523053, + "grad_norm": 0.7252285151857282, + "learning_rate": 1.8800000000000003e-05, + "loss": 0.1711, + "step": 377 + }, + { + "epoch": 0.6009538950715422, + "grad_norm": 1.2684277566435336, + "learning_rate": 1.885e-05, + "loss": 0.2079, + "step": 378 + }, + { + "epoch": 0.6025437201907791, + "grad_norm": 1.5879408560782209, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.1985, + "step": 379 + }, + { + "epoch": 0.604133545310016, + "grad_norm": 1.5042691942498285, + "learning_rate": 1.8950000000000003e-05, + "loss": 0.3018, + "step": 380 + }, + { + "epoch": 0.6057233704292527, + "grad_norm": 1.375702974387334, + "learning_rate": 1.9e-05, + "loss": 0.2054, + "step": 381 + }, + { + "epoch": 0.6073131955484896, + "grad_norm": 1.0811347000362872, + "learning_rate": 1.9050000000000002e-05, + "loss": 0.2735, + "step": 382 + }, + { + "epoch": 0.6089030206677265, + "grad_norm": 1.164210544600029, + "learning_rate": 1.9100000000000003e-05, + "loss": 0.2215, + "step": 383 + }, + { + "epoch": 0.6104928457869634, + "grad_norm": 2.109651566935429, + "learning_rate": 1.915e-05, + "loss": 0.3174, + "step": 384 + }, + { + "epoch": 0.6120826709062003, + "grad_norm": 1.0846599190397557, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.2567, + "step": 385 + }, + { + "epoch": 0.6136724960254372, + "grad_norm": 1.7543664162270383, + "learning_rate": 1.9250000000000004e-05, + "loss": 0.2594, + "step": 386 + }, + { + "epoch": 0.615262321144674, + "grad_norm": 18.086638383427005, + "learning_rate": 1.9299999999999998e-05, + "loss": 46.7105, + "step": 387 + }, + { + "epoch": 0.6168521462639109, + "grad_norm": 1.4493850373945567, + "learning_rate": 1.935e-05, + "loss": 0.2563, + "step": 388 + }, + { + "epoch": 0.6184419713831478, + "grad_norm": 1.8805379661987693, + "learning_rate": 1.9399999999999997e-05, + "loss": 0.3318, + "step": 389 + }, + { + "epoch": 0.6200317965023847, + "grad_norm": 0.8612682303204704, + "learning_rate": 1.945e-05, + "loss": 0.2152, + "step": 390 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 1.5212939916615305, + "learning_rate": 1.95e-05, + "loss": 0.2475, + "step": 391 + }, + { + "epoch": 0.6232114467408585, + "grad_norm": 0.9110151872445261, + "learning_rate": 1.9549999999999997e-05, + "loss": 0.207, + "step": 392 + }, + { + "epoch": 0.6248012718600954, + "grad_norm": 0.9835167402219618, + "learning_rate": 1.96e-05, + "loss": 0.222, + "step": 393 + }, + { + "epoch": 0.6263910969793323, + "grad_norm": 2.766062005906614, + "learning_rate": 1.965e-05, + "loss": 0.2089, + "step": 394 + }, + { + "epoch": 0.6279809220985691, + "grad_norm": 0.9759666575323273, + "learning_rate": 1.9699999999999998e-05, + "loss": 0.2042, + "step": 395 + }, + { + "epoch": 0.629570747217806, + "grad_norm": 1.5022306101238487, + "learning_rate": 1.975e-05, + "loss": 0.1914, + "step": 396 + }, + { + "epoch": 0.6311605723370429, + "grad_norm": 0.8866005579380816, + "learning_rate": 1.98e-05, + "loss": 0.2047, + "step": 397 + }, + { + "epoch": 0.6327503974562798, + "grad_norm": 1.188039233167719, + "learning_rate": 1.9849999999999998e-05, + "loss": 0.2646, + "step": 398 + }, + { + "epoch": 0.6343402225755167, + "grad_norm": 0.9531727140787785, + "learning_rate": 1.99e-05, + "loss": 0.2756, + "step": 399 + }, + { + "epoch": 0.6359300476947536, + "grad_norm": 1.1162721782283782, + "learning_rate": 1.995e-05, + "loss": 0.2978, + "step": 400 + }, + { + "epoch": 0.6375198728139905, + "grad_norm": 1.1507352615538533, + "learning_rate": 1.9999999999999998e-05, + "loss": 0.24, + "step": 401 + }, + { + "epoch": 0.6391096979332274, + "grad_norm": 1.2799444436146774, + "learning_rate": 2.005e-05, + "loss": 0.2715, + "step": 402 + }, + { + "epoch": 0.6406995230524642, + "grad_norm": 1.5372070881470923, + "learning_rate": 2.01e-05, + "loss": 0.3011, + "step": 403 + }, + { + "epoch": 0.6422893481717011, + "grad_norm": 0.7208075477730832, + "learning_rate": 2.015e-05, + "loss": 0.182, + "step": 404 + }, + { + "epoch": 0.643879173290938, + "grad_norm": 1.8569291428266264, + "learning_rate": 2.02e-05, + "loss": 0.2872, + "step": 405 + }, + { + "epoch": 0.6454689984101749, + "grad_norm": 1.274493500320388, + "learning_rate": 2.025e-05, + "loss": 0.2158, + "step": 406 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 1.3539976910909095, + "learning_rate": 2.03e-05, + "loss": 0.2033, + "step": 407 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.8269817603609642, + "learning_rate": 2.035e-05, + "loss": 0.2141, + "step": 408 + }, + { + "epoch": 0.6502384737678856, + "grad_norm": 1.031075241900295, + "learning_rate": 2.04e-05, + "loss": 0.2482, + "step": 409 + }, + { + "epoch": 0.6518282988871225, + "grad_norm": 0.7367858246783379, + "learning_rate": 2.045e-05, + "loss": 0.1929, + "step": 410 + }, + { + "epoch": 0.6534181240063593, + "grad_norm": 0.6887521282117136, + "learning_rate": 2.05e-05, + "loss": 0.1427, + "step": 411 + }, + { + "epoch": 0.6550079491255962, + "grad_norm": 0.8767579835923949, + "learning_rate": 2.055e-05, + "loss": 0.1997, + "step": 412 + }, + { + "epoch": 0.6565977742448331, + "grad_norm": 1.0091956243235478, + "learning_rate": 2.06e-05, + "loss": 0.2418, + "step": 413 + }, + { + "epoch": 0.6581875993640699, + "grad_norm": 0.7677707870409065, + "learning_rate": 2.065e-05, + "loss": 0.2606, + "step": 414 + }, + { + "epoch": 0.6597774244833068, + "grad_norm": 1.5062432333715783, + "learning_rate": 2.07e-05, + "loss": 0.2374, + "step": 415 + }, + { + "epoch": 0.6613672496025437, + "grad_norm": 2.073150985730754, + "learning_rate": 2.075e-05, + "loss": 0.3889, + "step": 416 + }, + { + "epoch": 0.6629570747217806, + "grad_norm": 1.234404517563459, + "learning_rate": 2.08e-05, + "loss": 0.3124, + "step": 417 + }, + { + "epoch": 0.6645468998410174, + "grad_norm": 1.083511606848498, + "learning_rate": 2.085e-05, + "loss": 0.2402, + "step": 418 + }, + { + "epoch": 0.6661367249602543, + "grad_norm": 1.3217278958533336, + "learning_rate": 2.09e-05, + "loss": 0.2674, + "step": 419 + }, + { + "epoch": 0.6677265500794912, + "grad_norm": 0.9075566625540784, + "learning_rate": 2.095e-05, + "loss": 0.2569, + "step": 420 + }, + { + "epoch": 0.6693163751987281, + "grad_norm": 1.5176093200891982, + "learning_rate": 2.1e-05, + "loss": 0.2696, + "step": 421 + }, + { + "epoch": 0.670906200317965, + "grad_norm": 1.311761609888292, + "learning_rate": 2.105e-05, + "loss": 0.2494, + "step": 422 + }, + { + "epoch": 0.6724960254372019, + "grad_norm": 0.9197246958889534, + "learning_rate": 2.11e-05, + "loss": 0.1942, + "step": 423 + }, + { + "epoch": 0.6740858505564388, + "grad_norm": 0.836455602197686, + "learning_rate": 2.115e-05, + "loss": 0.1871, + "step": 424 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 1.92224761315951, + "learning_rate": 2.12e-05, + "loss": 0.3251, + "step": 425 + }, + { + "epoch": 0.6772655007949125, + "grad_norm": 0.8093418655426429, + "learning_rate": 2.125e-05, + "loss": 0.1974, + "step": 426 + }, + { + "epoch": 0.6788553259141494, + "grad_norm": 1.3113720693662, + "learning_rate": 2.13e-05, + "loss": 0.2304, + "step": 427 + }, + { + "epoch": 0.6804451510333863, + "grad_norm": 1.8710707649842238, + "learning_rate": 2.135e-05, + "loss": 0.3286, + "step": 428 + }, + { + "epoch": 0.6820349761526232, + "grad_norm": 1.3758565195440136, + "learning_rate": 2.1400000000000002e-05, + "loss": 0.1867, + "step": 429 + }, + { + "epoch": 0.6836248012718601, + "grad_norm": 2.4907257806326553, + "learning_rate": 2.145e-05, + "loss": 0.2354, + "step": 430 + }, + { + "epoch": 0.685214626391097, + "grad_norm": 1.2803557904074059, + "learning_rate": 2.15e-05, + "loss": 0.2018, + "step": 431 + }, + { + "epoch": 0.6868044515103339, + "grad_norm": 2.2491952182977513, + "learning_rate": 2.1550000000000002e-05, + "loss": 0.2248, + "step": 432 + }, + { + "epoch": 0.6883942766295708, + "grad_norm": 2.4878816277607294, + "learning_rate": 2.16e-05, + "loss": 0.255, + "step": 433 + }, + { + "epoch": 0.6899841017488076, + "grad_norm": 2.3734190295674593, + "learning_rate": 2.165e-05, + "loss": 0.2783, + "step": 434 + }, + { + "epoch": 0.6915739268680445, + "grad_norm": 2.6037241345595747, + "learning_rate": 2.1700000000000002e-05, + "loss": 0.3913, + "step": 435 + }, + { + "epoch": 0.6931637519872814, + "grad_norm": 1.0438598039487368, + "learning_rate": 2.175e-05, + "loss": 0.1921, + "step": 436 + }, + { + "epoch": 0.6947535771065183, + "grad_norm": 2.4971906438784846, + "learning_rate": 2.18e-05, + "loss": 0.2851, + "step": 437 + }, + { + "epoch": 0.6963434022257552, + "grad_norm": 15.17818027109146, + "learning_rate": 2.1850000000000003e-05, + "loss": 34.6932, + "step": 438 + }, + { + "epoch": 0.6979332273449921, + "grad_norm": 2.0774003977892566, + "learning_rate": 2.19e-05, + "loss": 0.2929, + "step": 439 + }, + { + "epoch": 0.699523052464229, + "grad_norm": 1.5629109099576126, + "learning_rate": 2.195e-05, + "loss": 0.3082, + "step": 440 + }, + { + "epoch": 0.7011128775834659, + "grad_norm": 1.7664022102585626, + "learning_rate": 2.2e-05, + "loss": 0.2367, + "step": 441 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 1.727077385431117, + "learning_rate": 2.205e-05, + "loss": 0.2285, + "step": 442 + }, + { + "epoch": 0.7042925278219396, + "grad_norm": 3.6624333276020735, + "learning_rate": 2.2100000000000002e-05, + "loss": 0.2804, + "step": 443 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.793650077807731, + "learning_rate": 2.215e-05, + "loss": 0.2906, + "step": 444 + }, + { + "epoch": 0.7074721780604134, + "grad_norm": 1.226351259706569, + "learning_rate": 2.22e-05, + "loss": 0.2709, + "step": 445 + }, + { + "epoch": 0.7090620031796503, + "grad_norm": 1.020389687996493, + "learning_rate": 2.2250000000000002e-05, + "loss": 0.2119, + "step": 446 + }, + { + "epoch": 0.7106518282988871, + "grad_norm": 1.4107150443955376, + "learning_rate": 2.23e-05, + "loss": 0.2613, + "step": 447 + }, + { + "epoch": 0.712241653418124, + "grad_norm": 1.919350223023952, + "learning_rate": 2.235e-05, + "loss": 0.267, + "step": 448 + }, + { + "epoch": 0.7138314785373608, + "grad_norm": 1.5806369448612663, + "learning_rate": 2.2400000000000002e-05, + "loss": 0.2502, + "step": 449 + }, + { + "epoch": 0.7154213036565977, + "grad_norm": 1.7989949820303464, + "learning_rate": 2.245e-05, + "loss": 0.2203, + "step": 450 + }, + { + "epoch": 0.7170111287758346, + "grad_norm": 1.700133875939382, + "learning_rate": 2.25e-05, + "loss": 0.1961, + "step": 451 + }, + { + "epoch": 0.7186009538950715, + "grad_norm": 1.560158790551767, + "learning_rate": 2.2550000000000003e-05, + "loss": 0.2562, + "step": 452 + }, + { + "epoch": 0.7201907790143084, + "grad_norm": 1.49759384159501, + "learning_rate": 2.26e-05, + "loss": 0.1815, + "step": 453 + }, + { + "epoch": 0.7217806041335453, + "grad_norm": 1.5272038838472888, + "learning_rate": 2.265e-05, + "loss": 0.3082, + "step": 454 + }, + { + "epoch": 0.7233704292527822, + "grad_norm": 1.4654228423980578, + "learning_rate": 2.2700000000000003e-05, + "loss": 0.2525, + "step": 455 + }, + { + "epoch": 0.724960254372019, + "grad_norm": 2.6419267054939395, + "learning_rate": 2.275e-05, + "loss": 0.2989, + "step": 456 + }, + { + "epoch": 0.7265500794912559, + "grad_norm": 1.8752035664498794, + "learning_rate": 2.2800000000000002e-05, + "loss": 0.2538, + "step": 457 + }, + { + "epoch": 0.7281399046104928, + "grad_norm": 2.264584820258348, + "learning_rate": 2.2850000000000003e-05, + "loss": 0.805, + "step": 458 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 1.5925093357045932, + "learning_rate": 2.29e-05, + "loss": 0.2073, + "step": 459 + }, + { + "epoch": 0.7313195548489666, + "grad_norm": 0.9148269917229194, + "learning_rate": 2.2950000000000002e-05, + "loss": 0.2443, + "step": 460 + }, + { + "epoch": 0.7329093799682035, + "grad_norm": 1.6769542796185932, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.3197, + "step": 461 + }, + { + "epoch": 0.7344992050874404, + "grad_norm": 1.0108113358566797, + "learning_rate": 2.305e-05, + "loss": 0.1797, + "step": 462 + }, + { + "epoch": 0.7360890302066773, + "grad_norm": 0.83390898111003, + "learning_rate": 2.3100000000000002e-05, + "loss": 0.1997, + "step": 463 + }, + { + "epoch": 0.7376788553259142, + "grad_norm": 1.9984904283104243, + "learning_rate": 2.315e-05, + "loss": 0.2419, + "step": 464 + }, + { + "epoch": 0.739268680445151, + "grad_norm": 1.0798815271392466, + "learning_rate": 2.32e-05, + "loss": 0.2532, + "step": 465 + }, + { + "epoch": 0.7408585055643879, + "grad_norm": 0.8270367502099909, + "learning_rate": 2.3250000000000003e-05, + "loss": 0.1752, + "step": 466 + }, + { + "epoch": 0.7424483306836248, + "grad_norm": 0.9491758147452222, + "learning_rate": 2.33e-05, + "loss": 0.2039, + "step": 467 + }, + { + "epoch": 0.7440381558028617, + "grad_norm": 4.52927124181405, + "learning_rate": 2.3350000000000002e-05, + "loss": 0.3417, + "step": 468 + }, + { + "epoch": 0.7456279809220986, + "grad_norm": 0.9449924613372288, + "learning_rate": 2.3400000000000003e-05, + "loss": 0.2111, + "step": 469 + }, + { + "epoch": 0.7472178060413355, + "grad_norm": 1.25915235032024, + "learning_rate": 2.345e-05, + "loss": 0.3406, + "step": 470 + }, + { + "epoch": 0.7488076311605724, + "grad_norm": 1.9106270981608868, + "learning_rate": 2.3500000000000002e-05, + "loss": 0.3402, + "step": 471 + }, + { + "epoch": 0.7503974562798092, + "grad_norm": 2.0228229077786257, + "learning_rate": 2.3550000000000003e-05, + "loss": 0.2905, + "step": 472 + }, + { + "epoch": 0.7519872813990461, + "grad_norm": 1.5095326494047443, + "learning_rate": 2.3599999999999998e-05, + "loss": 0.2425, + "step": 473 + }, + { + "epoch": 0.753577106518283, + "grad_norm": 1.320784148146851, + "learning_rate": 2.365e-05, + "loss": 0.234, + "step": 474 + }, + { + "epoch": 0.7551669316375199, + "grad_norm": 1.2855968910201843, + "learning_rate": 2.37e-05, + "loss": 0.2842, + "step": 475 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 1.4494956264699426, + "learning_rate": 2.3749999999999998e-05, + "loss": 0.2664, + "step": 476 + }, + { + "epoch": 0.7583465818759937, + "grad_norm": 0.8222991382675661, + "learning_rate": 2.38e-05, + "loss": 0.1855, + "step": 477 + }, + { + "epoch": 0.7599364069952306, + "grad_norm": 0.9844096899563934, + "learning_rate": 2.385e-05, + "loss": 0.2505, + "step": 478 + }, + { + "epoch": 0.7615262321144675, + "grad_norm": 1.329242141767537, + "learning_rate": 2.3899999999999998e-05, + "loss": 0.3044, + "step": 479 + }, + { + "epoch": 0.7631160572337043, + "grad_norm": 1.9242624036774687, + "learning_rate": 2.395e-05, + "loss": 0.7914, + "step": 480 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 1.6340339463147588, + "learning_rate": 2.4e-05, + "loss": 0.2829, + "step": 481 + }, + { + "epoch": 0.766295707472178, + "grad_norm": 1.142024448703623, + "learning_rate": 2.405e-05, + "loss": 0.2375, + "step": 482 + }, + { + "epoch": 0.7678855325914149, + "grad_norm": 0.9759993785459193, + "learning_rate": 2.41e-05, + "loss": 0.1668, + "step": 483 + }, + { + "epoch": 0.7694753577106518, + "grad_norm": 3.1130990271113355, + "learning_rate": 2.415e-05, + "loss": 0.2815, + "step": 484 + }, + { + "epoch": 0.7710651828298887, + "grad_norm": 3.6609922525153094, + "learning_rate": 2.42e-05, + "loss": 0.2728, + "step": 485 + }, + { + "epoch": 0.7726550079491256, + "grad_norm": 0.9154339253360586, + "learning_rate": 2.425e-05, + "loss": 0.2652, + "step": 486 + }, + { + "epoch": 0.7742448330683624, + "grad_norm": 0.8585997435988617, + "learning_rate": 2.43e-05, + "loss": 0.2496, + "step": 487 + }, + { + "epoch": 0.7758346581875993, + "grad_norm": 1.3528869649005333, + "learning_rate": 2.435e-05, + "loss": 0.2498, + "step": 488 + }, + { + "epoch": 0.7774244833068362, + "grad_norm": 3.9479601974577845, + "learning_rate": 2.44e-05, + "loss": 0.2524, + "step": 489 + }, + { + "epoch": 0.7790143084260731, + "grad_norm": 1.6569264345352395, + "learning_rate": 2.4449999999999998e-05, + "loss": 0.3827, + "step": 490 + }, + { + "epoch": 0.78060413354531, + "grad_norm": 1.0531724378223204, + "learning_rate": 2.45e-05, + "loss": 0.2136, + "step": 491 + }, + { + "epoch": 0.7821939586645469, + "grad_norm": 18.663037057927276, + "learning_rate": 2.455e-05, + "loss": 34.2023, + "step": 492 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 1.920925270347047, + "learning_rate": 2.4599999999999998e-05, + "loss": 0.2954, + "step": 493 + }, + { + "epoch": 0.7853736089030207, + "grad_norm": 1.8949845627030673, + "learning_rate": 2.465e-05, + "loss": 0.2053, + "step": 494 + }, + { + "epoch": 0.7869634340222575, + "grad_norm": 2.195073450963552, + "learning_rate": 2.47e-05, + "loss": 0.2328, + "step": 495 + }, + { + "epoch": 0.7885532591414944, + "grad_norm": 1.6320673046451244, + "learning_rate": 2.475e-05, + "loss": 0.232, + "step": 496 + }, + { + "epoch": 0.7901430842607313, + "grad_norm": 0.9253733786099647, + "learning_rate": 2.48e-05, + "loss": 0.2053, + "step": 497 + }, + { + "epoch": 0.7917329093799682, + "grad_norm": 1.2368538293556715, + "learning_rate": 2.485e-05, + "loss": 0.2329, + "step": 498 + }, + { + "epoch": 0.7933227344992051, + "grad_norm": 1.1677231112599487, + "learning_rate": 2.49e-05, + "loss": 0.2244, + "step": 499 + }, + { + "epoch": 0.794912559618442, + "grad_norm": 1.7771671200799681, + "learning_rate": 2.495e-05, + "loss": 0.246, + "step": 500 + }, + { + "epoch": 0.7965023847376789, + "grad_norm": 1.5357997030076218, + "learning_rate": 2.5e-05, + "loss": 0.253, + "step": 501 + }, + { + "epoch": 0.7980922098569158, + "grad_norm": 2.0534501382468378, + "learning_rate": 2.505e-05, + "loss": 0.198, + "step": 502 + }, + { + "epoch": 0.7996820349761526, + "grad_norm": 1.2319068579530479, + "learning_rate": 2.51e-05, + "loss": 0.2458, + "step": 503 + }, + { + "epoch": 0.8012718600953895, + "grad_norm": 0.8732640370197168, + "learning_rate": 2.515e-05, + "loss": 0.2118, + "step": 504 + }, + { + "epoch": 0.8028616852146264, + "grad_norm": 4.41176534113802, + "learning_rate": 2.52e-05, + "loss": 0.4236, + "step": 505 + }, + { + "epoch": 0.8044515103338633, + "grad_norm": 0.7180629972653341, + "learning_rate": 2.525e-05, + "loss": 0.1931, + "step": 506 + }, + { + "epoch": 0.8060413354531002, + "grad_norm": 0.6347767273532949, + "learning_rate": 2.5300000000000002e-05, + "loss": 0.1859, + "step": 507 + }, + { + "epoch": 0.8076311605723371, + "grad_norm": 0.9791426745533244, + "learning_rate": 2.535e-05, + "loss": 0.2799, + "step": 508 + }, + { + "epoch": 0.809220985691574, + "grad_norm": 31.58151475070209, + "learning_rate": 2.54e-05, + "loss": 33.3643, + "step": 509 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.935794606298465, + "learning_rate": 2.5450000000000002e-05, + "loss": 0.1858, + "step": 510 + }, + { + "epoch": 0.8124006359300477, + "grad_norm": 4.784009392534647, + "learning_rate": 2.55e-05, + "loss": 0.3739, + "step": 511 + }, + { + "epoch": 0.8139904610492846, + "grad_norm": 6.019860716766144, + "learning_rate": 2.555e-05, + "loss": 0.3004, + "step": 512 + }, + { + "epoch": 0.8155802861685215, + "grad_norm": 1.1532182944468512, + "learning_rate": 2.5600000000000002e-05, + "loss": 0.2125, + "step": 513 + }, + { + "epoch": 0.8171701112877583, + "grad_norm": 0.9353130479866597, + "learning_rate": 2.565e-05, + "loss": 0.2075, + "step": 514 + }, + { + "epoch": 0.8187599364069952, + "grad_norm": 2.2324430700755276, + "learning_rate": 2.57e-05, + "loss": 0.2162, + "step": 515 + }, + { + "epoch": 0.8203497615262321, + "grad_norm": 0.9308155590992341, + "learning_rate": 2.575e-05, + "loss": 0.2124, + "step": 516 + }, + { + "epoch": 0.821939586645469, + "grad_norm": 0.9740386688178756, + "learning_rate": 2.58e-05, + "loss": 0.2846, + "step": 517 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 1.1504004035932034, + "learning_rate": 2.585e-05, + "loss": 0.2811, + "step": 518 + }, + { + "epoch": 0.8251192368839427, + "grad_norm": 1.8083635934870836, + "learning_rate": 2.59e-05, + "loss": 0.2674, + "step": 519 + }, + { + "epoch": 0.8267090620031796, + "grad_norm": 2.955932180890619, + "learning_rate": 2.595e-05, + "loss": 0.33, + "step": 520 + }, + { + "epoch": 0.8282988871224165, + "grad_norm": 2.5779379390702437, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.3311, + "step": 521 + }, + { + "epoch": 0.8298887122416534, + "grad_norm": 1.1818559752883053, + "learning_rate": 2.605e-05, + "loss": 0.3906, + "step": 522 + }, + { + "epoch": 0.8314785373608903, + "grad_norm": 1.00316593008836, + "learning_rate": 2.61e-05, + "loss": 0.241, + "step": 523 + }, + { + "epoch": 0.8330683624801272, + "grad_norm": 1.8935949064822435, + "learning_rate": 2.6150000000000002e-05, + "loss": 0.2752, + "step": 524 + }, + { + "epoch": 0.834658187599364, + "grad_norm": 5.06112572064763, + "learning_rate": 2.62e-05, + "loss": 0.4075, + "step": 525 + }, + { + "epoch": 0.8362480127186009, + "grad_norm": 1.131170852932, + "learning_rate": 2.625e-05, + "loss": 0.2779, + "step": 526 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 1.5262419720788403, + "learning_rate": 2.6300000000000002e-05, + "loss": 0.2798, + "step": 527 + }, + { + "epoch": 0.8394276629570747, + "grad_norm": 1.2109422840577981, + "learning_rate": 2.635e-05, + "loss": 0.2831, + "step": 528 + }, + { + "epoch": 0.8410174880763116, + "grad_norm": 1.180551181034208, + "learning_rate": 2.64e-05, + "loss": 0.2676, + "step": 529 + }, + { + "epoch": 0.8426073131955485, + "grad_norm": 1.5709962624260545, + "learning_rate": 2.6450000000000003e-05, + "loss": 0.2174, + "step": 530 + }, + { + "epoch": 0.8441971383147854, + "grad_norm": 1.0202447362995952, + "learning_rate": 2.65e-05, + "loss": 0.1891, + "step": 531 + }, + { + "epoch": 0.8457869634340223, + "grad_norm": 2.2267126167782503, + "learning_rate": 2.655e-05, + "loss": 0.2788, + "step": 532 + }, + { + "epoch": 0.8473767885532592, + "grad_norm": 2.968170962753858, + "learning_rate": 2.6600000000000003e-05, + "loss": 0.364, + "step": 533 + }, + { + "epoch": 0.848966613672496, + "grad_norm": 1.6322900689086306, + "learning_rate": 2.665e-05, + "loss": 0.2787, + "step": 534 + }, + { + "epoch": 0.8505564387917329, + "grad_norm": 1.7253332488025737, + "learning_rate": 2.6700000000000002e-05, + "loss": 0.322, + "step": 535 + }, + { + "epoch": 0.8521462639109698, + "grad_norm": 1.7820066248592579, + "learning_rate": 2.6750000000000003e-05, + "loss": 0.3073, + "step": 536 + }, + { + "epoch": 0.8537360890302067, + "grad_norm": 1.5822247601093904, + "learning_rate": 2.68e-05, + "loss": 0.2782, + "step": 537 + }, + { + "epoch": 0.8553259141494436, + "grad_norm": 0.8342169926862201, + "learning_rate": 2.6850000000000002e-05, + "loss": 0.2006, + "step": 538 + }, + { + "epoch": 0.8569157392686805, + "grad_norm": 0.734823269579416, + "learning_rate": 2.69e-05, + "loss": 0.1563, + "step": 539 + }, + { + "epoch": 0.8585055643879174, + "grad_norm": 0.9276752031141264, + "learning_rate": 2.695e-05, + "loss": 0.2182, + "step": 540 + }, + { + "epoch": 0.8600953895071543, + "grad_norm": 0.8092208365303625, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.2417, + "step": 541 + }, + { + "epoch": 0.8616852146263911, + "grad_norm": 3.66079875460622, + "learning_rate": 2.705e-05, + "loss": 0.4777, + "step": 542 + }, + { + "epoch": 0.863275039745628, + "grad_norm": 2.0012817252751636, + "learning_rate": 2.71e-05, + "loss": 0.2564, + "step": 543 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.9741312295304617, + "learning_rate": 2.7150000000000003e-05, + "loss": 0.2345, + "step": 544 + }, + { + "epoch": 0.8664546899841018, + "grad_norm": 2.811529000676048, + "learning_rate": 2.72e-05, + "loss": 0.3443, + "step": 545 + }, + { + "epoch": 0.8680445151033387, + "grad_norm": 1.2524045283902445, + "learning_rate": 2.725e-05, + "loss": 0.2882, + "step": 546 + }, + { + "epoch": 0.8696343402225755, + "grad_norm": 0.8214317656721801, + "learning_rate": 2.7300000000000003e-05, + "loss": 0.2476, + "step": 547 + }, + { + "epoch": 0.8712241653418124, + "grad_norm": 3.934379177825496, + "learning_rate": 2.735e-05, + "loss": 0.2921, + "step": 548 + }, + { + "epoch": 0.8728139904610492, + "grad_norm": 3.341373621978701, + "learning_rate": 2.7400000000000002e-05, + "loss": 1.1221, + "step": 549 + }, + { + "epoch": 0.8744038155802861, + "grad_norm": 1.715300532463595, + "learning_rate": 2.7450000000000003e-05, + "loss": 0.2264, + "step": 550 + }, + { + "epoch": 0.875993640699523, + "grad_norm": 1.7980672728192069, + "learning_rate": 2.75e-05, + "loss": 0.2656, + "step": 551 + }, + { + "epoch": 0.8775834658187599, + "grad_norm": 2.222185061776179, + "learning_rate": 2.7550000000000002e-05, + "loss": 0.2648, + "step": 552 + }, + { + "epoch": 0.8791732909379968, + "grad_norm": 1.286869998742626, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.2524, + "step": 553 + }, + { + "epoch": 0.8807631160572337, + "grad_norm": 1.7807532005365678, + "learning_rate": 2.765e-05, + "loss": 0.2686, + "step": 554 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 2.2800136053495836, + "learning_rate": 2.7700000000000002e-05, + "loss": 0.775, + "step": 555 + }, + { + "epoch": 0.8839427662957074, + "grad_norm": 1.676459694140018, + "learning_rate": 2.7750000000000004e-05, + "loss": 0.261, + "step": 556 + }, + { + "epoch": 0.8855325914149443, + "grad_norm": 3.154493706640757, + "learning_rate": 2.78e-05, + "loss": 0.3026, + "step": 557 + }, + { + "epoch": 0.8871224165341812, + "grad_norm": 1.5682506010155899, + "learning_rate": 2.7850000000000003e-05, + "loss": 0.2674, + "step": 558 + }, + { + "epoch": 0.8887122416534181, + "grad_norm": 1.4730844725055516, + "learning_rate": 2.79e-05, + "loss": 0.2351, + "step": 559 + }, + { + "epoch": 0.890302066772655, + "grad_norm": 1.2630208967180552, + "learning_rate": 2.795e-05, + "loss": 0.2459, + "step": 560 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 1.9830459201554855, + "learning_rate": 2.8e-05, + "loss": 0.2424, + "step": 561 + }, + { + "epoch": 0.8934817170111288, + "grad_norm": 1.0880761476341176, + "learning_rate": 2.805e-05, + "loss": 0.2227, + "step": 562 + }, + { + "epoch": 0.8950715421303657, + "grad_norm": 0.9161443357295311, + "learning_rate": 2.81e-05, + "loss": 0.1574, + "step": 563 + }, + { + "epoch": 0.8966613672496025, + "grad_norm": 1.661947821577921, + "learning_rate": 2.815e-05, + "loss": 0.2506, + "step": 564 + }, + { + "epoch": 0.8982511923688394, + "grad_norm": 1.1763465802154114, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.2567, + "step": 565 + }, + { + "epoch": 0.8998410174880763, + "grad_norm": 0.8644306861576502, + "learning_rate": 2.825e-05, + "loss": 0.1474, + "step": 566 + }, + { + "epoch": 0.9014308426073132, + "grad_norm": 0.9554644903618156, + "learning_rate": 2.83e-05, + "loss": 0.2543, + "step": 567 + }, + { + "epoch": 0.9030206677265501, + "grad_norm": 1.2130384406634964, + "learning_rate": 2.8349999999999998e-05, + "loss": 0.2093, + "step": 568 + }, + { + "epoch": 0.904610492845787, + "grad_norm": 1.1611308000289235, + "learning_rate": 2.84e-05, + "loss": 0.2628, + "step": 569 + }, + { + "epoch": 0.9062003179650239, + "grad_norm": 1.0419132902835948, + "learning_rate": 2.845e-05, + "loss": 0.1982, + "step": 570 + }, + { + "epoch": 0.9077901430842608, + "grad_norm": 1.2281229501219237, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.2227, + "step": 571 + }, + { + "epoch": 0.9093799682034976, + "grad_norm": 0.8083882338957968, + "learning_rate": 2.855e-05, + "loss": 0.1992, + "step": 572 + }, + { + "epoch": 0.9109697933227345, + "grad_norm": 0.9555461701338538, + "learning_rate": 2.86e-05, + "loss": 0.2388, + "step": 573 + }, + { + "epoch": 0.9125596184419714, + "grad_norm": 1.606807656501827, + "learning_rate": 2.865e-05, + "loss": 0.2244, + "step": 574 + }, + { + "epoch": 0.9141494435612083, + "grad_norm": 0.6549068893545189, + "learning_rate": 2.87e-05, + "loss": 0.2069, + "step": 575 + }, + { + "epoch": 0.9157392686804452, + "grad_norm": 2.43899204545746, + "learning_rate": 2.875e-05, + "loss": 0.1898, + "step": 576 + }, + { + "epoch": 0.9173290937996821, + "grad_norm": 0.9315903411957124, + "learning_rate": 2.88e-05, + "loss": 0.2881, + "step": 577 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 1.0715576730901948, + "learning_rate": 2.885e-05, + "loss": 0.2212, + "step": 578 + }, + { + "epoch": 0.9205087440381559, + "grad_norm": 1.0667589805163178, + "learning_rate": 2.89e-05, + "loss": 0.2369, + "step": 579 + }, + { + "epoch": 0.9220985691573926, + "grad_norm": 13.20690071658688, + "learning_rate": 2.895e-05, + "loss": 33.2293, + "step": 580 + }, + { + "epoch": 0.9236883942766295, + "grad_norm": 0.9963529355605913, + "learning_rate": 2.9e-05, + "loss": 0.2242, + "step": 581 + }, + { + "epoch": 0.9252782193958664, + "grad_norm": 0.8927584997248048, + "learning_rate": 2.905e-05, + "loss": 0.233, + "step": 582 + }, + { + "epoch": 0.9268680445151033, + "grad_norm": 1.2739715347103169, + "learning_rate": 2.91e-05, + "loss": 0.2052, + "step": 583 + }, + { + "epoch": 0.9284578696343402, + "grad_norm": 0.8169976480159199, + "learning_rate": 2.915e-05, + "loss": 0.2306, + "step": 584 + }, + { + "epoch": 0.9300476947535771, + "grad_norm": 1.158265859211442, + "learning_rate": 2.92e-05, + "loss": 0.2241, + "step": 585 + }, + { + "epoch": 0.931637519872814, + "grad_norm": 1.155667591594751, + "learning_rate": 2.925e-05, + "loss": 0.2756, + "step": 586 + }, + { + "epoch": 0.9332273449920508, + "grad_norm": 1.2060672193737954, + "learning_rate": 2.93e-05, + "loss": 0.2508, + "step": 587 + }, + { + "epoch": 0.9348171701112877, + "grad_norm": 1.1307378745583698, + "learning_rate": 2.9350000000000002e-05, + "loss": 0.2242, + "step": 588 + }, + { + "epoch": 0.9364069952305246, + "grad_norm": 1.138250046228856, + "learning_rate": 2.94e-05, + "loss": 0.2727, + "step": 589 + }, + { + "epoch": 0.9379968203497615, + "grad_norm": 1.1337602907713822, + "learning_rate": 2.945e-05, + "loss": 0.2292, + "step": 590 + }, + { + "epoch": 0.9395866454689984, + "grad_norm": 0.9103409950201136, + "learning_rate": 2.95e-05, + "loss": 0.2128, + "step": 591 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.0614237421911457, + "learning_rate": 2.955e-05, + "loss": 0.2175, + "step": 592 + }, + { + "epoch": 0.9427662957074722, + "grad_norm": 1.2753047562449893, + "learning_rate": 2.96e-05, + "loss": 0.2246, + "step": 593 + }, + { + "epoch": 0.9443561208267091, + "grad_norm": 0.7703276924313528, + "learning_rate": 2.965e-05, + "loss": 0.2425, + "step": 594 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 1.191085729051244, + "learning_rate": 2.97e-05, + "loss": 0.2098, + "step": 595 + }, + { + "epoch": 0.9475357710651828, + "grad_norm": 1.4335055767357534, + "learning_rate": 2.975e-05, + "loss": 0.2657, + "step": 596 + }, + { + "epoch": 0.9491255961844197, + "grad_norm": 2.5902824163470273, + "learning_rate": 2.98e-05, + "loss": 0.3129, + "step": 597 + }, + { + "epoch": 0.9507154213036566, + "grad_norm": 0.7592792164328412, + "learning_rate": 2.985e-05, + "loss": 0.1866, + "step": 598 + }, + { + "epoch": 0.9523052464228935, + "grad_norm": 1.2323263878273853, + "learning_rate": 2.9900000000000002e-05, + "loss": 0.219, + "step": 599 + }, + { + "epoch": 0.9538950715421304, + "grad_norm": 15.906398995325986, + "learning_rate": 2.995e-05, + "loss": 34.064, + "step": 600 + }, + { + "epoch": 0.9554848966613673, + "grad_norm": 1.7152370438901547, + "learning_rate": 3e-05, + "loss": 0.2537, + "step": 601 + }, + { + "epoch": 0.9570747217806042, + "grad_norm": 0.8289995835163445, + "learning_rate": 3.000000169231895e-05, + "loss": 0.1988, + "step": 602 + }, + { + "epoch": 0.958664546899841, + "grad_norm": 1.0540008720898175, + "learning_rate": 3.0000006769275235e-05, + "loss": 0.2197, + "step": 603 + }, + { + "epoch": 0.9602543720190779, + "grad_norm": 0.7089689894602045, + "learning_rate": 3.000001523086713e-05, + "loss": 0.2212, + "step": 604 + }, + { + "epoch": 0.9618441971383148, + "grad_norm": 0.9315549187227836, + "learning_rate": 3.0000027077091763e-05, + "loss": 0.2128, + "step": 605 + }, + { + "epoch": 0.9634340222575517, + "grad_norm": 1.0733256441562586, + "learning_rate": 3.0000042307945136e-05, + "loss": 0.2399, + "step": 606 + }, + { + "epoch": 0.9650238473767886, + "grad_norm": 0.65838321699355, + "learning_rate": 3.0000060923422093e-05, + "loss": 0.2058, + "step": 607 + }, + { + "epoch": 0.9666136724960255, + "grad_norm": 1.176262274339728, + "learning_rate": 3.0000082923516334e-05, + "loss": 0.2547, + "step": 608 + }, + { + "epoch": 0.9682034976152624, + "grad_norm": 1.5636274083086186, + "learning_rate": 3.0000108308220412e-05, + "loss": 0.597, + "step": 609 + }, + { + "epoch": 0.9697933227344993, + "grad_norm": 1.1129054524387443, + "learning_rate": 3.000013707752573e-05, + "loss": 0.2609, + "step": 610 + }, + { + "epoch": 0.9713831478537361, + "grad_norm": 0.8584432539009351, + "learning_rate": 3.0000169231422557e-05, + "loss": 0.2409, + "step": 611 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 1.3067867415208732, + "learning_rate": 3.000020476990001e-05, + "loss": 0.2282, + "step": 612 + }, + { + "epoch": 0.9745627980922098, + "grad_norm": 0.8080146441665003, + "learning_rate": 3.000024369294605e-05, + "loss": 0.2043, + "step": 613 + }, + { + "epoch": 0.9761526232114467, + "grad_norm": 0.8995723386100994, + "learning_rate": 3.000028600054751e-05, + "loss": 0.2024, + "step": 614 + }, + { + "epoch": 0.9777424483306836, + "grad_norm": 1.049692608457188, + "learning_rate": 3.000033169269009e-05, + "loss": 0.2015, + "step": 615 + }, + { + "epoch": 0.9793322734499205, + "grad_norm": 0.8227808191012699, + "learning_rate": 3.0000380769358285e-05, + "loss": 0.207, + "step": 616 + }, + { + "epoch": 0.9809220985691574, + "grad_norm": 0.7559858101796544, + "learning_rate": 3.0000433230535512e-05, + "loss": 0.1947, + "step": 617 + }, + { + "epoch": 0.9825119236883942, + "grad_norm": 1.3469294454976612, + "learning_rate": 3.0000489076204015e-05, + "loss": 0.3511, + "step": 618 + }, + { + "epoch": 0.9841017488076311, + "grad_norm": 6.575492156700965, + "learning_rate": 3.0000548306344874e-05, + "loss": 9.4442, + "step": 619 + }, + { + "epoch": 0.985691573926868, + "grad_norm": 1.6443233762862122, + "learning_rate": 3.0000610920938065e-05, + "loss": 0.2584, + "step": 620 + }, + { + "epoch": 0.9872813990461049, + "grad_norm": 1.9143339205402958, + "learning_rate": 3.000067691996238e-05, + "loss": 0.2602, + "step": 621 + }, + { + "epoch": 0.9888712241653418, + "grad_norm": 1.1548804450365433, + "learning_rate": 3.0000746303395484e-05, + "loss": 0.2257, + "step": 622 + }, + { + "epoch": 0.9904610492845787, + "grad_norm": 0.8345467468980763, + "learning_rate": 3.0000819071213893e-05, + "loss": 0.2484, + "step": 623 + }, + { + "epoch": 0.9920508744038156, + "grad_norm": 1.4228214355831577, + "learning_rate": 3.0000895223392975e-05, + "loss": 0.2067, + "step": 624 + }, + { + "epoch": 0.9936406995230525, + "grad_norm": 1.0355629065943222, + "learning_rate": 3.0000974759906957e-05, + "loss": 0.1806, + "step": 625 + }, + { + "epoch": 0.9952305246422893, + "grad_norm": 3.156797680044191, + "learning_rate": 3.000105768072892e-05, + "loss": 0.3152, + "step": 626 + }, + { + "epoch": 0.9968203497615262, + "grad_norm": 1.0999265133106408, + "learning_rate": 3.0001143985830813e-05, + "loss": 0.2226, + "step": 627 + }, + { + "epoch": 0.9984101748807631, + "grad_norm": 1.5296788184001642, + "learning_rate": 3.0001233675183396e-05, + "loss": 0.2072, + "step": 628 + }, + { + "epoch": 1.0, + "grad_norm": 0.7712974215796701, + "learning_rate": 3.0001326748756327e-05, + "loss": 0.2053, + "step": 629 + }, + { + "epoch": 1.0015898251192368, + "grad_norm": 2.995340358807991, + "learning_rate": 3.0001423206518105e-05, + "loss": 0.2754, + "step": 630 + }, + { + "epoch": 1.0031796502384738, + "grad_norm": 3.0538351724870103, + "learning_rate": 3.0001523048436092e-05, + "loss": 0.3338, + "step": 631 + }, + { + "epoch": 1.0047694753577106, + "grad_norm": 0.7369463119843839, + "learning_rate": 3.000162627447647e-05, + "loss": 0.2328, + "step": 632 + }, + { + "epoch": 1.0063593004769475, + "grad_norm": 1.3135358500770693, + "learning_rate": 3.000173288460432e-05, + "loss": 0.2649, + "step": 633 + }, + { + "epoch": 1.0079491255961843, + "grad_norm": 0.806784178198853, + "learning_rate": 3.0001842878783563e-05, + "loss": 0.1796, + "step": 634 + }, + { + "epoch": 1.0095389507154213, + "grad_norm": 1.0015602426133656, + "learning_rate": 3.0001956256976943e-05, + "loss": 0.3043, + "step": 635 + }, + { + "epoch": 1.011128775834658, + "grad_norm": 0.918176471529433, + "learning_rate": 3.0002073019146117e-05, + "loss": 0.157, + "step": 636 + }, + { + "epoch": 1.012718600953895, + "grad_norm": 0.7689770476952236, + "learning_rate": 3.000219316525154e-05, + "loss": 0.2029, + "step": 637 + }, + { + "epoch": 1.0143084260731319, + "grad_norm": 2.220612026181284, + "learning_rate": 3.000231669525257e-05, + "loss": 0.3223, + "step": 638 + }, + { + "epoch": 1.0158982511923689, + "grad_norm": 1.3289670267634566, + "learning_rate": 3.0002443609107383e-05, + "loss": 0.2269, + "step": 639 + }, + { + "epoch": 1.0174880763116056, + "grad_norm": 2.279024567850086, + "learning_rate": 3.000257390677301e-05, + "loss": 0.2806, + "step": 640 + }, + { + "epoch": 1.0190779014308426, + "grad_norm": 0.9680651735383388, + "learning_rate": 3.000270758820539e-05, + "loss": 0.198, + "step": 641 + }, + { + "epoch": 1.0206677265500794, + "grad_norm": 1.2366966363728096, + "learning_rate": 3.000284465335923e-05, + "loss": 0.2, + "step": 642 + }, + { + "epoch": 1.0222575516693164, + "grad_norm": 1.366837553320901, + "learning_rate": 3.000298510218817e-05, + "loss": 0.2102, + "step": 643 + }, + { + "epoch": 1.0238473767885532, + "grad_norm": 1.2072891991648633, + "learning_rate": 3.0003128934644662e-05, + "loss": 0.2176, + "step": 644 + }, + { + "epoch": 1.0254372019077902, + "grad_norm": 0.9109499051253581, + "learning_rate": 3.0003276150680025e-05, + "loss": 0.2015, + "step": 645 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 0.7797646968043357, + "learning_rate": 3.000342675024443e-05, + "loss": 0.2053, + "step": 646 + }, + { + "epoch": 1.028616852146264, + "grad_norm": 1.0289395807668062, + "learning_rate": 3.0003580733286906e-05, + "loss": 0.2458, + "step": 647 + }, + { + "epoch": 1.0302066772655007, + "grad_norm": 2.9413031174873745, + "learning_rate": 3.0003738099755337e-05, + "loss": 0.3945, + "step": 648 + }, + { + "epoch": 1.0317965023847377, + "grad_norm": 0.7462123213900558, + "learning_rate": 3.0003898849596456e-05, + "loss": 0.2368, + "step": 649 + }, + { + "epoch": 1.0333863275039745, + "grad_norm": 1.0423295019165562, + "learning_rate": 3.0004062982755864e-05, + "loss": 0.2084, + "step": 650 + }, + { + "epoch": 1.0349761526232115, + "grad_norm": 1.160870145189582, + "learning_rate": 3.0004230499178e-05, + "loss": 0.2278, + "step": 651 + }, + { + "epoch": 1.0365659777424483, + "grad_norm": 0.6262899016298544, + "learning_rate": 3.000440139880616e-05, + "loss": 0.1994, + "step": 652 + }, + { + "epoch": 1.0381558028616853, + "grad_norm": 1.3464194242216732, + "learning_rate": 3.0004575681582512e-05, + "loss": 0.2085, + "step": 653 + }, + { + "epoch": 1.039745627980922, + "grad_norm": 0.8157226116311506, + "learning_rate": 3.0004753347448062e-05, + "loss": 0.1824, + "step": 654 + }, + { + "epoch": 1.041335453100159, + "grad_norm": 0.9204293152722103, + "learning_rate": 3.0004934396342685e-05, + "loss": 0.2348, + "step": 655 + }, + { + "epoch": 1.0429252782193958, + "grad_norm": 4.90660794098986, + "learning_rate": 3.0005118828205097e-05, + "loss": 0.2671, + "step": 656 + }, + { + "epoch": 1.0445151033386328, + "grad_norm": 1.857005791238994, + "learning_rate": 3.000530664297286e-05, + "loss": 0.2608, + "step": 657 + }, + { + "epoch": 1.0461049284578696, + "grad_norm": 1.1515033621134392, + "learning_rate": 3.0005497840582433e-05, + "loss": 0.1962, + "step": 658 + }, + { + "epoch": 1.0476947535771066, + "grad_norm": 0.867912178692954, + "learning_rate": 3.0005692420969074e-05, + "loss": 0.23, + "step": 659 + }, + { + "epoch": 1.0492845786963434, + "grad_norm": 1.4237670599326264, + "learning_rate": 3.000589038406695e-05, + "loss": 0.2189, + "step": 660 + }, + { + "epoch": 1.0508744038155804, + "grad_norm": 1.13260620764936, + "learning_rate": 3.000609172980904e-05, + "loss": 0.2004, + "step": 661 + }, + { + "epoch": 1.0524642289348172, + "grad_norm": 0.9774487001714098, + "learning_rate": 3.0006296458127206e-05, + "loss": 0.2001, + "step": 662 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 0.7915760572935234, + "learning_rate": 3.000650456895215e-05, + "loss": 0.1935, + "step": 663 + }, + { + "epoch": 1.055643879173291, + "grad_norm": 1.2561264365563078, + "learning_rate": 3.0006716062213444e-05, + "loss": 0.2297, + "step": 664 + }, + { + "epoch": 1.0572337042925277, + "grad_norm": 1.2584264794222662, + "learning_rate": 3.000693093783948e-05, + "loss": 0.239, + "step": 665 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 1.0537006108869125, + "learning_rate": 3.0007149195757554e-05, + "loss": 0.2286, + "step": 666 + }, + { + "epoch": 1.0604133545310015, + "grad_norm": 1.084314164760084, + "learning_rate": 3.0007370835893784e-05, + "loss": 0.1732, + "step": 667 + }, + { + "epoch": 1.0620031796502385, + "grad_norm": 1.35823376435203, + "learning_rate": 3.0007595858173162e-05, + "loss": 0.5996, + "step": 668 + }, + { + "epoch": 1.0635930047694753, + "grad_norm": 0.782134400966495, + "learning_rate": 3.0007824262519512e-05, + "loss": 0.2388, + "step": 669 + }, + { + "epoch": 1.0651828298887123, + "grad_norm": 18.143302024807802, + "learning_rate": 3.0008056048855544e-05, + "loss": 32.3368, + "step": 670 + }, + { + "epoch": 1.066772655007949, + "grad_norm": 0.824814578222989, + "learning_rate": 3.0008291217102792e-05, + "loss": 0.2112, + "step": 671 + }, + { + "epoch": 1.068362480127186, + "grad_norm": 1.0692941873069601, + "learning_rate": 3.0008529767181656e-05, + "loss": 0.2171, + "step": 672 + }, + { + "epoch": 1.0699523052464228, + "grad_norm": 0.8063601598888985, + "learning_rate": 3.0008771699011413e-05, + "loss": 0.1871, + "step": 673 + }, + { + "epoch": 1.0715421303656598, + "grad_norm": 0.8745071764686374, + "learning_rate": 3.0009017012510175e-05, + "loss": 0.2061, + "step": 674 + }, + { + "epoch": 1.0731319554848966, + "grad_norm": 1.8857782407716015, + "learning_rate": 3.0009265707594907e-05, + "loss": 0.6164, + "step": 675 + }, + { + "epoch": 1.0747217806041336, + "grad_norm": 4.614882279340868, + "learning_rate": 3.0009517784181416e-05, + "loss": 0.3255, + "step": 676 + }, + { + "epoch": 1.0763116057233704, + "grad_norm": 0.900318768421948, + "learning_rate": 3.000977324218442e-05, + "loss": 0.1904, + "step": 677 + }, + { + "epoch": 1.0779014308426074, + "grad_norm": 2.9341041546455964, + "learning_rate": 3.0010032081517434e-05, + "loss": 0.2526, + "step": 678 + }, + { + "epoch": 1.0794912559618441, + "grad_norm": 1.1049186237197868, + "learning_rate": 3.0010294302092857e-05, + "loss": 0.2565, + "step": 679 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.804126214905185, + "learning_rate": 3.0010559903821927e-05, + "loss": 0.2088, + "step": 680 + }, + { + "epoch": 1.082670906200318, + "grad_norm": 0.97641024718292, + "learning_rate": 3.0010828886614757e-05, + "loss": 0.2016, + "step": 681 + }, + { + "epoch": 1.084260731319555, + "grad_norm": 1.1602108569842764, + "learning_rate": 3.00111012503803e-05, + "loss": 0.1719, + "step": 682 + }, + { + "epoch": 1.0858505564387917, + "grad_norm": 0.897923127085055, + "learning_rate": 3.0011376995026377e-05, + "loss": 0.2062, + "step": 683 + }, + { + "epoch": 1.0874403815580287, + "grad_norm": 0.7900954260590638, + "learning_rate": 3.001165612045965e-05, + "loss": 0.259, + "step": 684 + }, + { + "epoch": 1.0890302066772655, + "grad_norm": 0.5723827780931718, + "learning_rate": 3.001193862658566e-05, + "loss": 0.1622, + "step": 685 + }, + { + "epoch": 1.0906200317965025, + "grad_norm": 1.4438465347745348, + "learning_rate": 3.001222451330877e-05, + "loss": 0.2408, + "step": 686 + }, + { + "epoch": 1.0922098569157392, + "grad_norm": 0.8352273066307121, + "learning_rate": 3.0012513780532238e-05, + "loss": 0.1961, + "step": 687 + }, + { + "epoch": 1.0937996820349762, + "grad_norm": 0.5336144504295944, + "learning_rate": 3.0012806428158144e-05, + "loss": 0.1486, + "step": 688 + }, + { + "epoch": 1.095389507154213, + "grad_norm": 0.7521578643204917, + "learning_rate": 3.0013102456087433e-05, + "loss": 0.2201, + "step": 689 + }, + { + "epoch": 1.09697933227345, + "grad_norm": 1.0580466861864777, + "learning_rate": 3.001340186421992e-05, + "loss": 0.2289, + "step": 690 + }, + { + "epoch": 1.0985691573926868, + "grad_norm": 0.9595923144828853, + "learning_rate": 3.0013704652454258e-05, + "loss": 0.1913, + "step": 691 + }, + { + "epoch": 1.1001589825119238, + "grad_norm": 20.235232629780796, + "learning_rate": 3.0014010820687985e-05, + "loss": 31.6578, + "step": 692 + }, + { + "epoch": 1.1017488076311606, + "grad_norm": 0.7288383364839481, + "learning_rate": 3.0014320368817447e-05, + "loss": 0.1923, + "step": 693 + }, + { + "epoch": 1.1033386327503973, + "grad_norm": 1.35329484407909, + "learning_rate": 3.0014633296737884e-05, + "loss": 0.2766, + "step": 694 + }, + { + "epoch": 1.1049284578696343, + "grad_norm": 1.0517251280563669, + "learning_rate": 3.0014949604343385e-05, + "loss": 0.2479, + "step": 695 + }, + { + "epoch": 1.1065182829888713, + "grad_norm": 1.6814285279188104, + "learning_rate": 3.0015269291526883e-05, + "loss": 0.2154, + "step": 696 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 1.239004030067652, + "learning_rate": 3.0015592358180193e-05, + "loss": 0.2793, + "step": 697 + }, + { + "epoch": 1.109697933227345, + "grad_norm": 1.3536998931504256, + "learning_rate": 3.001591880419395e-05, + "loss": 0.1767, + "step": 698 + }, + { + "epoch": 1.1112877583465819, + "grad_norm": 8.398825091086719, + "learning_rate": 3.0016248629457668e-05, + "loss": 9.2342, + "step": 699 + }, + { + "epoch": 1.1128775834658187, + "grad_norm": 1.2669175495681713, + "learning_rate": 3.0016581833859716e-05, + "loss": 0.2639, + "step": 700 + }, + { + "epoch": 1.1144674085850557, + "grad_norm": 2.7954048731359356, + "learning_rate": 3.0016918417287312e-05, + "loss": 0.246, + "step": 701 + }, + { + "epoch": 1.1160572337042924, + "grad_norm": 12.70000817440662, + "learning_rate": 3.0017258379626553e-05, + "loss": 0.629, + "step": 702 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.7362099420269006, + "learning_rate": 3.0017601720762342e-05, + "loss": 0.1971, + "step": 703 + }, + { + "epoch": 1.1192368839427662, + "grad_norm": 2.861913927679887, + "learning_rate": 3.0017948440578506e-05, + "loss": 0.223, + "step": 704 + }, + { + "epoch": 1.1208267090620032, + "grad_norm": 1.0656884043231087, + "learning_rate": 3.001829853895766e-05, + "loss": 0.2829, + "step": 705 + }, + { + "epoch": 1.12241653418124, + "grad_norm": 0.989617017808479, + "learning_rate": 3.001865201578133e-05, + "loss": 0.1867, + "step": 706 + }, + { + "epoch": 1.124006359300477, + "grad_norm": 1.2681973934985606, + "learning_rate": 3.0019008870929875e-05, + "loss": 0.2506, + "step": 707 + }, + { + "epoch": 1.1255961844197138, + "grad_norm": 0.9870081947065322, + "learning_rate": 3.0019369104282496e-05, + "loss": 0.2059, + "step": 708 + }, + { + "epoch": 1.1271860095389508, + "grad_norm": 3.8821504495070758, + "learning_rate": 3.0019732715717285e-05, + "loss": 0.2871, + "step": 709 + }, + { + "epoch": 1.1287758346581875, + "grad_norm": 0.8260856707850688, + "learning_rate": 3.0020099705111165e-05, + "loss": 0.1594, + "step": 710 + }, + { + "epoch": 1.1303656597774245, + "grad_norm": 0.9296416050238138, + "learning_rate": 3.002047007233993e-05, + "loss": 0.2365, + "step": 711 + }, + { + "epoch": 1.1319554848966613, + "grad_norm": 0.8576002924604323, + "learning_rate": 3.002084381727821e-05, + "loss": 0.2389, + "step": 712 + }, + { + "epoch": 1.1335453100158983, + "grad_norm": 0.8722577263240856, + "learning_rate": 3.002122093979952e-05, + "loss": 0.2064, + "step": 713 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 1.147061668202372, + "learning_rate": 3.0021601439776213e-05, + "loss": 0.2059, + "step": 714 + }, + { + "epoch": 1.136724960254372, + "grad_norm": 1.1252038707857024, + "learning_rate": 3.0021985317079507e-05, + "loss": 0.2322, + "step": 715 + }, + { + "epoch": 1.1383147853736089, + "grad_norm": 1.0062040686356222, + "learning_rate": 3.002237257157945e-05, + "loss": 0.1978, + "step": 716 + }, + { + "epoch": 1.1399046104928459, + "grad_norm": 0.6995971478803915, + "learning_rate": 3.0022763203145015e-05, + "loss": 0.2228, + "step": 717 + }, + { + "epoch": 1.1414944356120826, + "grad_norm": 1.1613365235570836, + "learning_rate": 3.0023157211643948e-05, + "loss": 0.2192, + "step": 718 + }, + { + "epoch": 1.1430842607313196, + "grad_norm": 0.9236464597255482, + "learning_rate": 3.0023554596942908e-05, + "loss": 0.2274, + "step": 719 + }, + { + "epoch": 1.1446740858505564, + "grad_norm": 0.6910455473134065, + "learning_rate": 3.002395535890739e-05, + "loss": 0.1976, + "step": 720 + }, + { + "epoch": 1.1462639109697934, + "grad_norm": 1.1031161592937788, + "learning_rate": 3.002435949740176e-05, + "loss": 0.207, + "step": 721 + }, + { + "epoch": 1.1478537360890302, + "grad_norm": 0.8455647962810376, + "learning_rate": 3.0024767012289212e-05, + "loss": 0.225, + "step": 722 + }, + { + "epoch": 1.1494435612082672, + "grad_norm": 1.2732967189592286, + "learning_rate": 3.0025177903431845e-05, + "loss": 0.1997, + "step": 723 + }, + { + "epoch": 1.151033386327504, + "grad_norm": 0.8970554786880177, + "learning_rate": 3.002559217069056e-05, + "loss": 0.223, + "step": 724 + }, + { + "epoch": 1.1526232114467407, + "grad_norm": 0.7799759873081547, + "learning_rate": 3.0026009813925165e-05, + "loss": 0.2655, + "step": 725 + }, + { + "epoch": 1.1542130365659777, + "grad_norm": 3.3035632310650893, + "learning_rate": 3.0026430832994277e-05, + "loss": 0.3052, + "step": 726 + }, + { + "epoch": 1.1558028616852147, + "grad_norm": 1.5418044828466606, + "learning_rate": 3.0026855227755425e-05, + "loss": 0.1817, + "step": 727 + }, + { + "epoch": 1.1573926868044515, + "grad_norm": 1.2269863083227035, + "learning_rate": 3.0027282998064946e-05, + "loss": 0.2562, + "step": 728 + }, + { + "epoch": 1.1589825119236883, + "grad_norm": 1.3352862515834467, + "learning_rate": 3.0027714143778058e-05, + "loss": 0.1742, + "step": 729 + }, + { + "epoch": 1.1605723370429253, + "grad_norm": 0.8333132168884884, + "learning_rate": 3.002814866474884e-05, + "loss": 0.1678, + "step": 730 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 1.0310335429679847, + "learning_rate": 3.0028586560830226e-05, + "loss": 0.2549, + "step": 731 + }, + { + "epoch": 1.163751987281399, + "grad_norm": 0.7136871917430028, + "learning_rate": 3.0029027831873996e-05, + "loss": 0.1984, + "step": 732 + }, + { + "epoch": 1.1653418124006358, + "grad_norm": 17.780055973434013, + "learning_rate": 3.0029472477730798e-05, + "loss": 31.8157, + "step": 733 + }, + { + "epoch": 1.1669316375198728, + "grad_norm": 1.167455638976974, + "learning_rate": 3.0029920498250133e-05, + "loss": 0.2604, + "step": 734 + }, + { + "epoch": 1.1685214626391096, + "grad_norm": 1.2404234827388172, + "learning_rate": 3.0030371893280367e-05, + "loss": 0.1906, + "step": 735 + }, + { + "epoch": 1.1701112877583466, + "grad_norm": 0.837791903645683, + "learning_rate": 3.0030826662668716e-05, + "loss": 0.172, + "step": 736 + }, + { + "epoch": 1.1717011128775834, + "grad_norm": 1.4070595102168377, + "learning_rate": 3.0031284806261258e-05, + "loss": 0.2123, + "step": 737 + }, + { + "epoch": 1.1732909379968204, + "grad_norm": 1.3437154129961775, + "learning_rate": 3.0031746323902934e-05, + "loss": 0.2666, + "step": 738 + }, + { + "epoch": 1.1748807631160572, + "grad_norm": 1.5739642251257215, + "learning_rate": 3.0032211215437525e-05, + "loss": 0.2191, + "step": 739 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.7978420162239285, + "learning_rate": 3.0032679480707695e-05, + "loss": 0.2021, + "step": 740 + }, + { + "epoch": 1.178060413354531, + "grad_norm": 0.7706922168917046, + "learning_rate": 3.003315111955494e-05, + "loss": 0.2036, + "step": 741 + }, + { + "epoch": 1.179650238473768, + "grad_norm": 1.0272058075130088, + "learning_rate": 3.0033626131819636e-05, + "loss": 0.2149, + "step": 742 + }, + { + "epoch": 1.1812400635930047, + "grad_norm": 0.7793723921638207, + "learning_rate": 3.0034104517341004e-05, + "loss": 0.2369, + "step": 743 + }, + { + "epoch": 1.1828298887122417, + "grad_norm": 0.6834376944185209, + "learning_rate": 3.0034586275957124e-05, + "loss": 0.1875, + "step": 744 + }, + { + "epoch": 1.1844197138314785, + "grad_norm": 0.8765064667836667, + "learning_rate": 3.0035071407504953e-05, + "loss": 0.1756, + "step": 745 + }, + { + "epoch": 1.1860095389507155, + "grad_norm": 1.60345883316707, + "learning_rate": 3.0035559911820284e-05, + "loss": 0.2378, + "step": 746 + }, + { + "epoch": 1.1875993640699523, + "grad_norm": 1.177364867278975, + "learning_rate": 3.0036051788737776e-05, + "loss": 0.1592, + "step": 747 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 1.023919463091455, + "learning_rate": 3.003654703809094e-05, + "loss": 0.2229, + "step": 748 + }, + { + "epoch": 1.190779014308426, + "grad_norm": 1.0439842426111843, + "learning_rate": 3.0037045659712147e-05, + "loss": 0.1564, + "step": 749 + }, + { + "epoch": 1.192368839427663, + "grad_norm": 0.7633620925832583, + "learning_rate": 3.003754765343266e-05, + "loss": 0.2503, + "step": 750 + }, + { + "epoch": 1.1939586645468998, + "grad_norm": 1.1326526071965548, + "learning_rate": 3.003805301908255e-05, + "loss": 0.2037, + "step": 751 + }, + { + "epoch": 1.1955484896661368, + "grad_norm": 0.6942691289599621, + "learning_rate": 3.003856175649076e-05, + "loss": 0.1691, + "step": 752 + }, + { + "epoch": 1.1971383147853736, + "grad_norm": 0.8745700553017423, + "learning_rate": 3.003907386548513e-05, + "loss": 0.2151, + "step": 753 + }, + { + "epoch": 1.1987281399046106, + "grad_norm": 0.6141008459907198, + "learning_rate": 3.0039589345892304e-05, + "loss": 0.2172, + "step": 754 + }, + { + "epoch": 1.2003179650238474, + "grad_norm": 0.7400717305573242, + "learning_rate": 3.004010819753782e-05, + "loss": 0.1969, + "step": 755 + }, + { + "epoch": 1.2019077901430844, + "grad_norm": 1.456544355988812, + "learning_rate": 3.004063042024607e-05, + "loss": 0.2032, + "step": 756 + }, + { + "epoch": 1.2034976152623211, + "grad_norm": 1.3549580207856993, + "learning_rate": 3.0041156013840304e-05, + "loss": 0.16, + "step": 757 + }, + { + "epoch": 1.2050874403815581, + "grad_norm": 1.2320206799728637, + "learning_rate": 3.004168497814261e-05, + "loss": 0.2557, + "step": 758 + }, + { + "epoch": 1.206677265500795, + "grad_norm": 0.6908114887182963, + "learning_rate": 3.004221731297396e-05, + "loss": 0.2008, + "step": 759 + }, + { + "epoch": 1.2082670906200317, + "grad_norm": 1.0048467133330425, + "learning_rate": 3.0042753018154174e-05, + "loss": 0.3971, + "step": 760 + }, + { + "epoch": 1.2098569157392687, + "grad_norm": 0.9939338230550647, + "learning_rate": 3.004329209350196e-05, + "loss": 0.1645, + "step": 761 + }, + { + "epoch": 1.2114467408585057, + "grad_norm": 0.8042681411868798, + "learning_rate": 3.0043834538834827e-05, + "loss": 0.1887, + "step": 762 + }, + { + "epoch": 1.2130365659777425, + "grad_norm": 1.0602905139369727, + "learning_rate": 3.0044380353969195e-05, + "loss": 0.1986, + "step": 763 + }, + { + "epoch": 1.2146263910969792, + "grad_norm": 1.5885686015755542, + "learning_rate": 3.0044929538720324e-05, + "loss": 0.1769, + "step": 764 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 8.572598868690184, + "learning_rate": 3.004548209290234e-05, + "loss": 16.3842, + "step": 765 + }, + { + "epoch": 1.217806041335453, + "grad_norm": 1.1234669067700735, + "learning_rate": 3.0046038016328214e-05, + "loss": 0.2212, + "step": 766 + }, + { + "epoch": 1.21939586645469, + "grad_norm": 1.2345499196619065, + "learning_rate": 3.004659730880978e-05, + "loss": 0.1988, + "step": 767 + }, + { + "epoch": 1.2209856915739268, + "grad_norm": 1.3952594069000674, + "learning_rate": 3.0047159970157762e-05, + "loss": 0.206, + "step": 768 + }, + { + "epoch": 1.2225755166931638, + "grad_norm": 1.4284866874960678, + "learning_rate": 3.0047726000181693e-05, + "loss": 0.2888, + "step": 769 + }, + { + "epoch": 1.2241653418124006, + "grad_norm": 0.6553585379114284, + "learning_rate": 3.0048295398689997e-05, + "loss": 0.1554, + "step": 770 + }, + { + "epoch": 1.2257551669316376, + "grad_norm": 0.966422509607705, + "learning_rate": 3.0048868165489972e-05, + "loss": 0.2207, + "step": 771 + }, + { + "epoch": 1.2273449920508743, + "grad_norm": 1.3265238901837997, + "learning_rate": 3.0049444300387737e-05, + "loss": 0.2271, + "step": 772 + }, + { + "epoch": 1.2289348171701113, + "grad_norm": 2.0733680500106906, + "learning_rate": 3.00500238031883e-05, + "loss": 0.3018, + "step": 773 + }, + { + "epoch": 1.230524642289348, + "grad_norm": 1.7820103069693711, + "learning_rate": 3.0050606673695528e-05, + "loss": 0.2597, + "step": 774 + }, + { + "epoch": 1.232114467408585, + "grad_norm": 1.6044999969137503, + "learning_rate": 3.0051192911712115e-05, + "loss": 0.246, + "step": 775 + }, + { + "epoch": 1.2337042925278219, + "grad_norm": 0.7609386579343164, + "learning_rate": 3.0051782517039675e-05, + "loss": 0.1615, + "step": 776 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.7873628425158388, + "learning_rate": 3.005237548947861e-05, + "loss": 0.201, + "step": 777 + }, + { + "epoch": 1.2368839427662957, + "grad_norm": 1.417914849653636, + "learning_rate": 3.005297182882826e-05, + "loss": 0.2042, + "step": 778 + }, + { + "epoch": 1.2384737678855327, + "grad_norm": 3.5276793435719482, + "learning_rate": 3.005357153488675e-05, + "loss": 0.3237, + "step": 779 + }, + { + "epoch": 1.2400635930047694, + "grad_norm": 0.9157155060139784, + "learning_rate": 3.005417460745113e-05, + "loss": 0.2949, + "step": 780 + }, + { + "epoch": 1.2416534181240064, + "grad_norm": 1.0359542930699284, + "learning_rate": 3.005478104631727e-05, + "loss": 0.1611, + "step": 781 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.5770360344770976, + "learning_rate": 3.0055390851279902e-05, + "loss": 0.1837, + "step": 782 + }, + { + "epoch": 1.2448330683624802, + "grad_norm": 1.3851905903665145, + "learning_rate": 3.0056004022132648e-05, + "loss": 0.3399, + "step": 783 + }, + { + "epoch": 1.246422893481717, + "grad_norm": 1.0969313876374431, + "learning_rate": 3.0056620558667954e-05, + "loss": 0.2424, + "step": 784 + }, + { + "epoch": 1.248012718600954, + "grad_norm": 1.2255312794821909, + "learning_rate": 3.0057240460677158e-05, + "loss": 0.1819, + "step": 785 + }, + { + "epoch": 1.2496025437201908, + "grad_norm": 0.6217081656017792, + "learning_rate": 3.0057863727950443e-05, + "loss": 0.1814, + "step": 786 + }, + { + "epoch": 1.2511923688394275, + "grad_norm": 2.259275293684557, + "learning_rate": 3.0058490360276844e-05, + "loss": 0.1968, + "step": 787 + }, + { + "epoch": 1.2527821939586645, + "grad_norm": 33.305238996872305, + "learning_rate": 3.005912035744429e-05, + "loss": 30.4028, + "step": 788 + }, + { + "epoch": 1.2543720190779015, + "grad_norm": 0.8373926477766842, + "learning_rate": 3.005975371923953e-05, + "loss": 0.2383, + "step": 789 + }, + { + "epoch": 1.2559618441971383, + "grad_norm": 0.9624395872583089, + "learning_rate": 3.0060390445448207e-05, + "loss": 0.2119, + "step": 790 + }, + { + "epoch": 1.257551669316375, + "grad_norm": 1.3852595564730366, + "learning_rate": 3.0061030535854805e-05, + "loss": 0.2476, + "step": 791 + }, + { + "epoch": 1.259141494435612, + "grad_norm": 0.9154353270724166, + "learning_rate": 3.006167399024267e-05, + "loss": 0.1856, + "step": 792 + }, + { + "epoch": 1.260731319554849, + "grad_norm": 1.342732112673621, + "learning_rate": 3.0062320808394038e-05, + "loss": 0.1877, + "step": 793 + }, + { + "epoch": 1.2623211446740858, + "grad_norm": 1.5805588353923163, + "learning_rate": 3.0062970990089966e-05, + "loss": 0.1977, + "step": 794 + }, + { + "epoch": 1.2639109697933226, + "grad_norm": 0.8193460966900378, + "learning_rate": 3.0063624535110395e-05, + "loss": 0.1636, + "step": 795 + }, + { + "epoch": 1.2655007949125596, + "grad_norm": 1.7480720900396385, + "learning_rate": 3.0064281443234124e-05, + "loss": 0.2202, + "step": 796 + }, + { + "epoch": 1.2670906200317966, + "grad_norm": 0.8360298390813324, + "learning_rate": 3.006494171423882e-05, + "loss": 0.1836, + "step": 797 + }, + { + "epoch": 1.2686804451510334, + "grad_norm": 1.0019310752543276, + "learning_rate": 3.006560534790099e-05, + "loss": 0.197, + "step": 798 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 1.3541536849473803, + "learning_rate": 3.0066272343996042e-05, + "loss": 0.2492, + "step": 799 + }, + { + "epoch": 1.2718600953895072, + "grad_norm": 1.2661965553614343, + "learning_rate": 3.006694270229819e-05, + "loss": 0.2518, + "step": 800 + }, + { + "epoch": 1.2734499205087442, + "grad_norm": 1.5746695145803955, + "learning_rate": 3.0067616422580567e-05, + "loss": 0.2524, + "step": 801 + }, + { + "epoch": 1.275039745627981, + "grad_norm": 1.5142345958159908, + "learning_rate": 3.0068293504615137e-05, + "loss": 0.7057, + "step": 802 + }, + { + "epoch": 1.2766295707472177, + "grad_norm": 1.4115845920403867, + "learning_rate": 3.0068973948172732e-05, + "loss": 0.2185, + "step": 803 + }, + { + "epoch": 1.2782193958664547, + "grad_norm": 0.9711763828341654, + "learning_rate": 3.0069657753023048e-05, + "loss": 0.2486, + "step": 804 + }, + { + "epoch": 1.2798092209856915, + "grad_norm": 1.0006831313664426, + "learning_rate": 3.0070344918934633e-05, + "loss": 0.1453, + "step": 805 + }, + { + "epoch": 1.2813990461049285, + "grad_norm": 1.037333183685246, + "learning_rate": 3.0071035445674916e-05, + "loss": 0.1607, + "step": 806 + }, + { + "epoch": 1.2829888712241653, + "grad_norm": 1.8406570681094314, + "learning_rate": 3.007172933301017e-05, + "loss": 0.1836, + "step": 807 + }, + { + "epoch": 1.2845786963434023, + "grad_norm": 0.7318388590187899, + "learning_rate": 3.0072426580705546e-05, + "loss": 0.2004, + "step": 808 + }, + { + "epoch": 1.286168521462639, + "grad_norm": 0.9238310107679456, + "learning_rate": 3.0073127188525044e-05, + "loss": 0.1967, + "step": 809 + }, + { + "epoch": 1.287758346581876, + "grad_norm": 1.3424978332659343, + "learning_rate": 3.0073831156231546e-05, + "loss": 0.1955, + "step": 810 + }, + { + "epoch": 1.2893481717011128, + "grad_norm": 1.369310769603383, + "learning_rate": 3.007453848358678e-05, + "loss": 0.2139, + "step": 811 + }, + { + "epoch": 1.2909379968203498, + "grad_norm": 1.686854262564021, + "learning_rate": 3.0075249170351336e-05, + "loss": 0.2092, + "step": 812 + }, + { + "epoch": 1.2925278219395866, + "grad_norm": 1.121282028074738, + "learning_rate": 3.0075963216284673e-05, + "loss": 0.1689, + "step": 813 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 1.0692057808991164, + "learning_rate": 3.0076680621145115e-05, + "loss": 0.2817, + "step": 814 + }, + { + "epoch": 1.2957074721780604, + "grad_norm": 0.8861886635461192, + "learning_rate": 3.0077401384689846e-05, + "loss": 0.1537, + "step": 815 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 1.161476359594777, + "learning_rate": 3.0078125506674913e-05, + "loss": 0.1986, + "step": 816 + }, + { + "epoch": 1.2988871224165341, + "grad_norm": 1.5436705101346593, + "learning_rate": 3.007885298685522e-05, + "loss": 0.2138, + "step": 817 + }, + { + "epoch": 1.3004769475357711, + "grad_norm": 1.8051666878249613, + "learning_rate": 3.0079583824984557e-05, + "loss": 0.243, + "step": 818 + }, + { + "epoch": 1.302066772655008, + "grad_norm": 0.9784423637175137, + "learning_rate": 3.0080318020815553e-05, + "loss": 0.159, + "step": 819 + }, + { + "epoch": 1.303656597774245, + "grad_norm": 2.309855956485202, + "learning_rate": 3.0081055574099707e-05, + "loss": 0.3139, + "step": 820 + }, + { + "epoch": 1.3052464228934817, + "grad_norm": 0.9276127761408681, + "learning_rate": 3.008179648458739e-05, + "loss": 0.2219, + "step": 821 + }, + { + "epoch": 1.3068362480127185, + "grad_norm": 0.7304748699703378, + "learning_rate": 3.0082540752027812e-05, + "loss": 0.1646, + "step": 822 + }, + { + "epoch": 1.3084260731319555, + "grad_norm": 14.64675823800732, + "learning_rate": 3.00832883761691e-05, + "loss": 15.586, + "step": 823 + }, + { + "epoch": 1.3100158982511925, + "grad_norm": 1.2378236279257848, + "learning_rate": 3.0084039356758177e-05, + "loss": 0.2368, + "step": 824 + }, + { + "epoch": 1.3116057233704292, + "grad_norm": 1.4936926507539516, + "learning_rate": 3.008479369354088e-05, + "loss": 0.2347, + "step": 825 + }, + { + "epoch": 1.313195548489666, + "grad_norm": 1.0890834190802827, + "learning_rate": 3.00855513862619e-05, + "loss": 0.2153, + "step": 826 + }, + { + "epoch": 1.314785373608903, + "grad_norm": 0.7375420724284333, + "learning_rate": 3.0086312434664765e-05, + "loss": 0.1897, + "step": 827 + }, + { + "epoch": 1.31637519872814, + "grad_norm": 1.4687967108843725, + "learning_rate": 3.00870768384919e-05, + "loss": 0.2224, + "step": 828 + }, + { + "epoch": 1.3179650238473768, + "grad_norm": 1.2028786692470428, + "learning_rate": 3.0087844597484587e-05, + "loss": 0.2819, + "step": 829 + }, + { + "epoch": 1.3195548489666136, + "grad_norm": 23.29656376556712, + "learning_rate": 3.0088615711382948e-05, + "loss": 29.9045, + "step": 830 + }, + { + "epoch": 1.3211446740858506, + "grad_norm": 2.366796724192845, + "learning_rate": 3.008939017992602e-05, + "loss": 0.2086, + "step": 831 + }, + { + "epoch": 1.3227344992050876, + "grad_norm": 0.855058340401459, + "learning_rate": 3.0090168002851636e-05, + "loss": 0.1756, + "step": 832 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.9771343101291011, + "learning_rate": 3.0090949179896565e-05, + "loss": 0.177, + "step": 833 + }, + { + "epoch": 1.3259141494435611, + "grad_norm": 1.4331130342697376, + "learning_rate": 3.0091733710796384e-05, + "loss": 0.2416, + "step": 834 + }, + { + "epoch": 1.3275039745627981, + "grad_norm": 1.0121241450697682, + "learning_rate": 3.0092521595285568e-05, + "loss": 0.1753, + "step": 835 + }, + { + "epoch": 1.329093799682035, + "grad_norm": 1.3819689284377823, + "learning_rate": 3.0093312833097437e-05, + "loss": 0.2043, + "step": 836 + }, + { + "epoch": 1.330683624801272, + "grad_norm": 0.7731294905433505, + "learning_rate": 3.0094107423964208e-05, + "loss": 0.2082, + "step": 837 + }, + { + "epoch": 1.3322734499205087, + "grad_norm": 0.7444981052077096, + "learning_rate": 3.0094905367616906e-05, + "loss": 0.134, + "step": 838 + }, + { + "epoch": 1.3338632750397457, + "grad_norm": 3.060526791884808, + "learning_rate": 3.0095706663785498e-05, + "loss": 0.2738, + "step": 839 + }, + { + "epoch": 1.3354531001589824, + "grad_norm": 1.317461446818166, + "learning_rate": 3.0096511312198732e-05, + "loss": 0.2112, + "step": 840 + }, + { + "epoch": 1.3370429252782194, + "grad_norm": 1.2935365377902823, + "learning_rate": 3.0097319312584298e-05, + "loss": 0.261, + "step": 841 + }, + { + "epoch": 1.3386327503974562, + "grad_norm": 1.602858597091533, + "learning_rate": 3.0098130664668703e-05, + "loss": 0.2384, + "step": 842 + }, + { + "epoch": 1.3402225755166932, + "grad_norm": 1.1569057949109862, + "learning_rate": 3.0098945368177318e-05, + "loss": 0.2119, + "step": 843 + }, + { + "epoch": 1.34181240063593, + "grad_norm": 1.0456882112116346, + "learning_rate": 3.0099763422834424e-05, + "loss": 0.183, + "step": 844 + }, + { + "epoch": 1.343402225755167, + "grad_norm": 0.9923362747451282, + "learning_rate": 3.0100584828363125e-05, + "loss": 0.1606, + "step": 845 + }, + { + "epoch": 1.3449920508744038, + "grad_norm": 22.01693406696935, + "learning_rate": 3.0101409584485403e-05, + "loss": 28.9673, + "step": 846 + }, + { + "epoch": 1.3465818759936408, + "grad_norm": 1.351951339377561, + "learning_rate": 3.0102237690922108e-05, + "loss": 0.2667, + "step": 847 + }, + { + "epoch": 1.3481717011128775, + "grad_norm": 1.416879158804829, + "learning_rate": 3.0103069147392967e-05, + "loss": 0.442, + "step": 848 + }, + { + "epoch": 1.3497615262321145, + "grad_norm": 1.8620752797877833, + "learning_rate": 3.0103903953616543e-05, + "loss": 0.3537, + "step": 849 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 1.9014671785542079, + "learning_rate": 3.0104742109310305e-05, + "loss": 0.2472, + "step": 850 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 1.1523999923145103, + "learning_rate": 3.0105583614190558e-05, + "loss": 0.2196, + "step": 851 + }, + { + "epoch": 1.354531001589825, + "grad_norm": 1.2084480862630775, + "learning_rate": 3.0106428467972476e-05, + "loss": 0.2012, + "step": 852 + }, + { + "epoch": 1.3561208267090619, + "grad_norm": 1.1004271284305638, + "learning_rate": 3.0107276670370118e-05, + "loss": 0.2708, + "step": 853 + }, + { + "epoch": 1.3577106518282989, + "grad_norm": 1.4587022076778249, + "learning_rate": 3.0108128221096396e-05, + "loss": 0.2202, + "step": 854 + }, + { + "epoch": 1.3593004769475359, + "grad_norm": 1.2345319072548069, + "learning_rate": 3.010898311986308e-05, + "loss": 0.2529, + "step": 855 + }, + { + "epoch": 1.3608903020667726, + "grad_norm": 1.1949215048100184, + "learning_rate": 3.0109841366380828e-05, + "loss": 0.1775, + "step": 856 + }, + { + "epoch": 1.3624801271860094, + "grad_norm": 2.1709009442675486, + "learning_rate": 3.0110702960359164e-05, + "loss": 0.2037, + "step": 857 + }, + { + "epoch": 1.3640699523052464, + "grad_norm": 1.7798981566833383, + "learning_rate": 3.0111567901506444e-05, + "loss": 0.2198, + "step": 858 + }, + { + "epoch": 1.3656597774244834, + "grad_norm": 2.5850752638868832, + "learning_rate": 3.0112436189529936e-05, + "loss": 0.2908, + "step": 859 + }, + { + "epoch": 1.3672496025437202, + "grad_norm": 1.29148844483089, + "learning_rate": 3.0113307824135764e-05, + "loss": 0.2208, + "step": 860 + }, + { + "epoch": 1.368839427662957, + "grad_norm": 1.11118813127308, + "learning_rate": 3.011418280502889e-05, + "loss": 0.1947, + "step": 861 + }, + { + "epoch": 1.370429252782194, + "grad_norm": 2.9678883622061236, + "learning_rate": 3.0115061131913166e-05, + "loss": 0.2461, + "step": 862 + }, + { + "epoch": 1.372019077901431, + "grad_norm": 1.667169318803546, + "learning_rate": 3.0115942804491326e-05, + "loss": 0.2413, + "step": 863 + }, + { + "epoch": 1.3736089030206677, + "grad_norm": 2.6753934201807468, + "learning_rate": 3.011682782246494e-05, + "loss": 0.2481, + "step": 864 + }, + { + "epoch": 1.3751987281399045, + "grad_norm": 1.755937072758792, + "learning_rate": 3.011771618553447e-05, + "loss": 0.3807, + "step": 865 + }, + { + "epoch": 1.3767885532591415, + "grad_norm": 1.4280951549376069, + "learning_rate": 3.0118607893399245e-05, + "loss": 0.2097, + "step": 866 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 4.0611054950607315, + "learning_rate": 3.0119502945757437e-05, + "loss": 0.2905, + "step": 867 + }, + { + "epoch": 1.3799682034976153, + "grad_norm": 1.574806507287194, + "learning_rate": 3.012040134230611e-05, + "loss": 0.2587, + "step": 868 + }, + { + "epoch": 1.381558028616852, + "grad_norm": 1.2915169012661318, + "learning_rate": 3.012130308274119e-05, + "loss": 0.2202, + "step": 869 + }, + { + "epoch": 1.383147853736089, + "grad_norm": 2.4218785615165497, + "learning_rate": 3.0122208166757473e-05, + "loss": 0.6821, + "step": 870 + }, + { + "epoch": 1.3847376788553258, + "grad_norm": 2.2373306717355512, + "learning_rate": 3.0123116594048624e-05, + "loss": 0.2793, + "step": 871 + }, + { + "epoch": 1.3863275039745628, + "grad_norm": 1.6023318381656593, + "learning_rate": 3.0124028364307165e-05, + "loss": 0.1828, + "step": 872 + }, + { + "epoch": 1.3879173290937996, + "grad_norm": 1.225687758839214, + "learning_rate": 3.0124943477224493e-05, + "loss": 0.1997, + "step": 873 + }, + { + "epoch": 1.3895071542130366, + "grad_norm": 1.381824957715516, + "learning_rate": 3.0125861932490883e-05, + "loss": 0.2219, + "step": 874 + }, + { + "epoch": 1.3910969793322734, + "grad_norm": 1.4890748513157812, + "learning_rate": 3.0126783729795474e-05, + "loss": 0.2373, + "step": 875 + }, + { + "epoch": 1.3926868044515104, + "grad_norm": 1.8693579765653567, + "learning_rate": 3.012770886882626e-05, + "loss": 0.2776, + "step": 876 + }, + { + "epoch": 1.3942766295707472, + "grad_norm": 1.418823598558248, + "learning_rate": 3.0128637349270122e-05, + "loss": 0.2283, + "step": 877 + }, + { + "epoch": 1.3958664546899842, + "grad_norm": 2.32597871848742, + "learning_rate": 3.0129569170812802e-05, + "loss": 0.278, + "step": 878 + }, + { + "epoch": 1.397456279809221, + "grad_norm": 1.3603750153749643, + "learning_rate": 3.0130504333138905e-05, + "loss": 0.2469, + "step": 879 + }, + { + "epoch": 1.399046104928458, + "grad_norm": 1.4727902631137277, + "learning_rate": 3.013144283593193e-05, + "loss": 0.2477, + "step": 880 + }, + { + "epoch": 1.4006359300476947, + "grad_norm": 1.346381053880536, + "learning_rate": 3.0132384678874206e-05, + "loss": 0.1832, + "step": 881 + }, + { + "epoch": 1.4022257551669317, + "grad_norm": 1.3112243961597747, + "learning_rate": 3.0133329861646977e-05, + "loss": 0.2444, + "step": 882 + }, + { + "epoch": 1.4038155802861685, + "grad_norm": 1.4578806038947298, + "learning_rate": 3.0134278383930308e-05, + "loss": 0.2265, + "step": 883 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 1.809708055055694, + "learning_rate": 3.0135230245403176e-05, + "loss": 0.2963, + "step": 884 + }, + { + "epoch": 1.4069952305246423, + "grad_norm": 1.7017998120695168, + "learning_rate": 3.01361854457434e-05, + "loss": 0.2172, + "step": 885 + }, + { + "epoch": 1.4085850556438793, + "grad_norm": 1.2430424848140813, + "learning_rate": 3.0137143984627687e-05, + "loss": 0.2237, + "step": 886 + }, + { + "epoch": 1.410174880763116, + "grad_norm": 1.255817553321443, + "learning_rate": 3.0138105861731607e-05, + "loss": 0.2524, + "step": 887 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.9168654408152741, + "learning_rate": 3.013907107672959e-05, + "loss": 0.1558, + "step": 888 + }, + { + "epoch": 1.4133545310015898, + "grad_norm": 2.1306284057490297, + "learning_rate": 3.0140039629294952e-05, + "loss": 0.2802, + "step": 889 + }, + { + "epoch": 1.4149443561208268, + "grad_norm": 1.4155743544498551, + "learning_rate": 3.0141011519099878e-05, + "loss": 0.2404, + "step": 890 + }, + { + "epoch": 1.4165341812400636, + "grad_norm": 1.5796315947385693, + "learning_rate": 3.014198674581541e-05, + "loss": 0.1945, + "step": 891 + }, + { + "epoch": 1.4181240063593004, + "grad_norm": 1.4635847421138366, + "learning_rate": 3.014296530911147e-05, + "loss": 0.2789, + "step": 892 + }, + { + "epoch": 1.4197138314785374, + "grad_norm": 16.435185938297618, + "learning_rate": 3.014394720865685e-05, + "loss": 13.6999, + "step": 893 + }, + { + "epoch": 1.4213036565977744, + "grad_norm": 0.6246273850460909, + "learning_rate": 3.014493244411921e-05, + "loss": 0.1835, + "step": 894 + }, + { + "epoch": 1.4228934817170111, + "grad_norm": 1.0282269555885089, + "learning_rate": 3.0145921015165098e-05, + "loss": 0.2071, + "step": 895 + }, + { + "epoch": 1.424483306836248, + "grad_norm": 0.9012248805350297, + "learning_rate": 3.0146912921459907e-05, + "loss": 0.871, + "step": 896 + }, + { + "epoch": 1.426073131955485, + "grad_norm": 0.8977785080790269, + "learning_rate": 3.0147908162667912e-05, + "loss": 0.238, + "step": 897 + }, + { + "epoch": 1.427662957074722, + "grad_norm": 1.687812654963962, + "learning_rate": 3.0148906738452266e-05, + "loss": 0.338, + "step": 898 + }, + { + "epoch": 1.4292527821939587, + "grad_norm": 1.336690455955879, + "learning_rate": 3.0149908648474973e-05, + "loss": 0.2349, + "step": 899 + }, + { + "epoch": 1.4308426073131955, + "grad_norm": 1.1672328680973998, + "learning_rate": 3.0150913892396944e-05, + "loss": 0.2123, + "step": 900 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 16.932585097407248, + "learning_rate": 3.0151922469877916e-05, + "loss": 20.3812, + "step": 901 + }, + { + "epoch": 1.4340222575516695, + "grad_norm": 2.4584874470032, + "learning_rate": 3.015293438057655e-05, + "loss": 0.2275, + "step": 902 + }, + { + "epoch": 1.4356120826709062, + "grad_norm": 1.1736699657732497, + "learning_rate": 3.0153949624150332e-05, + "loss": 0.1995, + "step": 903 + }, + { + "epoch": 1.437201907790143, + "grad_norm": 1.316575957160399, + "learning_rate": 3.0154968200255632e-05, + "loss": 0.2001, + "step": 904 + }, + { + "epoch": 1.43879173290938, + "grad_norm": 1.2790587596447025, + "learning_rate": 3.0155990108547726e-05, + "loss": 0.2156, + "step": 905 + }, + { + "epoch": 1.4403815580286168, + "grad_norm": 1.5542338028877063, + "learning_rate": 3.0157015348680703e-05, + "loss": 0.3059, + "step": 906 + }, + { + "epoch": 1.4419713831478538, + "grad_norm": 0.8785217394873726, + "learning_rate": 3.0158043920307588e-05, + "loss": 0.194, + "step": 907 + }, + { + "epoch": 1.4435612082670906, + "grad_norm": 1.014232513583885, + "learning_rate": 3.0159075823080216e-05, + "loss": 0.2055, + "step": 908 + }, + { + "epoch": 1.4451510333863276, + "grad_norm": 2.588731507982686, + "learning_rate": 3.0160111056649346e-05, + "loss": 0.3299, + "step": 909 + }, + { + "epoch": 1.4467408585055643, + "grad_norm": 4.90682119754212, + "learning_rate": 3.016114962066458e-05, + "loss": 0.2438, + "step": 910 + }, + { + "epoch": 1.4483306836248013, + "grad_norm": 1.0354297708483435, + "learning_rate": 3.016219151477441e-05, + "loss": 0.2581, + "step": 911 + }, + { + "epoch": 1.449920508744038, + "grad_norm": 1.338112520760405, + "learning_rate": 3.0163236738626186e-05, + "loss": 0.2241, + "step": 912 + }, + { + "epoch": 1.451510333863275, + "grad_norm": 0.9493098029348258, + "learning_rate": 3.016428529186614e-05, + "loss": 0.2269, + "step": 913 + }, + { + "epoch": 1.4531001589825119, + "grad_norm": 0.9583977133614148, + "learning_rate": 3.016533717413937e-05, + "loss": 0.1663, + "step": 914 + }, + { + "epoch": 1.4546899841017489, + "grad_norm": 1.7063586588431303, + "learning_rate": 3.0166392385089863e-05, + "loss": 0.2247, + "step": 915 + }, + { + "epoch": 1.4562798092209857, + "grad_norm": 0.9554173200113827, + "learning_rate": 3.0167450924360454e-05, + "loss": 0.1655, + "step": 916 + }, + { + "epoch": 1.4578696343402227, + "grad_norm": 1.49206518916465, + "learning_rate": 3.0168512791592876e-05, + "loss": 0.198, + "step": 917 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 1.3305470627065514, + "learning_rate": 3.016957798642772e-05, + "loss": 0.198, + "step": 918 + }, + { + "epoch": 1.4610492845786962, + "grad_norm": 2.5890646646748885, + "learning_rate": 3.017064650850446e-05, + "loss": 0.36, + "step": 919 + }, + { + "epoch": 1.4626391096979332, + "grad_norm": 1.3259346263383753, + "learning_rate": 3.0171718357461436e-05, + "loss": 0.1933, + "step": 920 + }, + { + "epoch": 1.4642289348171702, + "grad_norm": 1.2617894160843537, + "learning_rate": 3.0172793532935862e-05, + "loss": 0.251, + "step": 921 + }, + { + "epoch": 1.465818759936407, + "grad_norm": 2.421694446667709, + "learning_rate": 3.0173872034563853e-05, + "loss": 0.2503, + "step": 922 + }, + { + "epoch": 1.4674085850556438, + "grad_norm": 0.7648883678245367, + "learning_rate": 3.0174953861980344e-05, + "loss": 0.1747, + "step": 923 + }, + { + "epoch": 1.4689984101748808, + "grad_norm": 0.9160641425796828, + "learning_rate": 3.0176039014819198e-05, + "loss": 0.1673, + "step": 924 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.1373239847956165, + "learning_rate": 3.017712749271311e-05, + "loss": 0.2425, + "step": 925 + }, + { + "epoch": 1.4721780604133545, + "grad_norm": 0.9160398991089251, + "learning_rate": 3.017821929529369e-05, + "loss": 0.2094, + "step": 926 + }, + { + "epoch": 1.4737678855325913, + "grad_norm": 1.0430762381227112, + "learning_rate": 3.0179314422191398e-05, + "loss": 0.1925, + "step": 927 + }, + { + "epoch": 1.4753577106518283, + "grad_norm": 1.3191802162196546, + "learning_rate": 3.0180412873035567e-05, + "loss": 0.2344, + "step": 928 + }, + { + "epoch": 1.4769475357710653, + "grad_norm": 4.947424583482794, + "learning_rate": 3.0181514647454415e-05, + "loss": 0.219, + "step": 929 + }, + { + "epoch": 1.478537360890302, + "grad_norm": 1.5146220055792687, + "learning_rate": 3.018261974507502e-05, + "loss": 0.215, + "step": 930 + }, + { + "epoch": 1.4801271860095389, + "grad_norm": 0.9962538822639876, + "learning_rate": 3.018372816552336e-05, + "loss": 0.2011, + "step": 931 + }, + { + "epoch": 1.4817170111287759, + "grad_norm": 2.341206820361946, + "learning_rate": 3.0184839908424272e-05, + "loss": 0.2776, + "step": 932 + }, + { + "epoch": 1.4833068362480128, + "grad_norm": 2.1388460215707865, + "learning_rate": 3.0185954973401477e-05, + "loss": 0.2068, + "step": 933 + }, + { + "epoch": 1.4848966613672496, + "grad_norm": 3.694706601643514, + "learning_rate": 3.0187073360077545e-05, + "loss": 0.3092, + "step": 934 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.9262196418337145, + "learning_rate": 3.0188195068073968e-05, + "loss": 0.2149, + "step": 935 + }, + { + "epoch": 1.4880763116057234, + "grad_norm": 1.1163350507429823, + "learning_rate": 3.018932009701107e-05, + "loss": 0.2333, + "step": 936 + }, + { + "epoch": 1.4896661367249602, + "grad_norm": 0.9522124896965085, + "learning_rate": 3.019044844650808e-05, + "loss": 0.1414, + "step": 937 + }, + { + "epoch": 1.4912559618441972, + "grad_norm": 1.2939616754619978, + "learning_rate": 3.019158011618309e-05, + "loss": 0.2287, + "step": 938 + }, + { + "epoch": 1.492845786963434, + "grad_norm": 1.0092810060608215, + "learning_rate": 3.0192715105653073e-05, + "loss": 0.2127, + "step": 939 + }, + { + "epoch": 1.494435612082671, + "grad_norm": 1.0406147993798665, + "learning_rate": 3.0193853414533866e-05, + "loss": 0.2095, + "step": 940 + }, + { + "epoch": 1.4960254372019077, + "grad_norm": 0.9071449289935577, + "learning_rate": 3.019499504244021e-05, + "loss": 0.1844, + "step": 941 + }, + { + "epoch": 1.4976152623211447, + "grad_norm": 1.7153900258721835, + "learning_rate": 3.0196139988985688e-05, + "loss": 0.262, + "step": 942 + }, + { + "epoch": 1.4992050874403815, + "grad_norm": 0.5904487087150878, + "learning_rate": 3.019728825378278e-05, + "loss": 0.145, + "step": 943 + }, + { + "epoch": 1.5007949125596185, + "grad_norm": 1.8247822877116087, + "learning_rate": 3.0198439836442845e-05, + "loss": 0.193, + "step": 944 + }, + { + "epoch": 1.5023847376788553, + "grad_norm": 1.579385206385149, + "learning_rate": 3.019959473657612e-05, + "loss": 0.276, + "step": 945 + }, + { + "epoch": 1.503974562798092, + "grad_norm": 21.560067862668113, + "learning_rate": 3.020075295379171e-05, + "loss": 29.8694, + "step": 946 + }, + { + "epoch": 1.505564387917329, + "grad_norm": 0.972419391222347, + "learning_rate": 3.020191448769758e-05, + "loss": 0.1602, + "step": 947 + }, + { + "epoch": 1.507154213036566, + "grad_norm": 1.0211964289986513, + "learning_rate": 3.020307933790062e-05, + "loss": 0.1912, + "step": 948 + }, + { + "epoch": 1.5087440381558028, + "grad_norm": 1.3650571017645552, + "learning_rate": 3.0204247504006562e-05, + "loss": 0.228, + "step": 949 + }, + { + "epoch": 1.5103338632750396, + "grad_norm": 0.8363498829193096, + "learning_rate": 3.020541898562001e-05, + "loss": 0.1629, + "step": 950 + }, + { + "epoch": 1.5119236883942766, + "grad_norm": 2.1189402580073846, + "learning_rate": 3.0206593782344486e-05, + "loss": 0.365, + "step": 951 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 0.9685848960108028, + "learning_rate": 3.0207771893782342e-05, + "loss": 0.2005, + "step": 952 + }, + { + "epoch": 1.5151033386327504, + "grad_norm": 1.2180447744468175, + "learning_rate": 3.0208953319534837e-05, + "loss": 0.2581, + "step": 953 + }, + { + "epoch": 1.5166931637519872, + "grad_norm": 0.7446223748266646, + "learning_rate": 3.0210138059202102e-05, + "loss": 0.1906, + "step": 954 + }, + { + "epoch": 1.5182829888712241, + "grad_norm": 9.531759256497438, + "learning_rate": 3.021132611238315e-05, + "loss": 15.5882, + "step": 955 + }, + { + "epoch": 1.5198728139904611, + "grad_norm": 1.4016145484029638, + "learning_rate": 3.021251747867586e-05, + "loss": 0.2358, + "step": 956 + }, + { + "epoch": 1.521462639109698, + "grad_norm": 2.310035025908386, + "learning_rate": 3.0213712157677e-05, + "loss": 0.2706, + "step": 957 + }, + { + "epoch": 1.5230524642289347, + "grad_norm": 0.8749840567237552, + "learning_rate": 3.021491014898221e-05, + "loss": 0.2259, + "step": 958 + }, + { + "epoch": 1.5246422893481717, + "grad_norm": 0.9057572896693453, + "learning_rate": 3.0216111452186032e-05, + "loss": 0.1754, + "step": 959 + }, + { + "epoch": 1.5262321144674087, + "grad_norm": 1.3417570903182994, + "learning_rate": 3.021731606688185e-05, + "loss": 0.2093, + "step": 960 + }, + { + "epoch": 1.5278219395866455, + "grad_norm": 0.9549433719026599, + "learning_rate": 3.0218523992661945e-05, + "loss": 0.1812, + "step": 961 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 1.52526316606716, + "learning_rate": 3.021973522911749e-05, + "loss": 0.2287, + "step": 962 + }, + { + "epoch": 1.5310015898251192, + "grad_norm": 1.3326802009714298, + "learning_rate": 3.0220949775838515e-05, + "loss": 0.2242, + "step": 963 + }, + { + "epoch": 1.5325914149443562, + "grad_norm": 1.1738312452307538, + "learning_rate": 3.022216763241394e-05, + "loss": 0.2364, + "step": 964 + }, + { + "epoch": 1.534181240063593, + "grad_norm": 2.1191503748880214, + "learning_rate": 3.0223388798431565e-05, + "loss": 0.2455, + "step": 965 + }, + { + "epoch": 1.5357710651828298, + "grad_norm": 1.6451229321504048, + "learning_rate": 3.0224613273478083e-05, + "loss": 0.2382, + "step": 966 + }, + { + "epoch": 1.5373608903020668, + "grad_norm": 1.1760832431101025, + "learning_rate": 3.0225841057139037e-05, + "loss": 0.2042, + "step": 967 + }, + { + "epoch": 1.5389507154213038, + "grad_norm": 0.9870826186562225, + "learning_rate": 3.0227072148998876e-05, + "loss": 0.1567, + "step": 968 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 0.8098166878215339, + "learning_rate": 3.0228306548640926e-05, + "loss": 0.1664, + "step": 969 + }, + { + "epoch": 1.5421303656597773, + "grad_norm": 0.9277490490796043, + "learning_rate": 3.022954425564736e-05, + "loss": 0.193, + "step": 970 + }, + { + "epoch": 1.5437201907790143, + "grad_norm": 0.938893933094649, + "learning_rate": 3.0230785269599295e-05, + "loss": 0.1918, + "step": 971 + }, + { + "epoch": 1.5453100158982513, + "grad_norm": 1.3099503737392495, + "learning_rate": 3.0232029590076657e-05, + "loss": 0.1976, + "step": 972 + }, + { + "epoch": 1.5468998410174881, + "grad_norm": 0.8096349538133699, + "learning_rate": 3.0233277216658317e-05, + "loss": 0.2131, + "step": 973 + }, + { + "epoch": 1.548489666136725, + "grad_norm": 1.1287311944686815, + "learning_rate": 3.0234528148922e-05, + "loss": 0.2728, + "step": 974 + }, + { + "epoch": 1.550079491255962, + "grad_norm": 1.7202651937718745, + "learning_rate": 3.023578238644428e-05, + "loss": 0.2265, + "step": 975 + }, + { + "epoch": 1.551669316375199, + "grad_norm": 1.0953245974833956, + "learning_rate": 3.023703992880067e-05, + "loss": 0.2155, + "step": 976 + }, + { + "epoch": 1.5532591414944354, + "grad_norm": 1.070851671683841, + "learning_rate": 3.0238300775565523e-05, + "loss": 0.1758, + "step": 977 + }, + { + "epoch": 1.5548489666136724, + "grad_norm": 1.0680349712292683, + "learning_rate": 3.0239564926312096e-05, + "loss": 0.219, + "step": 978 + }, + { + "epoch": 1.5564387917329094, + "grad_norm": 2.095857618985453, + "learning_rate": 3.024083238061253e-05, + "loss": 0.2736, + "step": 979 + }, + { + "epoch": 1.5580286168521462, + "grad_norm": 1.2518626882494475, + "learning_rate": 3.0242103138037816e-05, + "loss": 0.1724, + "step": 980 + }, + { + "epoch": 1.559618441971383, + "grad_norm": 0.9340994254212922, + "learning_rate": 3.0243377198157862e-05, + "loss": 0.2236, + "step": 981 + }, + { + "epoch": 1.56120826709062, + "grad_norm": 1.345376697276294, + "learning_rate": 3.0244654560541437e-05, + "loss": 0.2625, + "step": 982 + }, + { + "epoch": 1.562798092209857, + "grad_norm": 0.7581661333420866, + "learning_rate": 3.0245935224756205e-05, + "loss": 0.1954, + "step": 983 + }, + { + "epoch": 1.5643879173290938, + "grad_norm": 0.7877809864590615, + "learning_rate": 3.0247219190368703e-05, + "loss": 0.2382, + "step": 984 + }, + { + "epoch": 1.5659777424483305, + "grad_norm": 0.7596260285949576, + "learning_rate": 3.0248506456944368e-05, + "loss": 0.1981, + "step": 985 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 1.3782681509129533, + "learning_rate": 3.0249797024047494e-05, + "loss": 0.1895, + "step": 986 + }, + { + "epoch": 1.5691573926868045, + "grad_norm": 1.6252659542161834, + "learning_rate": 3.0251090891241272e-05, + "loss": 0.224, + "step": 987 + }, + { + "epoch": 1.5707472178060413, + "grad_norm": 1.5071942409217602, + "learning_rate": 3.0252388058087784e-05, + "loss": 0.2437, + "step": 988 + }, + { + "epoch": 1.572337042925278, + "grad_norm": 1.3492742680725291, + "learning_rate": 3.0253688524147967e-05, + "loss": 0.2216, + "step": 989 + }, + { + "epoch": 1.573926868044515, + "grad_norm": 1.0371093488450234, + "learning_rate": 3.0254992288981687e-05, + "loss": 0.1956, + "step": 990 + }, + { + "epoch": 1.575516693163752, + "grad_norm": 1.0109677958154635, + "learning_rate": 3.0256299352147643e-05, + "loss": 0.195, + "step": 991 + }, + { + "epoch": 1.5771065182829889, + "grad_norm": 1.3165468239691032, + "learning_rate": 3.0257609713203464e-05, + "loss": 0.3447, + "step": 992 + }, + { + "epoch": 1.5786963434022256, + "grad_norm": 1.2345693498313024, + "learning_rate": 3.0258923371705615e-05, + "loss": 0.2736, + "step": 993 + }, + { + "epoch": 1.5802861685214626, + "grad_norm": 0.7140179416371315, + "learning_rate": 3.026024032720948e-05, + "loss": 0.1818, + "step": 994 + }, + { + "epoch": 1.5818759936406996, + "grad_norm": 0.8561793343494055, + "learning_rate": 3.0261560579269328e-05, + "loss": 0.2059, + "step": 995 + }, + { + "epoch": 1.5834658187599364, + "grad_norm": 0.6707325810119518, + "learning_rate": 3.0262884127438286e-05, + "loss": 0.1424, + "step": 996 + }, + { + "epoch": 1.5850556438791732, + "grad_norm": 1.1564566657592257, + "learning_rate": 3.02642109712684e-05, + "loss": 0.1924, + "step": 997 + }, + { + "epoch": 1.5866454689984102, + "grad_norm": 2.7521413005165845, + "learning_rate": 3.0265541110310563e-05, + "loss": 0.3609, + "step": 998 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 20.195928288242115, + "learning_rate": 3.0266874544114577e-05, + "loss": 28.6857, + "step": 999 + }, + { + "epoch": 1.589825119236884, + "grad_norm": 0.798479618225531, + "learning_rate": 3.026821127222912e-05, + "loss": 0.2308, + "step": 1000 + }, + { + "epoch": 1.5914149443561207, + "grad_norm": 0.7347165195035272, + "learning_rate": 3.026955129420176e-05, + "loss": 0.1767, + "step": 1001 + }, + { + "epoch": 1.5930047694753577, + "grad_norm": 1.6135172401601705, + "learning_rate": 3.0270894609578962e-05, + "loss": 0.2331, + "step": 1002 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 1.3851580712301266, + "learning_rate": 3.0272241217906033e-05, + "loss": 0.1947, + "step": 1003 + }, + { + "epoch": 1.5961844197138315, + "grad_norm": 1.198578547234958, + "learning_rate": 3.0273591118727226e-05, + "loss": 0.1809, + "step": 1004 + }, + { + "epoch": 1.5977742448330683, + "grad_norm": 0.6671479533763305, + "learning_rate": 3.0274944311585624e-05, + "loss": 0.139, + "step": 1005 + }, + { + "epoch": 1.5993640699523053, + "grad_norm": 1.1322573374797236, + "learning_rate": 3.0276300796023234e-05, + "loss": 0.2419, + "step": 1006 + }, + { + "epoch": 1.6009538950715423, + "grad_norm": 0.8070991166539896, + "learning_rate": 3.0277660571580933e-05, + "loss": 0.1624, + "step": 1007 + }, + { + "epoch": 1.602543720190779, + "grad_norm": 1.1182059640193802, + "learning_rate": 3.027902363779848e-05, + "loss": 0.2045, + "step": 1008 + }, + { + "epoch": 1.6041335453100158, + "grad_norm": 1.0690797330581425, + "learning_rate": 3.0280389994214533e-05, + "loss": 0.244, + "step": 1009 + }, + { + "epoch": 1.6057233704292528, + "grad_norm": 0.6576839834045582, + "learning_rate": 3.028175964036664e-05, + "loss": 0.127, + "step": 1010 + }, + { + "epoch": 1.6073131955484896, + "grad_norm": 1.116952111367835, + "learning_rate": 3.02831325757912e-05, + "loss": 0.1877, + "step": 1011 + }, + { + "epoch": 1.6089030206677264, + "grad_norm": 0.7363357072866892, + "learning_rate": 3.0284508800023537e-05, + "loss": 0.1853, + "step": 1012 + }, + { + "epoch": 1.6104928457869634, + "grad_norm": 1.3332374340482385, + "learning_rate": 3.0285888312597856e-05, + "loss": 0.2172, + "step": 1013 + }, + { + "epoch": 1.6120826709062004, + "grad_norm": 1.0798458422470192, + "learning_rate": 3.0287271113047227e-05, + "loss": 0.1868, + "step": 1014 + }, + { + "epoch": 1.6136724960254372, + "grad_norm": 1.4191422200107795, + "learning_rate": 3.028865720090364e-05, + "loss": 0.3528, + "step": 1015 + }, + { + "epoch": 1.615262321144674, + "grad_norm": 22.39968352243893, + "learning_rate": 3.0290046575697942e-05, + "loss": 29.3057, + "step": 1016 + }, + { + "epoch": 1.616852146263911, + "grad_norm": 0.8251643810932937, + "learning_rate": 3.0291439236959885e-05, + "loss": 0.216, + "step": 1017 + }, + { + "epoch": 1.618441971383148, + "grad_norm": 1.0129065451329513, + "learning_rate": 3.0292835184218094e-05, + "loss": 0.2122, + "step": 1018 + }, + { + "epoch": 1.6200317965023847, + "grad_norm": 1.3383227005604625, + "learning_rate": 3.02942344170001e-05, + "loss": 0.2426, + "step": 1019 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 1.039568889190428, + "learning_rate": 3.0295636934832317e-05, + "loss": 0.2075, + "step": 1020 + }, + { + "epoch": 1.6232114467408585, + "grad_norm": 1.4551943038682469, + "learning_rate": 3.029704273724004e-05, + "loss": 0.2207, + "step": 1021 + }, + { + "epoch": 1.6248012718600955, + "grad_norm": 1.7130016701184296, + "learning_rate": 3.029845182374745e-05, + "loss": 0.2328, + "step": 1022 + }, + { + "epoch": 1.6263910969793323, + "grad_norm": 1.3177197780658378, + "learning_rate": 3.029986419387762e-05, + "loss": 0.2341, + "step": 1023 + }, + { + "epoch": 1.627980922098569, + "grad_norm": 1.1661081801724766, + "learning_rate": 3.030127984715253e-05, + "loss": 0.1661, + "step": 1024 + }, + { + "epoch": 1.629570747217806, + "grad_norm": 1.2114763315599728, + "learning_rate": 3.0302698783093024e-05, + "loss": 0.2163, + "step": 1025 + }, + { + "epoch": 1.631160572337043, + "grad_norm": 1.7429045215392118, + "learning_rate": 3.0304121001218837e-05, + "loss": 0.2368, + "step": 1026 + }, + { + "epoch": 1.6327503974562798, + "grad_norm": 1.229976150320729, + "learning_rate": 3.0305546501048617e-05, + "loss": 0.3057, + "step": 1027 + }, + { + "epoch": 1.6343402225755166, + "grad_norm": 0.8277196856122726, + "learning_rate": 3.030697528209986e-05, + "loss": 0.1898, + "step": 1028 + }, + { + "epoch": 1.6359300476947536, + "grad_norm": 1.2518459872273497, + "learning_rate": 3.0308407343888985e-05, + "loss": 0.1998, + "step": 1029 + }, + { + "epoch": 1.6375198728139906, + "grad_norm": 1.1442554900078146, + "learning_rate": 3.0309842685931303e-05, + "loss": 0.2276, + "step": 1030 + }, + { + "epoch": 1.6391096979332274, + "grad_norm": 0.999737171772209, + "learning_rate": 3.0311281307740995e-05, + "loss": 0.1835, + "step": 1031 + }, + { + "epoch": 1.6406995230524641, + "grad_norm": 0.8762389730055639, + "learning_rate": 3.0312723208831133e-05, + "loss": 0.2216, + "step": 1032 + }, + { + "epoch": 1.6422893481717011, + "grad_norm": 0.8722779791702073, + "learning_rate": 3.0314168388713687e-05, + "loss": 0.2085, + "step": 1033 + }, + { + "epoch": 1.6438791732909381, + "grad_norm": 1.651788641872156, + "learning_rate": 3.031561684689953e-05, + "loss": 0.9461, + "step": 1034 + }, + { + "epoch": 1.645468998410175, + "grad_norm": 1.1482000486638182, + "learning_rate": 3.0317068582898385e-05, + "loss": 0.3155, + "step": 1035 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 1.365565134471463, + "learning_rate": 3.031852359621892e-05, + "loss": 0.2357, + "step": 1036 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 1.0540632683825748, + "learning_rate": 3.031998188636865e-05, + "loss": 0.2552, + "step": 1037 + }, + { + "epoch": 1.6502384737678857, + "grad_norm": 0.8327786071028546, + "learning_rate": 3.032144345285401e-05, + "loss": 0.1849, + "step": 1038 + }, + { + "epoch": 1.6518282988871225, + "grad_norm": 0.7438323277877603, + "learning_rate": 3.032290829518029e-05, + "loss": 0.1984, + "step": 1039 + }, + { + "epoch": 1.6534181240063592, + "grad_norm": 0.7180055674139668, + "learning_rate": 3.0324376412851707e-05, + "loss": 0.1646, + "step": 1040 + }, + { + "epoch": 1.6550079491255962, + "grad_norm": 1.026385899745576, + "learning_rate": 3.032584780537136e-05, + "loss": 0.2127, + "step": 1041 + }, + { + "epoch": 1.6565977742448332, + "grad_norm": 0.9562139614886938, + "learning_rate": 3.0327322472241228e-05, + "loss": 0.2218, + "step": 1042 + }, + { + "epoch": 1.6581875993640698, + "grad_norm": 6.774052589047219, + "learning_rate": 3.0328800412962206e-05, + "loss": 0.3589, + "step": 1043 + }, + { + "epoch": 1.6597774244833068, + "grad_norm": 0.9231033126659791, + "learning_rate": 3.0330281627034043e-05, + "loss": 0.1542, + "step": 1044 + }, + { + "epoch": 1.6613672496025438, + "grad_norm": 0.961980245210236, + "learning_rate": 3.0331766113955405e-05, + "loss": 0.2483, + "step": 1045 + }, + { + "epoch": 1.6629570747217806, + "grad_norm": 1.8238879872432594, + "learning_rate": 3.033325387322386e-05, + "loss": 0.2628, + "step": 1046 + }, + { + "epoch": 1.6645468998410173, + "grad_norm": 1.2353631896320034, + "learning_rate": 3.0334744904335844e-05, + "loss": 0.1984, + "step": 1047 + }, + { + "epoch": 1.6661367249602543, + "grad_norm": 1.5247606680022396, + "learning_rate": 3.03362392067867e-05, + "loss": 0.2095, + "step": 1048 + }, + { + "epoch": 1.6677265500794913, + "grad_norm": 1.6499831849247582, + "learning_rate": 3.033773678007067e-05, + "loss": 0.1796, + "step": 1049 + }, + { + "epoch": 1.669316375198728, + "grad_norm": 1.4813913359466746, + "learning_rate": 3.0339237623680876e-05, + "loss": 0.2163, + "step": 1050 + }, + { + "epoch": 1.6709062003179649, + "grad_norm": 7.596141787336853, + "learning_rate": 3.0340741737109322e-05, + "loss": 9.2847, + "step": 1051 + }, + { + "epoch": 1.6724960254372019, + "grad_norm": 1.0369600353881152, + "learning_rate": 3.034224911984693e-05, + "loss": 0.2121, + "step": 1052 + }, + { + "epoch": 1.6740858505564389, + "grad_norm": 1.1020226538041997, + "learning_rate": 3.034375977138351e-05, + "loss": 0.1804, + "step": 1053 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 0.908365170734913, + "learning_rate": 3.0345273691207747e-05, + "loss": 0.1553, + "step": 1054 + }, + { + "epoch": 1.6772655007949124, + "grad_norm": 1.2695625802478705, + "learning_rate": 3.034679087880726e-05, + "loss": 0.2499, + "step": 1055 + }, + { + "epoch": 1.6788553259141494, + "grad_norm": 0.8283985508808241, + "learning_rate": 3.0348311333668503e-05, + "loss": 0.1774, + "step": 1056 + }, + { + "epoch": 1.6804451510333864, + "grad_norm": 1.719289282047525, + "learning_rate": 3.0349835055276883e-05, + "loss": 0.2125, + "step": 1057 + }, + { + "epoch": 1.6820349761526232, + "grad_norm": 1.3724527186516684, + "learning_rate": 3.035136204311667e-05, + "loss": 0.2118, + "step": 1058 + }, + { + "epoch": 1.68362480127186, + "grad_norm": 1.199393190286099, + "learning_rate": 3.035289229667102e-05, + "loss": 0.2746, + "step": 1059 + }, + { + "epoch": 1.685214626391097, + "grad_norm": 1.4838050003907521, + "learning_rate": 3.0354425815422017e-05, + "loss": 0.2221, + "step": 1060 + }, + { + "epoch": 1.686804451510334, + "grad_norm": 0.9449772673819271, + "learning_rate": 3.035596259885061e-05, + "loss": 0.2046, + "step": 1061 + }, + { + "epoch": 1.6883942766295708, + "grad_norm": 0.7772298693527745, + "learning_rate": 3.0357502646436654e-05, + "loss": 0.1979, + "step": 1062 + }, + { + "epoch": 1.6899841017488075, + "grad_norm": 1.7922532106276083, + "learning_rate": 3.03590459576589e-05, + "loss": 0.1719, + "step": 1063 + }, + { + "epoch": 1.6915739268680445, + "grad_norm": 1.4589296485518637, + "learning_rate": 3.036059253199499e-05, + "loss": 0.1684, + "step": 1064 + }, + { + "epoch": 1.6931637519872815, + "grad_norm": 1.0788749078493898, + "learning_rate": 3.0362142368921467e-05, + "loss": 0.1816, + "step": 1065 + }, + { + "epoch": 1.6947535771065183, + "grad_norm": 1.7689777578839445, + "learning_rate": 3.036369546791377e-05, + "loss": 0.2081, + "step": 1066 + }, + { + "epoch": 1.696343402225755, + "grad_norm": 1.0230698090068941, + "learning_rate": 3.0365251828446224e-05, + "loss": 0.1671, + "step": 1067 + }, + { + "epoch": 1.697933227344992, + "grad_norm": 0.7742377541212698, + "learning_rate": 3.0366811449992066e-05, + "loss": 0.2232, + "step": 1068 + }, + { + "epoch": 1.699523052464229, + "grad_norm": 1.0728562233160406, + "learning_rate": 3.0368374332023418e-05, + "loss": 0.2004, + "step": 1069 + }, + { + "epoch": 1.7011128775834659, + "grad_norm": 1.6340510251172695, + "learning_rate": 3.03699404740113e-05, + "loss": 0.1809, + "step": 1070 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 2.5581660674212094, + "learning_rate": 3.037150987542562e-05, + "loss": 0.2397, + "step": 1071 + }, + { + "epoch": 1.7042925278219396, + "grad_norm": 0.8876447250480038, + "learning_rate": 3.0373082535735213e-05, + "loss": 0.1879, + "step": 1072 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 2.2660333787437197, + "learning_rate": 3.037465845440777e-05, + "loss": 0.3689, + "step": 1073 + }, + { + "epoch": 1.7074721780604134, + "grad_norm": 1.081687108896482, + "learning_rate": 3.0376237630909923e-05, + "loss": 0.1641, + "step": 1074 + }, + { + "epoch": 1.7090620031796502, + "grad_norm": 1.1329291134230637, + "learning_rate": 3.0377820064707148e-05, + "loss": 0.2323, + "step": 1075 + }, + { + "epoch": 1.7106518282988872, + "grad_norm": 1.3131083197263513, + "learning_rate": 3.0379405755263873e-05, + "loss": 0.1821, + "step": 1076 + }, + { + "epoch": 1.712241653418124, + "grad_norm": 1.263863001755885, + "learning_rate": 3.0380994702043387e-05, + "loss": 0.2343, + "step": 1077 + }, + { + "epoch": 1.7138314785373607, + "grad_norm": 0.9150777263038812, + "learning_rate": 3.0382586904507885e-05, + "loss": 0.1948, + "step": 1078 + }, + { + "epoch": 1.7154213036565977, + "grad_norm": 1.015222000314037, + "learning_rate": 3.0384182362118484e-05, + "loss": 0.2387, + "step": 1079 + }, + { + "epoch": 1.7170111287758347, + "grad_norm": 2.411797433549411, + "learning_rate": 3.0385781074335162e-05, + "loss": 0.3066, + "step": 1080 + }, + { + "epoch": 1.7186009538950715, + "grad_norm": 0.9377602754823323, + "learning_rate": 3.0387383040616815e-05, + "loss": 0.2277, + "step": 1081 + }, + { + "epoch": 1.7201907790143083, + "grad_norm": 0.8059865844581234, + "learning_rate": 3.0388988260421242e-05, + "loss": 0.1716, + "step": 1082 + }, + { + "epoch": 1.7217806041335453, + "grad_norm": 1.6384585871448132, + "learning_rate": 3.039059673320513e-05, + "loss": 0.2632, + "step": 1083 + }, + { + "epoch": 1.7233704292527823, + "grad_norm": 0.7980509939317432, + "learning_rate": 3.0392208458424052e-05, + "loss": 0.2545, + "step": 1084 + }, + { + "epoch": 1.724960254372019, + "grad_norm": 1.104975954260964, + "learning_rate": 3.0393823435532537e-05, + "loss": 0.1616, + "step": 1085 + }, + { + "epoch": 1.7265500794912558, + "grad_norm": 0.8893150566169342, + "learning_rate": 3.039544166398395e-05, + "loss": 0.2233, + "step": 1086 + }, + { + "epoch": 1.7281399046104928, + "grad_norm": 1.28439427250214, + "learning_rate": 3.0397063143230567e-05, + "loss": 0.259, + "step": 1087 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.9780766769616641, + "learning_rate": 3.0398687872723604e-05, + "loss": 0.139, + "step": 1088 + }, + { + "epoch": 1.7313195548489666, + "grad_norm": 0.7245102415263498, + "learning_rate": 3.0400315851913126e-05, + "loss": 0.1496, + "step": 1089 + }, + { + "epoch": 1.7329093799682034, + "grad_norm": 0.8705576855126401, + "learning_rate": 3.0401947080248135e-05, + "loss": 0.2256, + "step": 1090 + }, + { + "epoch": 1.7344992050874404, + "grad_norm": 0.7183604194049938, + "learning_rate": 3.040358155717651e-05, + "loss": 0.1597, + "step": 1091 + }, + { + "epoch": 1.7360890302066774, + "grad_norm": 0.709877694558199, + "learning_rate": 3.0405219282145045e-05, + "loss": 0.1465, + "step": 1092 + }, + { + "epoch": 1.7376788553259142, + "grad_norm": 1.8034292202089164, + "learning_rate": 3.040686025459943e-05, + "loss": 0.2167, + "step": 1093 + }, + { + "epoch": 1.739268680445151, + "grad_norm": 1.0610575768570916, + "learning_rate": 3.0408504473984248e-05, + "loss": 0.2708, + "step": 1094 + }, + { + "epoch": 1.740858505564388, + "grad_norm": 0.7450606370961692, + "learning_rate": 3.0410151939742995e-05, + "loss": 0.19, + "step": 1095 + }, + { + "epoch": 1.742448330683625, + "grad_norm": 0.6325677092298524, + "learning_rate": 3.0411802651318065e-05, + "loss": 0.1757, + "step": 1096 + }, + { + "epoch": 1.7440381558028617, + "grad_norm": 0.82653665205964, + "learning_rate": 3.041345660815076e-05, + "loss": 0.1904, + "step": 1097 + }, + { + "epoch": 1.7456279809220985, + "grad_norm": 1.0873877231830769, + "learning_rate": 3.0415113809681256e-05, + "loss": 0.1822, + "step": 1098 + }, + { + "epoch": 1.7472178060413355, + "grad_norm": 0.9919977370324129, + "learning_rate": 3.041677425534867e-05, + "loss": 0.1911, + "step": 1099 + }, + { + "epoch": 1.7488076311605725, + "grad_norm": 0.589273071212331, + "learning_rate": 3.0418437944590988e-05, + "loss": 0.1832, + "step": 1100 + }, + { + "epoch": 1.7503974562798092, + "grad_norm": 1.0208086383484447, + "learning_rate": 3.042010487684511e-05, + "loss": 0.2155, + "step": 1101 + }, + { + "epoch": 1.751987281399046, + "grad_norm": 1.0176715594852468, + "learning_rate": 3.042177505154685e-05, + "loss": 0.1866, + "step": 1102 + }, + { + "epoch": 1.753577106518283, + "grad_norm": 1.0756777747046793, + "learning_rate": 3.042344846813091e-05, + "loss": 0.2109, + "step": 1103 + }, + { + "epoch": 1.75516693163752, + "grad_norm": 0.8227900305398175, + "learning_rate": 3.0425125126030896e-05, + "loss": 0.1818, + "step": 1104 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.9301873768339785, + "learning_rate": 3.0426805024679327e-05, + "loss": 0.2194, + "step": 1105 + }, + { + "epoch": 1.7583465818759936, + "grad_norm": 1.5434973299121142, + "learning_rate": 3.042848816350761e-05, + "loss": 0.4487, + "step": 1106 + }, + { + "epoch": 1.7599364069952306, + "grad_norm": 1.1556004683668828, + "learning_rate": 3.0430174541946077e-05, + "loss": 0.2309, + "step": 1107 + }, + { + "epoch": 1.7615262321144676, + "grad_norm": 0.870734955334347, + "learning_rate": 3.0431864159423924e-05, + "loss": 0.2494, + "step": 1108 + }, + { + "epoch": 1.7631160572337043, + "grad_norm": 0.8468628260534098, + "learning_rate": 3.043355701536931e-05, + "loss": 0.2457, + "step": 1109 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 1.1019902423387296, + "learning_rate": 3.043525310920923e-05, + "loss": 0.176, + "step": 1110 + }, + { + "epoch": 1.7662957074721781, + "grad_norm": 1.115427506141178, + "learning_rate": 3.0436952440369646e-05, + "loss": 0.2072, + "step": 1111 + }, + { + "epoch": 1.767885532591415, + "grad_norm": 0.6635894962732646, + "learning_rate": 3.0438655008275384e-05, + "loss": 0.2005, + "step": 1112 + }, + { + "epoch": 1.7694753577106517, + "grad_norm": 0.9838085526265301, + "learning_rate": 3.044036081235019e-05, + "loss": 0.2019, + "step": 1113 + }, + { + "epoch": 1.7710651828298887, + "grad_norm": 1.0367992713995047, + "learning_rate": 3.0442069852016696e-05, + "loss": 0.2042, + "step": 1114 + }, + { + "epoch": 1.7726550079491257, + "grad_norm": 0.9270555243151636, + "learning_rate": 3.0443782126696473e-05, + "loss": 0.1869, + "step": 1115 + }, + { + "epoch": 1.7742448330683624, + "grad_norm": 0.7373418222253314, + "learning_rate": 3.0445497635809985e-05, + "loss": 0.1739, + "step": 1116 + }, + { + "epoch": 1.7758346581875992, + "grad_norm": 0.9030385129023926, + "learning_rate": 3.0447216378776562e-05, + "loss": 0.2012, + "step": 1117 + }, + { + "epoch": 1.7774244833068362, + "grad_norm": 0.9298645783725915, + "learning_rate": 3.0448938355014496e-05, + "loss": 0.1638, + "step": 1118 + }, + { + "epoch": 1.7790143084260732, + "grad_norm": 1.0646618244471782, + "learning_rate": 3.045066356394096e-05, + "loss": 0.1644, + "step": 1119 + }, + { + "epoch": 1.78060413354531, + "grad_norm": 0.6626942473834987, + "learning_rate": 3.045239200497202e-05, + "loss": 0.1658, + "step": 1120 + }, + { + "epoch": 1.7821939586645468, + "grad_norm": 0.5893559076292791, + "learning_rate": 3.045412367752268e-05, + "loss": 0.1768, + "step": 1121 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 0.7776302265151359, + "learning_rate": 3.045585858100682e-05, + "loss": 0.1573, + "step": 1122 + }, + { + "epoch": 1.7853736089030208, + "grad_norm": 1.0535407381034128, + "learning_rate": 3.0457596714837234e-05, + "loss": 0.2013, + "step": 1123 + }, + { + "epoch": 1.7869634340222575, + "grad_norm": 1.4270550339893346, + "learning_rate": 3.0459338078425624e-05, + "loss": 0.287, + "step": 1124 + }, + { + "epoch": 1.7885532591414943, + "grad_norm": 0.9610717184811765, + "learning_rate": 3.046108267118263e-05, + "loss": 0.2835, + "step": 1125 + }, + { + "epoch": 1.7901430842607313, + "grad_norm": 0.6208644091114227, + "learning_rate": 3.0462830492517734e-05, + "loss": 0.152, + "step": 1126 + }, + { + "epoch": 1.7917329093799683, + "grad_norm": 1.747713820343999, + "learning_rate": 3.046458154183938e-05, + "loss": 0.4747, + "step": 1127 + }, + { + "epoch": 1.793322734499205, + "grad_norm": 1.1523204040244563, + "learning_rate": 3.04663358185549e-05, + "loss": 0.2613, + "step": 1128 + }, + { + "epoch": 1.7949125596184419, + "grad_norm": 1.5645458998789783, + "learning_rate": 3.0468093322070527e-05, + "loss": 0.218, + "step": 1129 + }, + { + "epoch": 1.7965023847376789, + "grad_norm": 1.0264419892393362, + "learning_rate": 3.0469854051791432e-05, + "loss": 0.1817, + "step": 1130 + }, + { + "epoch": 1.7980922098569159, + "grad_norm": 1.2162302125395081, + "learning_rate": 3.047161800712164e-05, + "loss": 1.0549, + "step": 1131 + }, + { + "epoch": 1.7996820349761526, + "grad_norm": 1.0745801815243352, + "learning_rate": 3.0473385187464133e-05, + "loss": 0.2527, + "step": 1132 + }, + { + "epoch": 1.8012718600953894, + "grad_norm": 0.728424129168066, + "learning_rate": 3.0475155592220794e-05, + "loss": 0.1685, + "step": 1133 + }, + { + "epoch": 1.8028616852146264, + "grad_norm": 14.728235801658727, + "learning_rate": 3.0476929220792394e-05, + "loss": 20.7016, + "step": 1134 + }, + { + "epoch": 1.8044515103338634, + "grad_norm": 2.1920150345067864, + "learning_rate": 3.0478706072578618e-05, + "loss": 0.186, + "step": 1135 + }, + { + "epoch": 1.8060413354531002, + "grad_norm": 1.0751156691894526, + "learning_rate": 3.0480486146978074e-05, + "loss": 0.1747, + "step": 1136 + }, + { + "epoch": 1.807631160572337, + "grad_norm": 0.8908957051791061, + "learning_rate": 3.048226944338827e-05, + "loss": 0.1716, + "step": 1137 + }, + { + "epoch": 1.809220985691574, + "grad_norm": 1.8754035281827828, + "learning_rate": 3.0484055961205618e-05, + "loss": 0.2283, + "step": 1138 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.9899432694217898, + "learning_rate": 3.0485845699825457e-05, + "loss": 0.1687, + "step": 1139 + }, + { + "epoch": 1.8124006359300477, + "grad_norm": 4.106341204357688, + "learning_rate": 3.0487638658642025e-05, + "loss": 0.3294, + "step": 1140 + }, + { + "epoch": 1.8139904610492845, + "grad_norm": 0.6275997073949349, + "learning_rate": 3.0489434837048468e-05, + "loss": 0.18, + "step": 1141 + }, + { + "epoch": 1.8155802861685215, + "grad_norm": 0.8831001047730209, + "learning_rate": 3.049123423443684e-05, + "loss": 0.215, + "step": 1142 + }, + { + "epoch": 1.8171701112877583, + "grad_norm": 0.7183832030294394, + "learning_rate": 3.0493036850198112e-05, + "loss": 0.1682, + "step": 1143 + }, + { + "epoch": 1.818759936406995, + "grad_norm": 1.9953112788424914, + "learning_rate": 3.0494842683722162e-05, + "loss": 0.2591, + "step": 1144 + }, + { + "epoch": 1.820349761526232, + "grad_norm": 1.3171750197182506, + "learning_rate": 3.049665173439779e-05, + "loss": 0.1961, + "step": 1145 + }, + { + "epoch": 1.821939586645469, + "grad_norm": 0.8114874081953771, + "learning_rate": 3.049846400161269e-05, + "loss": 0.1802, + "step": 1146 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 3.2290139570364276, + "learning_rate": 3.0500279484753472e-05, + "loss": 0.2699, + "step": 1147 + }, + { + "epoch": 1.8251192368839426, + "grad_norm": 1.3570592685888634, + "learning_rate": 3.0502098183205673e-05, + "loss": 0.2247, + "step": 1148 + }, + { + "epoch": 1.8267090620031796, + "grad_norm": 0.8368270553854885, + "learning_rate": 3.0503920096353727e-05, + "loss": 0.157, + "step": 1149 + }, + { + "epoch": 1.8282988871224166, + "grad_norm": 0.9969744333628553, + "learning_rate": 3.0505745223580955e-05, + "loss": 0.1836, + "step": 1150 + }, + { + "epoch": 1.8298887122416534, + "grad_norm": 1.234835673420031, + "learning_rate": 3.0507573564269658e-05, + "loss": 0.1584, + "step": 1151 + }, + { + "epoch": 1.8314785373608902, + "grad_norm": 27.351762679014104, + "learning_rate": 3.0509405117800992e-05, + "loss": 28.4039, + "step": 1152 + }, + { + "epoch": 1.8330683624801272, + "grad_norm": 1.000350779444888, + "learning_rate": 3.0511239883555036e-05, + "loss": 0.1827, + "step": 1153 + }, + { + "epoch": 1.8346581875993642, + "grad_norm": 0.8498434814140499, + "learning_rate": 3.051307786091079e-05, + "loss": 0.1638, + "step": 1154 + }, + { + "epoch": 1.836248012718601, + "grad_norm": 0.8981793988731618, + "learning_rate": 3.051491904924617e-05, + "loss": 0.2264, + "step": 1155 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 0.8397168290103142, + "learning_rate": 3.0516763447938013e-05, + "loss": 0.2034, + "step": 1156 + }, + { + "epoch": 1.8394276629570747, + "grad_norm": 1.1000143944864458, + "learning_rate": 3.0518611056362026e-05, + "loss": 0.2631, + "step": 1157 + }, + { + "epoch": 1.8410174880763117, + "grad_norm": 0.7999298844899428, + "learning_rate": 3.052046187389289e-05, + "loss": 0.2086, + "step": 1158 + }, + { + "epoch": 1.8426073131955485, + "grad_norm": 1.3985210103400227, + "learning_rate": 3.052231589990414e-05, + "loss": 0.2358, + "step": 1159 + }, + { + "epoch": 1.8441971383147853, + "grad_norm": 0.7009887118889492, + "learning_rate": 3.052417313376829e-05, + "loss": 0.1429, + "step": 1160 + }, + { + "epoch": 1.8457869634340223, + "grad_norm": 0.8599550223032569, + "learning_rate": 3.0526033574856707e-05, + "loss": 0.1608, + "step": 1161 + }, + { + "epoch": 1.8473767885532593, + "grad_norm": 0.8294913983800233, + "learning_rate": 3.052789722253971e-05, + "loss": 0.1515, + "step": 1162 + }, + { + "epoch": 1.848966613672496, + "grad_norm": 1.0388904243923291, + "learning_rate": 3.052976407618652e-05, + "loss": 0.3636, + "step": 1163 + }, + { + "epoch": 1.8505564387917328, + "grad_norm": 6.67236447135243, + "learning_rate": 3.0531634135165287e-05, + "loss": 39.2069, + "step": 1164 + }, + { + "epoch": 1.8521462639109698, + "grad_norm": 1.1797548102934703, + "learning_rate": 3.0533507398843035e-05, + "loss": 0.3433, + "step": 1165 + }, + { + "epoch": 1.8537360890302068, + "grad_norm": 0.9093756763452346, + "learning_rate": 3.053538386658576e-05, + "loss": 0.1486, + "step": 1166 + }, + { + "epoch": 1.8553259141494436, + "grad_norm": 2.1277926822781215, + "learning_rate": 3.053726353775832e-05, + "loss": 0.2027, + "step": 1167 + }, + { + "epoch": 1.8569157392686804, + "grad_norm": 1.3430101632288174, + "learning_rate": 3.053914641172455e-05, + "loss": 0.2591, + "step": 1168 + }, + { + "epoch": 1.8585055643879174, + "grad_norm": 1.4628339002093025, + "learning_rate": 3.0541032487847134e-05, + "loss": 0.1841, + "step": 1169 + }, + { + "epoch": 1.8600953895071544, + "grad_norm": 1.2276040061269424, + "learning_rate": 3.05429217654877e-05, + "loss": 0.2073, + "step": 1170 + }, + { + "epoch": 1.8616852146263911, + "grad_norm": 1.3848795173779, + "learning_rate": 3.0544814244006825e-05, + "loss": 0.2098, + "step": 1171 + }, + { + "epoch": 1.863275039745628, + "grad_norm": 1.3364132832604656, + "learning_rate": 3.054670992276397e-05, + "loss": 0.2028, + "step": 1172 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 0.68593815657712, + "learning_rate": 3.054860880111748e-05, + "loss": 0.166, + "step": 1173 + }, + { + "epoch": 1.866454689984102, + "grad_norm": 1.0344260040302988, + "learning_rate": 3.055051087842469e-05, + "loss": 0.2093, + "step": 1174 + }, + { + "epoch": 1.8680445151033387, + "grad_norm": 1.9159640101298376, + "learning_rate": 3.0552416154041804e-05, + "loss": 0.2449, + "step": 1175 + }, + { + "epoch": 1.8696343402225755, + "grad_norm": 1.318210499922234, + "learning_rate": 3.055432462732395e-05, + "loss": 0.1904, + "step": 1176 + }, + { + "epoch": 1.8712241653418125, + "grad_norm": 2.0388627574881166, + "learning_rate": 3.0556236297625195e-05, + "loss": 0.2366, + "step": 1177 + }, + { + "epoch": 1.8728139904610492, + "grad_norm": 0.9143372839341986, + "learning_rate": 3.055815116429849e-05, + "loss": 0.1673, + "step": 1178 + }, + { + "epoch": 1.874403815580286, + "grad_norm": 0.8141043688671673, + "learning_rate": 3.056006922669572e-05, + "loss": 0.1781, + "step": 1179 + }, + { + "epoch": 1.875993640699523, + "grad_norm": 1.0319270471681976, + "learning_rate": 3.056199048416771e-05, + "loss": 0.1863, + "step": 1180 + }, + { + "epoch": 1.87758346581876, + "grad_norm": 1.2351467416731574, + "learning_rate": 3.0563914936064166e-05, + "loss": 0.2838, + "step": 1181 + }, + { + "epoch": 1.8791732909379968, + "grad_norm": 1.5447496122492863, + "learning_rate": 3.0565842581733744e-05, + "loss": 0.2709, + "step": 1182 + }, + { + "epoch": 1.8807631160572336, + "grad_norm": 0.9662387779468968, + "learning_rate": 3.0567773420523996e-05, + "loss": 0.2416, + "step": 1183 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 1.1894142022997076, + "learning_rate": 3.05697074517814e-05, + "loss": 0.2758, + "step": 1184 + }, + { + "epoch": 1.8839427662957076, + "grad_norm": 1.5308181855530882, + "learning_rate": 3.057164467485137e-05, + "loss": 0.2031, + "step": 1185 + }, + { + "epoch": 1.8855325914149443, + "grad_norm": 1.2721557156335144, + "learning_rate": 3.0573585089078214e-05, + "loss": 0.1981, + "step": 1186 + }, + { + "epoch": 1.8871224165341811, + "grad_norm": 1.0550759938044105, + "learning_rate": 3.0575528693805184e-05, + "loss": 0.1587, + "step": 1187 + }, + { + "epoch": 1.8887122416534181, + "grad_norm": 1.2722089586666157, + "learning_rate": 3.057747548837443e-05, + "loss": 0.1592, + "step": 1188 + }, + { + "epoch": 1.890302066772655, + "grad_norm": 1.0533233246999543, + "learning_rate": 3.057942547212703e-05, + "loss": 0.2189, + "step": 1189 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 1.052896557333416, + "learning_rate": 3.0581378644403e-05, + "loss": 0.1603, + "step": 1190 + }, + { + "epoch": 1.8934817170111287, + "grad_norm": 0.8152404984290667, + "learning_rate": 3.0583335004541253e-05, + "loss": 0.1689, + "step": 1191 + }, + { + "epoch": 1.8950715421303657, + "grad_norm": 0.8616097768579959, + "learning_rate": 3.058529455187962e-05, + "loss": 0.1801, + "step": 1192 + }, + { + "epoch": 1.8966613672496027, + "grad_norm": 0.8114540104972947, + "learning_rate": 3.0587257285754886e-05, + "loss": 0.1542, + "step": 1193 + }, + { + "epoch": 1.8982511923688394, + "grad_norm": 0.8255352330421866, + "learning_rate": 3.058922320550273e-05, + "loss": 0.175, + "step": 1194 + }, + { + "epoch": 1.8998410174880762, + "grad_norm": 1.255939338168114, + "learning_rate": 3.059119231045774e-05, + "loss": 0.2363, + "step": 1195 + }, + { + "epoch": 1.9014308426073132, + "grad_norm": 1.1534406008715932, + "learning_rate": 3.0593164599953476e-05, + "loss": 0.2003, + "step": 1196 + }, + { + "epoch": 1.9030206677265502, + "grad_norm": 0.841267698651767, + "learning_rate": 3.0595140073322374e-05, + "loss": 0.1791, + "step": 1197 + }, + { + "epoch": 1.904610492845787, + "grad_norm": 23.111491486757316, + "learning_rate": 3.0597118729895814e-05, + "loss": 28.0035, + "step": 1198 + }, + { + "epoch": 1.9062003179650238, + "grad_norm": 22.984836443191032, + "learning_rate": 3.059910056900408e-05, + "loss": 27.4725, + "step": 1199 + }, + { + "epoch": 1.9077901430842608, + "grad_norm": 2.383496193752526, + "learning_rate": 3.06010855899764e-05, + "loss": 0.2763, + "step": 1200 + }, + { + "epoch": 1.9093799682034978, + "grad_norm": 1.0386300945084397, + "learning_rate": 3.0603073792140914e-05, + "loss": 0.2217, + "step": 1201 + }, + { + "epoch": 1.9109697933227345, + "grad_norm": 2.1819630915536083, + "learning_rate": 3.0605065174824694e-05, + "loss": 0.2774, + "step": 1202 + }, + { + "epoch": 1.9125596184419713, + "grad_norm": 1.116663408805725, + "learning_rate": 3.060705973735372e-05, + "loss": 0.1846, + "step": 1203 + }, + { + "epoch": 1.9141494435612083, + "grad_norm": 1.8400553429080386, + "learning_rate": 3.0609057479052914e-05, + "loss": 0.2305, + "step": 1204 + }, + { + "epoch": 1.9157392686804453, + "grad_norm": 1.3384370966855554, + "learning_rate": 3.06110583992461e-05, + "loss": 0.2272, + "step": 1205 + }, + { + "epoch": 1.917329093799682, + "grad_norm": 0.9916247321354082, + "learning_rate": 3.061306249725604e-05, + "loss": 0.1669, + "step": 1206 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 2.1532884746270304, + "learning_rate": 3.0615069772404445e-05, + "loss": 0.2051, + "step": 1207 + }, + { + "epoch": 1.9205087440381559, + "grad_norm": 1.1550652922094646, + "learning_rate": 3.061708022401189e-05, + "loss": 0.1715, + "step": 1208 + }, + { + "epoch": 1.9220985691573926, + "grad_norm": 1.7970752445970744, + "learning_rate": 3.061909385139793e-05, + "loss": 0.2117, + "step": 1209 + }, + { + "epoch": 1.9236883942766294, + "grad_norm": 2.7436833634249336, + "learning_rate": 3.062111065388102e-05, + "loss": 0.2071, + "step": 1210 + }, + { + "epoch": 1.9252782193958664, + "grad_norm": 0.9887203777722529, + "learning_rate": 3.062313063077855e-05, + "loss": 0.1485, + "step": 1211 + }, + { + "epoch": 1.9268680445151034, + "grad_norm": 0.8717724199440463, + "learning_rate": 3.0625153781406824e-05, + "loss": 0.1809, + "step": 1212 + }, + { + "epoch": 1.9284578696343402, + "grad_norm": 1.5202963704600487, + "learning_rate": 3.062718010508108e-05, + "loss": 0.2287, + "step": 1213 + }, + { + "epoch": 1.930047694753577, + "grad_norm": 1.0846154052420984, + "learning_rate": 3.06292096011155e-05, + "loss": 0.167, + "step": 1214 + }, + { + "epoch": 1.931637519872814, + "grad_norm": 1.3130975733395767, + "learning_rate": 3.0631242268823125e-05, + "loss": 0.2015, + "step": 1215 + }, + { + "epoch": 1.933227344992051, + "grad_norm": 1.422738872683954, + "learning_rate": 3.063327810751602e-05, + "loss": 0.2304, + "step": 1216 + }, + { + "epoch": 1.9348171701112877, + "grad_norm": 0.7513321902431526, + "learning_rate": 3.0635317116505114e-05, + "loss": 0.1633, + "step": 1217 + }, + { + "epoch": 1.9364069952305245, + "grad_norm": 1.0408888186528638, + "learning_rate": 3.063735929510026e-05, + "loss": 0.1904, + "step": 1218 + }, + { + "epoch": 1.9379968203497615, + "grad_norm": 1.2013757121906299, + "learning_rate": 3.063940464261026e-05, + "loss": 0.2122, + "step": 1219 + }, + { + "epoch": 1.9395866454689985, + "grad_norm": 2.0673398921607804, + "learning_rate": 3.0641453158342855e-05, + "loss": 0.1858, + "step": 1220 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 1.2237917388343624, + "learning_rate": 3.064350484160468e-05, + "loss": 0.2016, + "step": 1221 + }, + { + "epoch": 1.942766295707472, + "grad_norm": 1.3159782878890969, + "learning_rate": 3.064555969170132e-05, + "loss": 0.1893, + "step": 1222 + }, + { + "epoch": 1.944356120826709, + "grad_norm": 0.8139917527390591, + "learning_rate": 3.06476177079373e-05, + "loss": 0.1429, + "step": 1223 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 1.3520108765734955, + "learning_rate": 3.064967888961601e-05, + "loss": 0.1843, + "step": 1224 + }, + { + "epoch": 1.9475357710651828, + "grad_norm": 1.0314267662277503, + "learning_rate": 3.065174323603986e-05, + "loss": 0.1893, + "step": 1225 + }, + { + "epoch": 1.9491255961844196, + "grad_norm": 0.9901778127937841, + "learning_rate": 3.0653810746510115e-05, + "loss": 0.1796, + "step": 1226 + }, + { + "epoch": 1.9507154213036566, + "grad_norm": 1.2652652435030727, + "learning_rate": 3.065588142032702e-05, + "loss": 0.1685, + "step": 1227 + }, + { + "epoch": 1.9523052464228936, + "grad_norm": 1.230162636423344, + "learning_rate": 3.0657955256789714e-05, + "loss": 0.2624, + "step": 1228 + }, + { + "epoch": 1.9538950715421304, + "grad_norm": 25.80489300702429, + "learning_rate": 3.066003225519627e-05, + "loss": 29.0589, + "step": 1229 + }, + { + "epoch": 1.9554848966613672, + "grad_norm": 0.861437387897021, + "learning_rate": 3.066211241484371e-05, + "loss": 0.2161, + "step": 1230 + }, + { + "epoch": 1.9570747217806042, + "grad_norm": 0.9196769074957234, + "learning_rate": 3.066419573502798e-05, + "loss": 0.1982, + "step": 1231 + }, + { + "epoch": 1.9586645468998412, + "grad_norm": 27.02897188598479, + "learning_rate": 3.066628221504396e-05, + "loss": 28.2177, + "step": 1232 + }, + { + "epoch": 1.960254372019078, + "grad_norm": 1.0694183116944747, + "learning_rate": 3.066837185418541e-05, + "loss": 0.1928, + "step": 1233 + }, + { + "epoch": 1.9618441971383147, + "grad_norm": 25.387448672311354, + "learning_rate": 3.0670464651745116e-05, + "loss": 29.1451, + "step": 1234 + }, + { + "epoch": 1.9634340222575517, + "grad_norm": 2.7887665419062166, + "learning_rate": 3.0672560607014695e-05, + "loss": 0.2766, + "step": 1235 + }, + { + "epoch": 1.9650238473767887, + "grad_norm": 1.2931539957994642, + "learning_rate": 3.067465971928478e-05, + "loss": 0.1806, + "step": 1236 + }, + { + "epoch": 1.9666136724960255, + "grad_norm": 1.661704258988567, + "learning_rate": 3.067676198784488e-05, + "loss": 0.251, + "step": 1237 + }, + { + "epoch": 1.9682034976152623, + "grad_norm": 1.4693391465834271, + "learning_rate": 3.067886741198345e-05, + "loss": 0.1431, + "step": 1238 + }, + { + "epoch": 1.9697933227344993, + "grad_norm": 1.7196498024808196, + "learning_rate": 3.068097599098789e-05, + "loss": 0.2262, + "step": 1239 + }, + { + "epoch": 1.9713831478537363, + "grad_norm": 0.9381067480056908, + "learning_rate": 3.068308772414451e-05, + "loss": 0.1605, + "step": 1240 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 1.0975186508458736, + "learning_rate": 3.068520261073857e-05, + "loss": 0.204, + "step": 1241 + }, + { + "epoch": 1.9745627980922098, + "grad_norm": 1.242065103804873, + "learning_rate": 3.0687320650054265e-05, + "loss": 0.2281, + "step": 1242 + }, + { + "epoch": 1.9761526232114468, + "grad_norm": 1.2945520946449744, + "learning_rate": 3.068944184137471e-05, + "loss": 0.154, + "step": 1243 + }, + { + "epoch": 1.9777424483306836, + "grad_norm": 1.1931142929894483, + "learning_rate": 3.069156618398196e-05, + "loss": 0.1531, + "step": 1244 + }, + { + "epoch": 1.9793322734499204, + "grad_norm": 1.2589152711075082, + "learning_rate": 3.069369367715701e-05, + "loss": 0.2512, + "step": 1245 + }, + { + "epoch": 1.9809220985691574, + "grad_norm": 1.2612485440947874, + "learning_rate": 3.069582432017975e-05, + "loss": 0.1843, + "step": 1246 + }, + { + "epoch": 1.9825119236883944, + "grad_norm": 1.2883629283670672, + "learning_rate": 3.069795811232907e-05, + "loss": 0.1846, + "step": 1247 + }, + { + "epoch": 1.9841017488076311, + "grad_norm": 1.347256843993211, + "learning_rate": 3.070009505288274e-05, + "loss": 0.2223, + "step": 1248 + }, + { + "epoch": 1.985691573926868, + "grad_norm": 1.7677287754457878, + "learning_rate": 3.0702235141117486e-05, + "loss": 0.2138, + "step": 1249 + }, + { + "epoch": 1.987281399046105, + "grad_norm": 1.6566777398256423, + "learning_rate": 3.0704378376308966e-05, + "loss": 0.2761, + "step": 1250 + }, + { + "epoch": 1.988871224165342, + "grad_norm": 1.3827746323573025, + "learning_rate": 3.0706524757731775e-05, + "loss": 0.1946, + "step": 1251 + }, + { + "epoch": 1.9904610492845787, + "grad_norm": 1.5813255890648994, + "learning_rate": 3.0708674284659444e-05, + "loss": 0.1801, + "step": 1252 + }, + { + "epoch": 1.9920508744038155, + "grad_norm": 1.6070535806157233, + "learning_rate": 3.071082695636442e-05, + "loss": 0.2014, + "step": 1253 + }, + { + "epoch": 1.9936406995230525, + "grad_norm": 2.02366144478435, + "learning_rate": 3.0712982772118114e-05, + "loss": 0.4164, + "step": 1254 + }, + { + "epoch": 1.9952305246422894, + "grad_norm": 1.5040536599606593, + "learning_rate": 3.0715141731190864e-05, + "loss": 0.189, + "step": 1255 + }, + { + "epoch": 1.9968203497615262, + "grad_norm": 1.1947304883545589, + "learning_rate": 3.071730383285194e-05, + "loss": 0.2146, + "step": 1256 + }, + { + "epoch": 1.998410174880763, + "grad_norm": 1.7214270799293931, + "learning_rate": 3.0719469076369525e-05, + "loss": 0.1878, + "step": 1257 + }, + { + "epoch": 2.0, + "grad_norm": 1.0157704917836166, + "learning_rate": 3.0721637461010796e-05, + "loss": 0.187, + "step": 1258 + }, + { + "epoch": 2.001589825119237, + "grad_norm": 1.6285401869986473, + "learning_rate": 3.0723808986041815e-05, + "loss": 0.2418, + "step": 1259 + }, + { + "epoch": 2.0031796502384736, + "grad_norm": 1.4518016869277235, + "learning_rate": 3.072598365072761e-05, + "loss": 0.2176, + "step": 1260 + }, + { + "epoch": 2.0047694753577106, + "grad_norm": 2.3885563057829784, + "learning_rate": 3.072816145433213e-05, + "loss": 0.2059, + "step": 1261 + }, + { + "epoch": 2.0063593004769475, + "grad_norm": 1.0261735852628373, + "learning_rate": 3.073034239611826e-05, + "loss": 0.1838, + "step": 1262 + }, + { + "epoch": 2.0079491255961845, + "grad_norm": 1.5374133458675676, + "learning_rate": 3.073252647534784e-05, + "loss": 0.2478, + "step": 1263 + }, + { + "epoch": 2.009538950715421, + "grad_norm": 0.8128358265322938, + "learning_rate": 3.073471369128163e-05, + "loss": 0.1714, + "step": 1264 + }, + { + "epoch": 2.011128775834658, + "grad_norm": 0.8482097073946372, + "learning_rate": 3.0736904043179346e-05, + "loss": 0.1727, + "step": 1265 + }, + { + "epoch": 2.012718600953895, + "grad_norm": 0.8648768881046509, + "learning_rate": 3.0739097530299624e-05, + "loss": 0.1765, + "step": 1266 + }, + { + "epoch": 2.014308426073132, + "grad_norm": 1.107564625767209, + "learning_rate": 3.074129415190006e-05, + "loss": 0.1614, + "step": 1267 + }, + { + "epoch": 2.0158982511923687, + "grad_norm": 1.1544088229763227, + "learning_rate": 3.074349390723716e-05, + "loss": 0.1799, + "step": 1268 + }, + { + "epoch": 2.0174880763116056, + "grad_norm": 1.485674176504748, + "learning_rate": 3.07456967955664e-05, + "loss": 0.21, + "step": 1269 + }, + { + "epoch": 2.0190779014308426, + "grad_norm": 0.8111920965928121, + "learning_rate": 3.074790281614218e-05, + "loss": 0.1911, + "step": 1270 + }, + { + "epoch": 2.0206677265500796, + "grad_norm": 1.6692129560972535, + "learning_rate": 3.075011196821784e-05, + "loss": 0.2232, + "step": 1271 + }, + { + "epoch": 2.022257551669316, + "grad_norm": 0.9910243616953991, + "learning_rate": 3.0752324251045664e-05, + "loss": 0.2019, + "step": 1272 + }, + { + "epoch": 2.023847376788553, + "grad_norm": 1.851604710969435, + "learning_rate": 3.075453966387686e-05, + "loss": 0.1962, + "step": 1273 + }, + { + "epoch": 2.02543720190779, + "grad_norm": 1.6315246047077385, + "learning_rate": 3.0756758205961626e-05, + "loss": 0.2336, + "step": 1274 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 1.1332389023345852, + "learning_rate": 3.0758979876549034e-05, + "loss": 0.1975, + "step": 1275 + }, + { + "epoch": 2.0286168521462637, + "grad_norm": 1.3854639647356968, + "learning_rate": 3.076120467488714e-05, + "loss": 0.1859, + "step": 1276 + }, + { + "epoch": 2.0302066772655007, + "grad_norm": 27.627483185333432, + "learning_rate": 3.0763432600222913e-05, + "loss": 30.238, + "step": 1277 + }, + { + "epoch": 2.0317965023847377, + "grad_norm": 1.3216702131635056, + "learning_rate": 3.076566365180232e-05, + "loss": 0.2133, + "step": 1278 + }, + { + "epoch": 2.0333863275039747, + "grad_norm": 2.0037253060939353, + "learning_rate": 3.076789782887019e-05, + "loss": 0.1948, + "step": 1279 + }, + { + "epoch": 2.0349761526232113, + "grad_norm": 0.8846634052804119, + "learning_rate": 3.077013513067036e-05, + "loss": 0.2177, + "step": 1280 + }, + { + "epoch": 2.0365659777424483, + "grad_norm": 1.3785150586824486, + "learning_rate": 3.077237555644558e-05, + "loss": 0.1767, + "step": 1281 + }, + { + "epoch": 2.0381558028616853, + "grad_norm": 1.14319076750038, + "learning_rate": 3.077461910543754e-05, + "loss": 0.1793, + "step": 1282 + }, + { + "epoch": 2.0397456279809223, + "grad_norm": 1.3058599818187528, + "learning_rate": 3.077686577688689e-05, + "loss": 0.2194, + "step": 1283 + }, + { + "epoch": 2.041335453100159, + "grad_norm": 1.1608873509256605, + "learning_rate": 3.077911557003319e-05, + "loss": 0.1365, + "step": 1284 + }, + { + "epoch": 2.042925278219396, + "grad_norm": 2.2215253383767357, + "learning_rate": 3.0781368484114995e-05, + "loss": 0.318, + "step": 1285 + }, + { + "epoch": 2.044515103338633, + "grad_norm": 1.2337850885799377, + "learning_rate": 3.0783624518369764e-05, + "loss": 0.2016, + "step": 1286 + }, + { + "epoch": 2.04610492845787, + "grad_norm": 1.4386787568710655, + "learning_rate": 3.078588367203391e-05, + "loss": 0.1446, + "step": 1287 + }, + { + "epoch": 2.0476947535771064, + "grad_norm": 1.3686187612205987, + "learning_rate": 3.078814594434279e-05, + "loss": 0.2448, + "step": 1288 + }, + { + "epoch": 2.0492845786963434, + "grad_norm": 1.5259367157698687, + "learning_rate": 3.079041133453071e-05, + "loss": 0.2683, + "step": 1289 + }, + { + "epoch": 2.0508744038155804, + "grad_norm": 2.3301193196051186, + "learning_rate": 3.0792679841830915e-05, + "loss": 0.2524, + "step": 1290 + }, + { + "epoch": 2.0524642289348174, + "grad_norm": 2.2628810487868543, + "learning_rate": 3.07949514654756e-05, + "loss": 0.288, + "step": 1291 + }, + { + "epoch": 2.054054054054054, + "grad_norm": 1.3029376832118935, + "learning_rate": 3.0797226204695895e-05, + "loss": 0.1805, + "step": 1292 + }, + { + "epoch": 2.055643879173291, + "grad_norm": 0.8003632043606429, + "learning_rate": 3.0799504058721894e-05, + "loss": 0.143, + "step": 1293 + }, + { + "epoch": 2.057233704292528, + "grad_norm": 0.830571401756114, + "learning_rate": 3.080178502678262e-05, + "loss": 0.1529, + "step": 1294 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 28.12440574477103, + "learning_rate": 3.080406910810606e-05, + "loss": 26.6375, + "step": 1295 + }, + { + "epoch": 2.0604133545310015, + "grad_norm": 1.4661546883797156, + "learning_rate": 3.080635630191911e-05, + "loss": 0.2362, + "step": 1296 + }, + { + "epoch": 2.0620031796502385, + "grad_norm": 0.9270319189359216, + "learning_rate": 3.080864660744766e-05, + "loss": 0.2385, + "step": 1297 + }, + { + "epoch": 2.0635930047694755, + "grad_norm": 1.1312945465551818, + "learning_rate": 3.0810940023916513e-05, + "loss": 0.2121, + "step": 1298 + }, + { + "epoch": 2.065182829888712, + "grad_norm": 1.235096449226978, + "learning_rate": 3.0813236550549424e-05, + "loss": 0.2027, + "step": 1299 + }, + { + "epoch": 2.066772655007949, + "grad_norm": 1.2968467014482736, + "learning_rate": 3.0815536186569125e-05, + "loss": 0.2507, + "step": 1300 + }, + { + "epoch": 2.068362480127186, + "grad_norm": 0.9252848126570336, + "learning_rate": 3.081783893119726e-05, + "loss": 0.1492, + "step": 1301 + }, + { + "epoch": 2.069952305246423, + "grad_norm": 1.6222466340123076, + "learning_rate": 3.082014478365443e-05, + "loss": 0.1601, + "step": 1302 + }, + { + "epoch": 2.0715421303656596, + "grad_norm": 1.592797926521896, + "learning_rate": 3.0822453743160196e-05, + "loss": 0.2032, + "step": 1303 + }, + { + "epoch": 2.0731319554848966, + "grad_norm": 1.3739962578434508, + "learning_rate": 3.082476580893305e-05, + "loss": 0.239, + "step": 1304 + }, + { + "epoch": 2.0747217806041336, + "grad_norm": 0.7306467378447722, + "learning_rate": 3.082708098019043e-05, + "loss": 0.1721, + "step": 1305 + }, + { + "epoch": 2.0763116057233706, + "grad_norm": 1.542581159735289, + "learning_rate": 3.0829399256148764e-05, + "loss": 0.1936, + "step": 1306 + }, + { + "epoch": 2.077901430842607, + "grad_norm": 0.8831623975443234, + "learning_rate": 3.083172063602337e-05, + "loss": 0.1978, + "step": 1307 + }, + { + "epoch": 2.079491255961844, + "grad_norm": 0.9217243949168382, + "learning_rate": 3.083404511902857e-05, + "loss": 0.162, + "step": 1308 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 0.8569836982226501, + "learning_rate": 3.08363727043776e-05, + "loss": 0.1854, + "step": 1309 + }, + { + "epoch": 2.082670906200318, + "grad_norm": 1.199840413070969, + "learning_rate": 3.0838703391282664e-05, + "loss": 0.209, + "step": 1310 + }, + { + "epoch": 2.0842607313195547, + "grad_norm": 1.160933636914246, + "learning_rate": 3.0841037178954886e-05, + "loss": 0.1708, + "step": 1311 + }, + { + "epoch": 2.0858505564387917, + "grad_norm": 1.2875084460130533, + "learning_rate": 3.0843374066604395e-05, + "loss": 0.2431, + "step": 1312 + }, + { + "epoch": 2.0874403815580287, + "grad_norm": 0.9992658402973111, + "learning_rate": 3.084571405344021e-05, + "loss": 0.1902, + "step": 1313 + }, + { + "epoch": 2.0890302066772657, + "grad_norm": 1.3380287852543482, + "learning_rate": 3.084805713867034e-05, + "loss": 0.2166, + "step": 1314 + }, + { + "epoch": 2.0906200317965022, + "grad_norm": 1.0043156815831882, + "learning_rate": 3.085040332150176e-05, + "loss": 0.1766, + "step": 1315 + }, + { + "epoch": 2.0922098569157392, + "grad_norm": 1.5805326822218397, + "learning_rate": 3.0852752601140325e-05, + "loss": 0.2404, + "step": 1316 + }, + { + "epoch": 2.0937996820349762, + "grad_norm": 1.8861521541509516, + "learning_rate": 3.0855104976790934e-05, + "loss": 0.2548, + "step": 1317 + }, + { + "epoch": 2.0953895071542132, + "grad_norm": 1.1831321120577094, + "learning_rate": 3.085746044765737e-05, + "loss": 0.2035, + "step": 1318 + }, + { + "epoch": 2.09697933227345, + "grad_norm": 1.3843009415573677, + "learning_rate": 3.0859819012942376e-05, + "loss": 0.2024, + "step": 1319 + }, + { + "epoch": 2.098569157392687, + "grad_norm": 1.2148524106972587, + "learning_rate": 3.0862180671847705e-05, + "loss": 0.223, + "step": 1320 + }, + { + "epoch": 2.100158982511924, + "grad_norm": 0.9085094684700772, + "learning_rate": 3.0864545423573996e-05, + "loss": 0.1804, + "step": 1321 + }, + { + "epoch": 2.101748807631161, + "grad_norm": 0.7691273452199137, + "learning_rate": 3.086691326732086e-05, + "loss": 0.2157, + "step": 1322 + }, + { + "epoch": 2.1033386327503973, + "grad_norm": 1.3821088501911378, + "learning_rate": 3.086928420228688e-05, + "loss": 0.1955, + "step": 1323 + }, + { + "epoch": 2.1049284578696343, + "grad_norm": 3.761306881942381, + "learning_rate": 3.087165822766958e-05, + "loss": 0.2686, + "step": 1324 + }, + { + "epoch": 2.1065182829888713, + "grad_norm": 0.8839596204913662, + "learning_rate": 3.0874035342665416e-05, + "loss": 0.1476, + "step": 1325 + }, + { + "epoch": 2.108108108108108, + "grad_norm": 0.9452616510008075, + "learning_rate": 3.087641554646986e-05, + "loss": 0.1355, + "step": 1326 + }, + { + "epoch": 2.109697933227345, + "grad_norm": 0.995148220704798, + "learning_rate": 3.087879883827727e-05, + "loss": 0.1535, + "step": 1327 + }, + { + "epoch": 2.111287758346582, + "grad_norm": 1.1429686926063791, + "learning_rate": 3.0881185217281e-05, + "loss": 0.2246, + "step": 1328 + }, + { + "epoch": 2.112877583465819, + "grad_norm": 2.1319652522118706, + "learning_rate": 3.0883574682673345e-05, + "loss": 0.2647, + "step": 1329 + }, + { + "epoch": 2.1144674085850554, + "grad_norm": 1.1920604635860412, + "learning_rate": 3.088596723364555e-05, + "loss": 0.2352, + "step": 1330 + }, + { + "epoch": 2.1160572337042924, + "grad_norm": 1.1387436937305278, + "learning_rate": 3.088836286938783e-05, + "loss": 0.155, + "step": 1331 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 1.652141133560016, + "learning_rate": 3.089076158908935e-05, + "loss": 0.1905, + "step": 1332 + }, + { + "epoch": 2.1192368839427664, + "grad_norm": 1.5354167292005385, + "learning_rate": 3.0893163391938235e-05, + "loss": 0.1922, + "step": 1333 + }, + { + "epoch": 2.120826709062003, + "grad_norm": 4.1241360067695965, + "learning_rate": 3.089556827712155e-05, + "loss": 0.5256, + "step": 1334 + }, + { + "epoch": 2.12241653418124, + "grad_norm": 0.645114217644848, + "learning_rate": 3.089797624382533e-05, + "loss": 0.1425, + "step": 1335 + }, + { + "epoch": 2.124006359300477, + "grad_norm": 1.3938989618247661, + "learning_rate": 3.090038729123457e-05, + "loss": 0.2083, + "step": 1336 + }, + { + "epoch": 2.125596184419714, + "grad_norm": 1.7730708341280415, + "learning_rate": 3.090280141853322e-05, + "loss": 0.196, + "step": 1337 + }, + { + "epoch": 2.1271860095389505, + "grad_norm": 0.7394137132287154, + "learning_rate": 3.090521862490418e-05, + "loss": 0.1577, + "step": 1338 + }, + { + "epoch": 2.1287758346581875, + "grad_norm": 1.0336190678270467, + "learning_rate": 3.090763890952931e-05, + "loss": 0.1801, + "step": 1339 + }, + { + "epoch": 2.1303656597774245, + "grad_norm": 0.9576929373168033, + "learning_rate": 3.091006227158945e-05, + "loss": 0.1855, + "step": 1340 + }, + { + "epoch": 2.1319554848966615, + "grad_norm": 1.775426378239908, + "learning_rate": 3.091248871026436e-05, + "loss": 0.2412, + "step": 1341 + }, + { + "epoch": 2.133545310015898, + "grad_norm": 0.7418853133482567, + "learning_rate": 3.091491822473278e-05, + "loss": 0.1824, + "step": 1342 + }, + { + "epoch": 2.135135135135135, + "grad_norm": 1.0328649429102252, + "learning_rate": 3.091735081417242e-05, + "loss": 0.2358, + "step": 1343 + }, + { + "epoch": 2.136724960254372, + "grad_norm": 0.7623939676804489, + "learning_rate": 3.091978647775993e-05, + "loss": 0.1741, + "step": 1344 + }, + { + "epoch": 2.138314785373609, + "grad_norm": 0.8738992243465746, + "learning_rate": 3.092222521467092e-05, + "loss": 0.177, + "step": 1345 + }, + { + "epoch": 2.1399046104928456, + "grad_norm": 0.804723471310374, + "learning_rate": 3.092466702407996e-05, + "loss": 0.1416, + "step": 1346 + }, + { + "epoch": 2.1414944356120826, + "grad_norm": 1.5047521938901964, + "learning_rate": 3.092711190516062e-05, + "loss": 0.1858, + "step": 1347 + }, + { + "epoch": 2.1430842607313196, + "grad_norm": 1.2461860763926413, + "learning_rate": 3.0929559857085355e-05, + "loss": 0.1795, + "step": 1348 + }, + { + "epoch": 2.1446740858505566, + "grad_norm": 21.777714304884103, + "learning_rate": 3.093201087902565e-05, + "loss": 19.7598, + "step": 1349 + }, + { + "epoch": 2.146263910969793, + "grad_norm": 0.9756223762482187, + "learning_rate": 3.093446497015189e-05, + "loss": 0.1724, + "step": 1350 + }, + { + "epoch": 2.14785373608903, + "grad_norm": 0.8967810526427992, + "learning_rate": 3.09369221296335e-05, + "loss": 0.2258, + "step": 1351 + }, + { + "epoch": 2.149443561208267, + "grad_norm": 1.022614819271286, + "learning_rate": 3.0939382356638785e-05, + "loss": 0.2299, + "step": 1352 + }, + { + "epoch": 2.151033386327504, + "grad_norm": 1.1832029028290982, + "learning_rate": 3.094184565033508e-05, + "loss": 0.2154, + "step": 1353 + }, + { + "epoch": 2.1526232114467407, + "grad_norm": 1.0559999502153423, + "learning_rate": 3.094431200988861e-05, + "loss": 0.2091, + "step": 1354 + }, + { + "epoch": 2.1542130365659777, + "grad_norm": 1.654526643164502, + "learning_rate": 3.094678143446462e-05, + "loss": 0.2314, + "step": 1355 + }, + { + "epoch": 2.1558028616852147, + "grad_norm": 1.020730279015888, + "learning_rate": 3.09492539232273e-05, + "loss": 0.1849, + "step": 1356 + }, + { + "epoch": 2.1573926868044513, + "grad_norm": 0.8790264686352803, + "learning_rate": 3.095172947533981e-05, + "loss": 0.1258, + "step": 1357 + }, + { + "epoch": 2.1589825119236883, + "grad_norm": 0.7786196705654902, + "learning_rate": 3.095420808996425e-05, + "loss": 0.1793, + "step": 1358 + }, + { + "epoch": 2.1605723370429253, + "grad_norm": 1.4067081602417018, + "learning_rate": 3.09566897662617e-05, + "loss": 0.2755, + "step": 1359 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.9830172756733001, + "learning_rate": 3.095917450339221e-05, + "loss": 0.0915, + "step": 1360 + }, + { + "epoch": 2.1637519872813993, + "grad_norm": 1.7798638063114107, + "learning_rate": 3.09616623005148e-05, + "loss": 0.2358, + "step": 1361 + }, + { + "epoch": 2.165341812400636, + "grad_norm": 1.089155433635473, + "learning_rate": 3.0964153156787414e-05, + "loss": 0.2222, + "step": 1362 + }, + { + "epoch": 2.166931637519873, + "grad_norm": 1.0603013444145613, + "learning_rate": 3.0966647071366996e-05, + "loss": 0.1896, + "step": 1363 + }, + { + "epoch": 2.16852146263911, + "grad_norm": 1.1657683753221677, + "learning_rate": 3.0969144043409444e-05, + "loss": 0.2287, + "step": 1364 + }, + { + "epoch": 2.1701112877583464, + "grad_norm": 1.1150513983382129, + "learning_rate": 3.0971644072069636e-05, + "loss": 0.1709, + "step": 1365 + }, + { + "epoch": 2.1717011128775834, + "grad_norm": 1.413617806629585, + "learning_rate": 3.09741471565014e-05, + "loss": 0.1981, + "step": 1366 + }, + { + "epoch": 2.1732909379968204, + "grad_norm": 0.7387448412322494, + "learning_rate": 3.097665329585752e-05, + "loss": 0.1522, + "step": 1367 + }, + { + "epoch": 2.1748807631160574, + "grad_norm": 1.6831016678797022, + "learning_rate": 3.097916248928976e-05, + "loss": 0.1816, + "step": 1368 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.7478624113142754, + "learning_rate": 3.098167473594886e-05, + "loss": 0.1838, + "step": 1369 + }, + { + "epoch": 2.178060413354531, + "grad_norm": 1.5408256716519728, + "learning_rate": 3.0984190034984514e-05, + "loss": 0.1894, + "step": 1370 + }, + { + "epoch": 2.179650238473768, + "grad_norm": 0.9093900350749017, + "learning_rate": 3.0986708385545384e-05, + "loss": 0.2198, + "step": 1371 + }, + { + "epoch": 2.181240063593005, + "grad_norm": 1.018773190322291, + "learning_rate": 3.0989229786779086e-05, + "loss": 0.1618, + "step": 1372 + }, + { + "epoch": 2.1828298887122415, + "grad_norm": 1.0429560744402189, + "learning_rate": 3.099175423783223e-05, + "loss": 0.2116, + "step": 1373 + }, + { + "epoch": 2.1844197138314785, + "grad_norm": 1.6267291985483283, + "learning_rate": 3.0994281737850384e-05, + "loss": 0.2586, + "step": 1374 + }, + { + "epoch": 2.1860095389507155, + "grad_norm": 0.9472913844800965, + "learning_rate": 3.099681228597806e-05, + "loss": 0.1786, + "step": 1375 + }, + { + "epoch": 2.1875993640699525, + "grad_norm": 0.9482453520338047, + "learning_rate": 3.0999345881358784e-05, + "loss": 0.183, + "step": 1376 + }, + { + "epoch": 2.189189189189189, + "grad_norm": 1.4272694389725795, + "learning_rate": 3.100188252313501e-05, + "loss": 0.3825, + "step": 1377 + }, + { + "epoch": 2.190779014308426, + "grad_norm": 1.3120816831611022, + "learning_rate": 3.10044222104482e-05, + "loss": 0.1812, + "step": 1378 + }, + { + "epoch": 2.192368839427663, + "grad_norm": 0.8936454200936483, + "learning_rate": 3.1006964942438725e-05, + "loss": 0.1752, + "step": 1379 + }, + { + "epoch": 2.1939586645469, + "grad_norm": 1.0308275926343504, + "learning_rate": 3.100951071824599e-05, + "loss": 0.1979, + "step": 1380 + }, + { + "epoch": 2.1955484896661366, + "grad_norm": 2.121736356022081, + "learning_rate": 3.101205953700833e-05, + "loss": 0.1966, + "step": 1381 + }, + { + "epoch": 2.1971383147853736, + "grad_norm": 1.361500398221776, + "learning_rate": 3.101461139786307e-05, + "loss": 0.1865, + "step": 1382 + }, + { + "epoch": 2.1987281399046106, + "grad_norm": 1.0551844275054563, + "learning_rate": 3.101716629994648e-05, + "loss": 0.1408, + "step": 1383 + }, + { + "epoch": 2.2003179650238476, + "grad_norm": 1.286749262176366, + "learning_rate": 3.101972424239384e-05, + "loss": 0.2008, + "step": 1384 + }, + { + "epoch": 2.201907790143084, + "grad_norm": 1.1440829715368968, + "learning_rate": 3.102228522433937e-05, + "loss": 0.2574, + "step": 1385 + }, + { + "epoch": 2.203497615262321, + "grad_norm": 1.6675692498369448, + "learning_rate": 3.102484924491628e-05, + "loss": 0.1788, + "step": 1386 + }, + { + "epoch": 2.205087440381558, + "grad_norm": 0.9370870178549853, + "learning_rate": 3.102741630325672e-05, + "loss": 0.1415, + "step": 1387 + }, + { + "epoch": 2.2066772655007947, + "grad_norm": 0.9637874312804572, + "learning_rate": 3.102998639849185e-05, + "loss": 0.1722, + "step": 1388 + }, + { + "epoch": 2.2082670906200317, + "grad_norm": 1.2556838699580377, + "learning_rate": 3.103255952975178e-05, + "loss": 0.1686, + "step": 1389 + }, + { + "epoch": 2.2098569157392687, + "grad_norm": 26.087843924102206, + "learning_rate": 3.10351356961656e-05, + "loss": 24.7238, + "step": 1390 + }, + { + "epoch": 2.2114467408585057, + "grad_norm": 1.2306052553326015, + "learning_rate": 3.103771489686136e-05, + "loss": 0.2127, + "step": 1391 + }, + { + "epoch": 2.2130365659777427, + "grad_norm": 1.3925769784119306, + "learning_rate": 3.104029713096612e-05, + "loss": 0.1536, + "step": 1392 + }, + { + "epoch": 2.2146263910969792, + "grad_norm": 1.0886456722807778, + "learning_rate": 3.104288239760587e-05, + "loss": 0.1605, + "step": 1393 + }, + { + "epoch": 2.2162162162162162, + "grad_norm": 1.9733735581123586, + "learning_rate": 3.10454706959056e-05, + "loss": 0.3207, + "step": 1394 + }, + { + "epoch": 2.2178060413354532, + "grad_norm": 1.6391136638969375, + "learning_rate": 3.104806202498924e-05, + "loss": 0.2376, + "step": 1395 + }, + { + "epoch": 2.21939586645469, + "grad_norm": 1.338525473265953, + "learning_rate": 3.105065638397975e-05, + "loss": 0.1535, + "step": 1396 + }, + { + "epoch": 2.220985691573927, + "grad_norm": 1.2636598834016204, + "learning_rate": 3.1053253771999016e-05, + "loss": 0.2016, + "step": 1397 + }, + { + "epoch": 2.2225755166931638, + "grad_norm": 2.2258591390768534, + "learning_rate": 3.105585418816792e-05, + "loss": 0.2009, + "step": 1398 + }, + { + "epoch": 2.2241653418124008, + "grad_norm": 2.1742096409072453, + "learning_rate": 3.105845763160632e-05, + "loss": 0.2063, + "step": 1399 + }, + { + "epoch": 2.2257551669316373, + "grad_norm": 1.4436772759142573, + "learning_rate": 3.106106410143304e-05, + "loss": 0.2107, + "step": 1400 + }, + { + "epoch": 2.2273449920508743, + "grad_norm": 1.8040607557530957, + "learning_rate": 3.106367359676588e-05, + "loss": 0.8534, + "step": 1401 + }, + { + "epoch": 2.2289348171701113, + "grad_norm": 9.457669343721667, + "learning_rate": 3.106628611672163e-05, + "loss": 7.0631, + "step": 1402 + }, + { + "epoch": 2.2305246422893483, + "grad_norm": 2.24096394779888, + "learning_rate": 3.106890166041604e-05, + "loss": 0.2575, + "step": 1403 + }, + { + "epoch": 2.232114467408585, + "grad_norm": 29.494664006121866, + "learning_rate": 3.107152022696384e-05, + "loss": 25.3667, + "step": 1404 + }, + { + "epoch": 2.233704292527822, + "grad_norm": 2.3338503820567937, + "learning_rate": 3.107414181547875e-05, + "loss": 0.2012, + "step": 1405 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 1.2334326685402701, + "learning_rate": 3.107676642507345e-05, + "loss": 0.1994, + "step": 1406 + }, + { + "epoch": 2.236883942766296, + "grad_norm": 1.3481853582852126, + "learning_rate": 3.10793940548596e-05, + "loss": 0.2168, + "step": 1407 + }, + { + "epoch": 2.2384737678855324, + "grad_norm": 1.449509800560888, + "learning_rate": 3.108202470394786e-05, + "loss": 0.1952, + "step": 1408 + }, + { + "epoch": 2.2400635930047694, + "grad_norm": 1.8300766146417315, + "learning_rate": 3.108465837144784e-05, + "loss": 0.1902, + "step": 1409 + }, + { + "epoch": 2.2416534181240064, + "grad_norm": 1.769688151517097, + "learning_rate": 3.108729505646813e-05, + "loss": 0.176, + "step": 1410 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 1.4020338307777473, + "learning_rate": 3.1089934758116316e-05, + "loss": 0.1532, + "step": 1411 + }, + { + "epoch": 2.24483306836248, + "grad_norm": 1.4975358502023568, + "learning_rate": 3.1092577475498965e-05, + "loss": 0.2138, + "step": 1412 + }, + { + "epoch": 2.246422893481717, + "grad_norm": 1.4443855686176623, + "learning_rate": 3.10952232077216e-05, + "loss": 0.1922, + "step": 1413 + }, + { + "epoch": 2.248012718600954, + "grad_norm": 2.054295804059935, + "learning_rate": 3.109787195388874e-05, + "loss": 0.146, + "step": 1414 + }, + { + "epoch": 2.249602543720191, + "grad_norm": 1.4258794369642493, + "learning_rate": 3.110052371310387e-05, + "loss": 0.2534, + "step": 1415 + }, + { + "epoch": 2.2511923688394275, + "grad_norm": 1.2449009899718306, + "learning_rate": 3.110317848446948e-05, + "loss": 0.1951, + "step": 1416 + }, + { + "epoch": 2.2527821939586645, + "grad_norm": 4.49563701828563, + "learning_rate": 3.110583626708703e-05, + "loss": 0.2438, + "step": 1417 + }, + { + "epoch": 2.2543720190779015, + "grad_norm": 1.121447415027865, + "learning_rate": 3.110849706005694e-05, + "loss": 0.197, + "step": 1418 + }, + { + "epoch": 2.255961844197138, + "grad_norm": 2.3025981361064303, + "learning_rate": 3.111116086247864e-05, + "loss": 0.212, + "step": 1419 + }, + { + "epoch": 2.257551669316375, + "grad_norm": 1.6567050848724652, + "learning_rate": 3.111382767345051e-05, + "loss": 0.2821, + "step": 1420 + }, + { + "epoch": 2.259141494435612, + "grad_norm": 1.65254330309698, + "learning_rate": 3.1116497492069965e-05, + "loss": 0.1978, + "step": 1421 + }, + { + "epoch": 2.260731319554849, + "grad_norm": 0.9328317215158144, + "learning_rate": 3.111917031743333e-05, + "loss": 0.1586, + "step": 1422 + }, + { + "epoch": 2.262321144674086, + "grad_norm": 3.9300092505161173, + "learning_rate": 3.112184614863599e-05, + "loss": 0.2695, + "step": 1423 + }, + { + "epoch": 2.2639109697933226, + "grad_norm": 1.1284450729805386, + "learning_rate": 3.1124524984772236e-05, + "loss": 0.2547, + "step": 1424 + }, + { + "epoch": 2.2655007949125596, + "grad_norm": 1.1805631868626294, + "learning_rate": 3.112720682493541e-05, + "loss": 0.1871, + "step": 1425 + }, + { + "epoch": 2.2670906200317966, + "grad_norm": 0.8668640445889406, + "learning_rate": 3.1129891668217784e-05, + "loss": 0.1531, + "step": 1426 + }, + { + "epoch": 2.268680445151033, + "grad_norm": 1.192741584684383, + "learning_rate": 3.113257951371064e-05, + "loss": 0.1734, + "step": 1427 + }, + { + "epoch": 2.27027027027027, + "grad_norm": 1.138895563046949, + "learning_rate": 3.1135270360504254e-05, + "loss": 0.1943, + "step": 1428 + }, + { + "epoch": 2.271860095389507, + "grad_norm": 1.1522008460207795, + "learning_rate": 3.113796420768786e-05, + "loss": 0.2182, + "step": 1429 + }, + { + "epoch": 2.273449920508744, + "grad_norm": 0.8063615699751957, + "learning_rate": 3.1140661054349684e-05, + "loss": 0.1991, + "step": 1430 + }, + { + "epoch": 2.275039745627981, + "grad_norm": 1.6500668772821634, + "learning_rate": 3.114336089957694e-05, + "loss": 0.1959, + "step": 1431 + }, + { + "epoch": 2.2766295707472177, + "grad_norm": 0.911827838077677, + "learning_rate": 3.114606374245584e-05, + "loss": 0.1994, + "step": 1432 + }, + { + "epoch": 2.2782193958664547, + "grad_norm": 1.1650142407170838, + "learning_rate": 3.114876958207157e-05, + "loss": 0.172, + "step": 1433 + }, + { + "epoch": 2.2798092209856917, + "grad_norm": 1.1770580807873885, + "learning_rate": 3.115147841750829e-05, + "loss": 0.2057, + "step": 1434 + }, + { + "epoch": 2.2813990461049283, + "grad_norm": 1.029010642784493, + "learning_rate": 3.115419024784916e-05, + "loss": 0.1747, + "step": 1435 + }, + { + "epoch": 2.2829888712241653, + "grad_norm": 1.3165787016867403, + "learning_rate": 3.1156905072176335e-05, + "loss": 0.1619, + "step": 1436 + }, + { + "epoch": 2.2845786963434023, + "grad_norm": 3.9147528399400153, + "learning_rate": 3.115962288957092e-05, + "loss": 0.2271, + "step": 1437 + }, + { + "epoch": 2.2861685214626393, + "grad_norm": 0.9777591782951672, + "learning_rate": 3.116234369911307e-05, + "loss": 0.1541, + "step": 1438 + }, + { + "epoch": 2.287758346581876, + "grad_norm": 1.495771594125863, + "learning_rate": 3.1165067499881854e-05, + "loss": 0.2517, + "step": 1439 + }, + { + "epoch": 2.289348171701113, + "grad_norm": 13.44438587844138, + "learning_rate": 3.116779429095538e-05, + "loss": 13.3598, + "step": 1440 + }, + { + "epoch": 2.29093799682035, + "grad_norm": 0.7960710683865352, + "learning_rate": 3.117052407141073e-05, + "loss": 0.1753, + "step": 1441 + }, + { + "epoch": 2.292527821939587, + "grad_norm": 1.3534450241088505, + "learning_rate": 3.117325684032397e-05, + "loss": 0.1832, + "step": 1442 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 1.613919394331104, + "learning_rate": 3.117599259677015e-05, + "loss": 0.2604, + "step": 1443 + }, + { + "epoch": 2.2957074721780604, + "grad_norm": 1.52159447878477, + "learning_rate": 3.117873133982332e-05, + "loss": 0.1978, + "step": 1444 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 1.0298035677691213, + "learning_rate": 3.118147306855653e-05, + "loss": 0.1993, + "step": 1445 + }, + { + "epoch": 2.2988871224165344, + "grad_norm": 1.6253228994815916, + "learning_rate": 3.118421778204176e-05, + "loss": 0.1908, + "step": 1446 + }, + { + "epoch": 2.300476947535771, + "grad_norm": 1.5852938545374853, + "learning_rate": 3.118696547935008e-05, + "loss": 0.1759, + "step": 1447 + }, + { + "epoch": 2.302066772655008, + "grad_norm": 1.476417529322313, + "learning_rate": 3.118971615955146e-05, + "loss": 0.1843, + "step": 1448 + }, + { + "epoch": 2.303656597774245, + "grad_norm": 1.3566778600157008, + "learning_rate": 3.1192469821714894e-05, + "loss": 0.1337, + "step": 1449 + }, + { + "epoch": 2.3052464228934815, + "grad_norm": 1.241183457423938, + "learning_rate": 3.119522646490838e-05, + "loss": 0.258, + "step": 1450 + }, + { + "epoch": 2.3068362480127185, + "grad_norm": 1.0807210567397258, + "learning_rate": 3.119798608819889e-05, + "loss": 0.2256, + "step": 1451 + }, + { + "epoch": 2.3084260731319555, + "grad_norm": 0.8586670655980936, + "learning_rate": 3.120074869065238e-05, + "loss": 0.1914, + "step": 1452 + }, + { + "epoch": 2.3100158982511925, + "grad_norm": 1.9377225721977147, + "learning_rate": 3.120351427133383e-05, + "loss": 0.2266, + "step": 1453 + }, + { + "epoch": 2.3116057233704295, + "grad_norm": 1.074866897397345, + "learning_rate": 3.120628282930719e-05, + "loss": 0.1655, + "step": 1454 + }, + { + "epoch": 2.313195548489666, + "grad_norm": 1.9670171516378898, + "learning_rate": 3.120905436363537e-05, + "loss": 0.2426, + "step": 1455 + }, + { + "epoch": 2.314785373608903, + "grad_norm": 2.139449917829324, + "learning_rate": 3.1211828873380356e-05, + "loss": 0.1572, + "step": 1456 + }, + { + "epoch": 2.31637519872814, + "grad_norm": 2.063541341841825, + "learning_rate": 3.121460635760302e-05, + "loss": 0.2264, + "step": 1457 + }, + { + "epoch": 2.3179650238473766, + "grad_norm": 2.4224371702730187, + "learning_rate": 3.121738681536333e-05, + "loss": 0.2049, + "step": 1458 + }, + { + "epoch": 2.3195548489666136, + "grad_norm": 3.62872067157744, + "learning_rate": 3.12201702457202e-05, + "loss": 0.2589, + "step": 1459 + }, + { + "epoch": 2.3211446740858506, + "grad_norm": 1.4396023571567318, + "learning_rate": 3.122295664773151e-05, + "loss": 0.18, + "step": 1460 + }, + { + "epoch": 2.3227344992050876, + "grad_norm": 3.103519046325617, + "learning_rate": 3.122574602045418e-05, + "loss": 0.261, + "step": 1461 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 2.5467438977510026, + "learning_rate": 3.1228538362944116e-05, + "loss": 0.1827, + "step": 1462 + }, + { + "epoch": 2.325914149443561, + "grad_norm": 1.9904982085596932, + "learning_rate": 3.1231333674256194e-05, + "loss": 0.2013, + "step": 1463 + }, + { + "epoch": 2.327503974562798, + "grad_norm": 1.373483581934219, + "learning_rate": 3.123413195344432e-05, + "loss": 0.1924, + "step": 1464 + }, + { + "epoch": 2.329093799682035, + "grad_norm": 1.1076693552105425, + "learning_rate": 3.123693319956137e-05, + "loss": 0.1964, + "step": 1465 + }, + { + "epoch": 2.3306836248012717, + "grad_norm": 3.007703511021293, + "learning_rate": 3.123973741165922e-05, + "loss": 0.2712, + "step": 1466 + }, + { + "epoch": 2.3322734499205087, + "grad_norm": 1.7664803715549058, + "learning_rate": 3.124254458878874e-05, + "loss": 0.1569, + "step": 1467 + }, + { + "epoch": 2.3338632750397457, + "grad_norm": 1.7564175490661225, + "learning_rate": 3.124535472999982e-05, + "loss": 0.2135, + "step": 1468 + }, + { + "epoch": 2.3354531001589827, + "grad_norm": 1.4712268378911442, + "learning_rate": 3.1248167834341324e-05, + "loss": 0.2375, + "step": 1469 + }, + { + "epoch": 2.337042925278219, + "grad_norm": 1.3563940373821437, + "learning_rate": 3.125098390086111e-05, + "loss": 0.252, + "step": 1470 + }, + { + "epoch": 2.338632750397456, + "grad_norm": 1.465444794929157, + "learning_rate": 3.125380292860604e-05, + "loss": 0.2565, + "step": 1471 + }, + { + "epoch": 2.340222575516693, + "grad_norm": 1.723767544562639, + "learning_rate": 3.125662491662199e-05, + "loss": 0.1954, + "step": 1472 + }, + { + "epoch": 2.34181240063593, + "grad_norm": 1.6142840453062304, + "learning_rate": 3.125944986395381e-05, + "loss": 0.1923, + "step": 1473 + }, + { + "epoch": 2.3434022257551668, + "grad_norm": 1.2379579648768835, + "learning_rate": 3.1262277769645345e-05, + "loss": 0.1736, + "step": 1474 + }, + { + "epoch": 2.3449920508744038, + "grad_norm": 1.2350038003794837, + "learning_rate": 3.1265108632739475e-05, + "loss": 0.1613, + "step": 1475 + }, + { + "epoch": 2.3465818759936408, + "grad_norm": 1.3831981324879117, + "learning_rate": 3.126794245227805e-05, + "loss": 0.2191, + "step": 1476 + }, + { + "epoch": 2.3481717011128778, + "grad_norm": 2.1957354071496913, + "learning_rate": 3.12707792273019e-05, + "loss": 0.219, + "step": 1477 + }, + { + "epoch": 2.3497615262321143, + "grad_norm": 1.878883005683085, + "learning_rate": 3.127361895685091e-05, + "loss": 0.2139, + "step": 1478 + }, + { + "epoch": 2.3513513513513513, + "grad_norm": 0.8345237246048842, + "learning_rate": 3.127646163996393e-05, + "loss": 0.1472, + "step": 1479 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 1.2364929495587558, + "learning_rate": 3.1279307275678795e-05, + "loss": 0.1458, + "step": 1480 + }, + { + "epoch": 2.3545310015898253, + "grad_norm": 0.9867351119302167, + "learning_rate": 3.128215586303238e-05, + "loss": 0.234, + "step": 1481 + }, + { + "epoch": 2.356120826709062, + "grad_norm": 1.223627732869825, + "learning_rate": 3.128500740106052e-05, + "loss": 0.1667, + "step": 1482 + }, + { + "epoch": 2.357710651828299, + "grad_norm": 1.2289676538415812, + "learning_rate": 3.1287861888798105e-05, + "loss": 0.1516, + "step": 1483 + }, + { + "epoch": 2.359300476947536, + "grad_norm": 1.319048088212486, + "learning_rate": 3.1290719325278975e-05, + "loss": 0.1729, + "step": 1484 + }, + { + "epoch": 2.360890302066773, + "grad_norm": 1.0350250071026756, + "learning_rate": 3.1293579709535983e-05, + "loss": 0.1457, + "step": 1485 + }, + { + "epoch": 2.3624801271860094, + "grad_norm": 1.953397548119439, + "learning_rate": 3.1296443040601005e-05, + "loss": 0.1837, + "step": 1486 + }, + { + "epoch": 2.3640699523052464, + "grad_norm": 1.0521509408943108, + "learning_rate": 3.12993093175049e-05, + "loss": 0.1388, + "step": 1487 + }, + { + "epoch": 2.3656597774244834, + "grad_norm": 1.26756396836801, + "learning_rate": 3.130217853927755e-05, + "loss": 0.1912, + "step": 1488 + }, + { + "epoch": 2.36724960254372, + "grad_norm": 1.9305598386292797, + "learning_rate": 3.130505070494781e-05, + "loss": 0.231, + "step": 1489 + }, + { + "epoch": 2.368839427662957, + "grad_norm": 7.252905728105691, + "learning_rate": 3.130792581354357e-05, + "loss": 0.9758, + "step": 1490 + }, + { + "epoch": 2.370429252782194, + "grad_norm": 1.5431665380552189, + "learning_rate": 3.1310803864091696e-05, + "loss": 0.2032, + "step": 1491 + }, + { + "epoch": 2.372019077901431, + "grad_norm": 2.656444420676739, + "learning_rate": 3.1313684855618095e-05, + "loss": 0.2217, + "step": 1492 + }, + { + "epoch": 2.373608903020668, + "grad_norm": 1.556059919802715, + "learning_rate": 3.1316568787147627e-05, + "loss": 0.1659, + "step": 1493 + }, + { + "epoch": 2.3751987281399045, + "grad_norm": 1.3698426597390472, + "learning_rate": 3.1319455657704205e-05, + "loss": 0.1839, + "step": 1494 + }, + { + "epoch": 2.3767885532591415, + "grad_norm": 3.0423826028284724, + "learning_rate": 3.132234546631072e-05, + "loss": 0.3538, + "step": 1495 + }, + { + "epoch": 2.3783783783783785, + "grad_norm": 1.4872500970599571, + "learning_rate": 3.132523821198908e-05, + "loss": 0.147, + "step": 1496 + }, + { + "epoch": 2.379968203497615, + "grad_norm": 1.3747266674234895, + "learning_rate": 3.132813389376019e-05, + "loss": 0.1832, + "step": 1497 + }, + { + "epoch": 2.381558028616852, + "grad_norm": 1.5486370602213821, + "learning_rate": 3.133103251064397e-05, + "loss": 0.2028, + "step": 1498 + }, + { + "epoch": 2.383147853736089, + "grad_norm": 1.3118235203310677, + "learning_rate": 3.1333934061659345e-05, + "loss": 0.1406, + "step": 1499 + }, + { + "epoch": 2.384737678855326, + "grad_norm": 0.8265749362964889, + "learning_rate": 3.1336838545824255e-05, + "loss": 0.1292, + "step": 1500 + }, + { + "epoch": 2.3863275039745626, + "grad_norm": 1.1049670753997374, + "learning_rate": 3.133974596215561e-05, + "loss": 0.1853, + "step": 1501 + }, + { + "epoch": 2.3879173290937996, + "grad_norm": 1.52507113499075, + "learning_rate": 3.1342656309669384e-05, + "loss": 0.1474, + "step": 1502 + }, + { + "epoch": 2.3895071542130366, + "grad_norm": 1.0958351723191888, + "learning_rate": 3.134556958738051e-05, + "loss": 0.1782, + "step": 1503 + }, + { + "epoch": 2.3910969793322736, + "grad_norm": 1.2887028391507356, + "learning_rate": 3.1348485794302956e-05, + "loss": 0.1773, + "step": 1504 + }, + { + "epoch": 2.39268680445151, + "grad_norm": 0.9680746236960877, + "learning_rate": 3.13514049294497e-05, + "loss": 0.2013, + "step": 1505 + }, + { + "epoch": 2.394276629570747, + "grad_norm": 3.4367125383495685, + "learning_rate": 3.135432699183269e-05, + "loss": 0.1734, + "step": 1506 + }, + { + "epoch": 2.395866454689984, + "grad_norm": 1.057120088249028, + "learning_rate": 3.1357251980462956e-05, + "loss": 0.2048, + "step": 1507 + }, + { + "epoch": 2.397456279809221, + "grad_norm": 1.1650375662031185, + "learning_rate": 3.1360179894350465e-05, + "loss": 0.1992, + "step": 1508 + }, + { + "epoch": 2.3990461049284577, + "grad_norm": 1.7979175555589268, + "learning_rate": 3.136311073250424e-05, + "loss": 0.1266, + "step": 1509 + }, + { + "epoch": 2.4006359300476947, + "grad_norm": 1.1944252438466139, + "learning_rate": 3.136604449393228e-05, + "loss": 0.1409, + "step": 1510 + }, + { + "epoch": 2.4022257551669317, + "grad_norm": 30.518064575713485, + "learning_rate": 3.136898117764164e-05, + "loss": 26.6968, + "step": 1511 + }, + { + "epoch": 2.4038155802861687, + "grad_norm": 28.21925598666198, + "learning_rate": 3.1371920782638336e-05, + "loss": 26.635, + "step": 1512 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 2.0987458298031303, + "learning_rate": 3.137486330792742e-05, + "loss": 0.2136, + "step": 1513 + }, + { + "epoch": 2.4069952305246423, + "grad_norm": 1.0021039433603938, + "learning_rate": 3.137780875251297e-05, + "loss": 0.2344, + "step": 1514 + }, + { + "epoch": 2.4085850556438793, + "grad_norm": 1.1394833635766866, + "learning_rate": 3.138075711539805e-05, + "loss": 0.1472, + "step": 1515 + }, + { + "epoch": 2.4101748807631163, + "grad_norm": 0.9624810031376568, + "learning_rate": 3.138370839558474e-05, + "loss": 0.18, + "step": 1516 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 1.2790589209620769, + "learning_rate": 3.138666259207415e-05, + "loss": 0.2613, + "step": 1517 + }, + { + "epoch": 2.41335453100159, + "grad_norm": 1.1680169508386977, + "learning_rate": 3.138961970386638e-05, + "loss": 0.1454, + "step": 1518 + }, + { + "epoch": 2.414944356120827, + "grad_norm": 0.8273120941151018, + "learning_rate": 3.1392579729960564e-05, + "loss": 0.1644, + "step": 1519 + }, + { + "epoch": 2.4165341812400634, + "grad_norm": 1.150587108819813, + "learning_rate": 3.139554266935484e-05, + "loss": 0.1579, + "step": 1520 + }, + { + "epoch": 2.4181240063593004, + "grad_norm": 34.95835080594549, + "learning_rate": 3.1398508521046344e-05, + "loss": 25.178, + "step": 1521 + }, + { + "epoch": 2.4197138314785374, + "grad_norm": 0.8969091537680663, + "learning_rate": 3.1401477284031273e-05, + "loss": 0.1639, + "step": 1522 + }, + { + "epoch": 2.4213036565977744, + "grad_norm": 34.49501627345301, + "learning_rate": 3.140444895730478e-05, + "loss": 24.3633, + "step": 1523 + }, + { + "epoch": 2.4228934817170114, + "grad_norm": 0.8764485747777683, + "learning_rate": 3.140742353986106e-05, + "loss": 0.1497, + "step": 1524 + }, + { + "epoch": 2.424483306836248, + "grad_norm": 0.6552112156199094, + "learning_rate": 3.141040103069335e-05, + "loss": 0.1249, + "step": 1525 + }, + { + "epoch": 2.426073131955485, + "grad_norm": 1.1845888791516181, + "learning_rate": 3.141338142879387e-05, + "loss": 0.1936, + "step": 1526 + }, + { + "epoch": 2.427662957074722, + "grad_norm": 2.4087522854115533, + "learning_rate": 3.141636473315384e-05, + "loss": 0.2414, + "step": 1527 + }, + { + "epoch": 2.4292527821939585, + "grad_norm": 1.0040074881775796, + "learning_rate": 3.1419350942763557e-05, + "loss": 0.2171, + "step": 1528 + }, + { + "epoch": 2.4308426073131955, + "grad_norm": 1.130652196648176, + "learning_rate": 3.142234005661226e-05, + "loss": 0.191, + "step": 1529 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 1.140773132991218, + "learning_rate": 3.142533207368826e-05, + "loss": 0.1944, + "step": 1530 + }, + { + "epoch": 2.4340222575516695, + "grad_norm": 0.9822099132244579, + "learning_rate": 3.1428326992978875e-05, + "loss": 0.2321, + "step": 1531 + }, + { + "epoch": 2.435612082670906, + "grad_norm": 0.9373274885012641, + "learning_rate": 3.143132481347042e-05, + "loss": 0.1313, + "step": 1532 + }, + { + "epoch": 2.437201907790143, + "grad_norm": 0.8130735396798979, + "learning_rate": 3.143432553414825e-05, + "loss": 0.1926, + "step": 1533 + }, + { + "epoch": 2.43879173290938, + "grad_norm": 2.3315274090712337, + "learning_rate": 3.143732915399672e-05, + "loss": 0.3846, + "step": 1534 + }, + { + "epoch": 2.440381558028617, + "grad_norm": 1.0935092122841343, + "learning_rate": 3.144033567199922e-05, + "loss": 0.1562, + "step": 1535 + }, + { + "epoch": 2.4419713831478536, + "grad_norm": 1.43334673121355, + "learning_rate": 3.1443345087138156e-05, + "loss": 0.188, + "step": 1536 + }, + { + "epoch": 2.4435612082670906, + "grad_norm": 0.9968599019221744, + "learning_rate": 3.144635739839493e-05, + "loss": 0.1672, + "step": 1537 + }, + { + "epoch": 2.4451510333863276, + "grad_norm": 1.319604075396449, + "learning_rate": 3.144937260475001e-05, + "loss": 0.1569, + "step": 1538 + }, + { + "epoch": 2.4467408585055646, + "grad_norm": 1.127694113037551, + "learning_rate": 3.145239070518285e-05, + "loss": 0.1736, + "step": 1539 + }, + { + "epoch": 2.448330683624801, + "grad_norm": 1.312037599884244, + "learning_rate": 3.145541169867192e-05, + "loss": 0.1597, + "step": 1540 + }, + { + "epoch": 2.449920508744038, + "grad_norm": 2.1475541168287817, + "learning_rate": 3.145843558419474e-05, + "loss": 0.2302, + "step": 1541 + }, + { + "epoch": 2.451510333863275, + "grad_norm": 1.4954514035933968, + "learning_rate": 3.146146236072783e-05, + "loss": 0.2064, + "step": 1542 + }, + { + "epoch": 2.453100158982512, + "grad_norm": 1.0531233797293011, + "learning_rate": 3.1464492027246734e-05, + "loss": 0.1396, + "step": 1543 + }, + { + "epoch": 2.4546899841017487, + "grad_norm": 1.2449728626300245, + "learning_rate": 3.1467524582726e-05, + "loss": 0.1279, + "step": 1544 + }, + { + "epoch": 2.4562798092209857, + "grad_norm": 1.1274244747338644, + "learning_rate": 3.147056002613925e-05, + "loss": 0.1711, + "step": 1545 + }, + { + "epoch": 2.4578696343402227, + "grad_norm": 1.7982378255027534, + "learning_rate": 3.147359835645908e-05, + "loss": 0.169, + "step": 1546 + }, + { + "epoch": 2.4594594594594597, + "grad_norm": 0.8689095376740332, + "learning_rate": 3.1476639572657125e-05, + "loss": 0.1984, + "step": 1547 + }, + { + "epoch": 2.461049284578696, + "grad_norm": 1.0218448093979877, + "learning_rate": 3.147968367370404e-05, + "loss": 0.1839, + "step": 1548 + }, + { + "epoch": 2.462639109697933, + "grad_norm": 1.5260519817874239, + "learning_rate": 3.1482730658569524e-05, + "loss": 0.19, + "step": 1549 + }, + { + "epoch": 2.46422893481717, + "grad_norm": 27.353099202814303, + "learning_rate": 3.148578052622227e-05, + "loss": 25.1552, + "step": 1550 + }, + { + "epoch": 2.4658187599364068, + "grad_norm": 1.2370950473362627, + "learning_rate": 3.148883327563e-05, + "loss": 0.1732, + "step": 1551 + }, + { + "epoch": 2.4674085850556438, + "grad_norm": 1.782406424734523, + "learning_rate": 3.1491888905759486e-05, + "loss": 0.2094, + "step": 1552 + }, + { + "epoch": 2.4689984101748808, + "grad_norm": 4.058523803805113, + "learning_rate": 3.1494947415576506e-05, + "loss": 0.3051, + "step": 1553 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 1.7735133259534277, + "learning_rate": 3.1498008804045846e-05, + "loss": 0.1888, + "step": 1554 + }, + { + "epoch": 2.4721780604133547, + "grad_norm": 1.3410614826828207, + "learning_rate": 3.150107307013136e-05, + "loss": 0.1547, + "step": 1555 + }, + { + "epoch": 2.4737678855325913, + "grad_norm": 3.1654860002357523, + "learning_rate": 3.150414021279589e-05, + "loss": 0.2041, + "step": 1556 + }, + { + "epoch": 2.4753577106518283, + "grad_norm": 2.1675021409470148, + "learning_rate": 3.150721023100133e-05, + "loss": 0.2697, + "step": 1557 + }, + { + "epoch": 2.4769475357710653, + "grad_norm": 1.359752284577795, + "learning_rate": 3.151028312370859e-05, + "loss": 0.1854, + "step": 1558 + }, + { + "epoch": 2.478537360890302, + "grad_norm": 1.7383788340055064, + "learning_rate": 3.151335888987759e-05, + "loss": 0.1989, + "step": 1559 + }, + { + "epoch": 2.480127186009539, + "grad_norm": 1.836117580380345, + "learning_rate": 3.1516437528467315e-05, + "loss": 0.1769, + "step": 1560 + }, + { + "epoch": 2.481717011128776, + "grad_norm": 1.8333286667868176, + "learning_rate": 3.151951903843574e-05, + "loss": 0.1817, + "step": 1561 + }, + { + "epoch": 2.483306836248013, + "grad_norm": 2.120279631244857, + "learning_rate": 3.15226034187399e-05, + "loss": 0.1772, + "step": 1562 + }, + { + "epoch": 2.48489666136725, + "grad_norm": 2.741404407263717, + "learning_rate": 3.152569066833584e-05, + "loss": 0.1939, + "step": 1563 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 1.8914845041859072, + "learning_rate": 3.1528780786178625e-05, + "loss": 0.146, + "step": 1564 + }, + { + "epoch": 2.4880763116057234, + "grad_norm": 1.5340942829081892, + "learning_rate": 3.153187377122239e-05, + "loss": 0.1939, + "step": 1565 + }, + { + "epoch": 2.4896661367249604, + "grad_norm": 1.7439342919233336, + "learning_rate": 3.153496962242024e-05, + "loss": 0.1876, + "step": 1566 + }, + { + "epoch": 2.491255961844197, + "grad_norm": 1.61316043881466, + "learning_rate": 3.1538068338724364e-05, + "loss": 0.1702, + "step": 1567 + }, + { + "epoch": 2.492845786963434, + "grad_norm": 2.6287178758888725, + "learning_rate": 3.1541169919085945e-05, + "loss": 0.2057, + "step": 1568 + }, + { + "epoch": 2.494435612082671, + "grad_norm": 1.3821009699608557, + "learning_rate": 3.1544274362455215e-05, + "loss": 0.1686, + "step": 1569 + }, + { + "epoch": 2.496025437201908, + "grad_norm": 2.3644522206373892, + "learning_rate": 3.154738166778144e-05, + "loss": 0.2875, + "step": 1570 + }, + { + "epoch": 2.4976152623211445, + "grad_norm": 1.824177878410893, + "learning_rate": 3.1550491834012896e-05, + "loss": 0.2011, + "step": 1571 + }, + { + "epoch": 2.4992050874403815, + "grad_norm": 1.7540490819874688, + "learning_rate": 3.155360486009692e-05, + "loss": 0.1459, + "step": 1572 + }, + { + "epoch": 2.5007949125596185, + "grad_norm": 1.1836154367270562, + "learning_rate": 3.1556720744979846e-05, + "loss": 0.174, + "step": 1573 + }, + { + "epoch": 2.502384737678855, + "grad_norm": 1.971247818786658, + "learning_rate": 3.155983948760708e-05, + "loss": 0.164, + "step": 1574 + }, + { + "epoch": 2.503974562798092, + "grad_norm": 2.259937106561657, + "learning_rate": 3.1562961086923025e-05, + "loss": 0.1838, + "step": 1575 + }, + { + "epoch": 2.505564387917329, + "grad_norm": 1.3556363011844528, + "learning_rate": 3.156608554187115e-05, + "loss": 0.159, + "step": 1576 + }, + { + "epoch": 2.507154213036566, + "grad_norm": 1.6360385009731324, + "learning_rate": 3.1569212851393915e-05, + "loss": 0.221, + "step": 1577 + }, + { + "epoch": 2.508744038155803, + "grad_norm": 2.086187548519359, + "learning_rate": 3.157234301443286e-05, + "loss": 0.1374, + "step": 1578 + }, + { + "epoch": 2.5103338632750396, + "grad_norm": 1.0095470304857215, + "learning_rate": 3.1575476029928524e-05, + "loss": 0.1724, + "step": 1579 + }, + { + "epoch": 2.5119236883942766, + "grad_norm": 1.6433520231243424, + "learning_rate": 3.157861189682051e-05, + "loss": 0.232, + "step": 1580 + }, + { + "epoch": 2.5135135135135136, + "grad_norm": 1.0328133474390058, + "learning_rate": 3.1581750614047434e-05, + "loss": 0.1761, + "step": 1581 + }, + { + "epoch": 2.51510333863275, + "grad_norm": 1.2690146738591945, + "learning_rate": 3.158489218054693e-05, + "loss": 0.2007, + "step": 1582 + }, + { + "epoch": 2.516693163751987, + "grad_norm": 1.246144047347006, + "learning_rate": 3.1588036595255746e-05, + "loss": 0.1999, + "step": 1583 + }, + { + "epoch": 2.518282988871224, + "grad_norm": 1.0850788097137907, + "learning_rate": 3.159118385710955e-05, + "loss": 0.1901, + "step": 1584 + }, + { + "epoch": 2.519872813990461, + "grad_norm": 0.8284641994965761, + "learning_rate": 3.159433396504316e-05, + "loss": 0.1658, + "step": 1585 + }, + { + "epoch": 2.521462639109698, + "grad_norm": 0.9720658936117254, + "learning_rate": 3.1597486917990346e-05, + "loss": 0.1578, + "step": 1586 + }, + { + "epoch": 2.5230524642289347, + "grad_norm": 1.840114119924228, + "learning_rate": 3.1600642714883954e-05, + "loss": 0.221, + "step": 1587 + }, + { + "epoch": 2.5246422893481717, + "grad_norm": 2.377086664673885, + "learning_rate": 3.1603801354655866e-05, + "loss": 0.1853, + "step": 1588 + }, + { + "epoch": 2.5262321144674087, + "grad_norm": 2.0315889551445405, + "learning_rate": 3.1606962836237004e-05, + "loss": 0.1776, + "step": 1589 + }, + { + "epoch": 2.5278219395866453, + "grad_norm": 1.9891518136174857, + "learning_rate": 3.1610127158557295e-05, + "loss": 0.1738, + "step": 1590 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 1.8688675727531665, + "learning_rate": 3.161329432054576e-05, + "loss": 0.2659, + "step": 1591 + }, + { + "epoch": 2.5310015898251192, + "grad_norm": 1.7867401394045739, + "learning_rate": 3.161646432113042e-05, + "loss": 0.1975, + "step": 1592 + }, + { + "epoch": 2.5325914149443562, + "grad_norm": 2.171421626126862, + "learning_rate": 3.161963715923833e-05, + "loss": 0.242, + "step": 1593 + }, + { + "epoch": 2.5341812400635932, + "grad_norm": 2.6943614232804096, + "learning_rate": 3.1622812833795616e-05, + "loss": 0.2038, + "step": 1594 + }, + { + "epoch": 2.53577106518283, + "grad_norm": 0.8935098063562663, + "learning_rate": 3.1625991343727414e-05, + "loss": 0.2242, + "step": 1595 + }, + { + "epoch": 2.537360890302067, + "grad_norm": 1.7616556618538113, + "learning_rate": 3.162917268795793e-05, + "loss": 0.1934, + "step": 1596 + }, + { + "epoch": 2.538950715421304, + "grad_norm": 1.6185284022114828, + "learning_rate": 3.163235686541038e-05, + "loss": 0.1785, + "step": 1597 + }, + { + "epoch": 2.5405405405405403, + "grad_norm": 2.3720044892681322, + "learning_rate": 3.163554387500705e-05, + "loss": 0.2387, + "step": 1598 + }, + { + "epoch": 2.5421303656597773, + "grad_norm": 1.1746212556196753, + "learning_rate": 3.1638733715669226e-05, + "loss": 0.1546, + "step": 1599 + }, + { + "epoch": 2.5437201907790143, + "grad_norm": 2.0582307775588706, + "learning_rate": 3.16419263863173e-05, + "loss": 0.1576, + "step": 1600 + }, + { + "epoch": 2.5453100158982513, + "grad_norm": 1.425021499342463, + "learning_rate": 3.164512188587064e-05, + "loss": 0.15, + "step": 1601 + }, + { + "epoch": 2.5468998410174883, + "grad_norm": 4.712356314712078, + "learning_rate": 3.164832021324768e-05, + "loss": 0.5045, + "step": 1602 + }, + { + "epoch": 2.548489666136725, + "grad_norm": 1.1145134267667107, + "learning_rate": 3.165152136736593e-05, + "loss": 0.1196, + "step": 1603 + }, + { + "epoch": 2.550079491255962, + "grad_norm": 1.5850706192031563, + "learning_rate": 3.16547253471419e-05, + "loss": 0.2033, + "step": 1604 + }, + { + "epoch": 2.551669316375199, + "grad_norm": 1.1366371495805372, + "learning_rate": 3.165793215149116e-05, + "loss": 0.1293, + "step": 1605 + }, + { + "epoch": 2.5532591414944354, + "grad_norm": 0.9952876982851481, + "learning_rate": 3.1661141779328316e-05, + "loss": 0.1959, + "step": 1606 + }, + { + "epoch": 2.5548489666136724, + "grad_norm": 1.2205563642082213, + "learning_rate": 3.1664354229567046e-05, + "loss": 0.1932, + "step": 1607 + }, + { + "epoch": 2.5564387917329094, + "grad_norm": 1.2206997124155936, + "learning_rate": 3.1667569501120016e-05, + "loss": 0.2295, + "step": 1608 + }, + { + "epoch": 2.5580286168521464, + "grad_norm": 0.8037883586730251, + "learning_rate": 3.167078759289901e-05, + "loss": 0.2214, + "step": 1609 + }, + { + "epoch": 2.559618441971383, + "grad_norm": 0.9903893497716555, + "learning_rate": 3.1674008503814794e-05, + "loss": 0.1528, + "step": 1610 + }, + { + "epoch": 2.56120826709062, + "grad_norm": 1.1079250554104005, + "learning_rate": 3.167723223277722e-05, + "loss": 0.2337, + "step": 1611 + }, + { + "epoch": 2.562798092209857, + "grad_norm": 0.805272238731337, + "learning_rate": 3.168045877869518e-05, + "loss": 0.2316, + "step": 1612 + }, + { + "epoch": 2.5643879173290935, + "grad_norm": 0.9296334785312509, + "learning_rate": 3.168368814047658e-05, + "loss": 0.146, + "step": 1613 + }, + { + "epoch": 2.5659777424483305, + "grad_norm": 1.1095332688567827, + "learning_rate": 3.168692031702842e-05, + "loss": 0.137, + "step": 1614 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 1.195186347935812, + "learning_rate": 3.169015530725672e-05, + "loss": 0.1781, + "step": 1615 + }, + { + "epoch": 2.5691573926868045, + "grad_norm": 1.2188215958285262, + "learning_rate": 3.169339311006655e-05, + "loss": 0.2506, + "step": 1616 + }, + { + "epoch": 2.5707472178060415, + "grad_norm": 1.8952347764631603, + "learning_rate": 3.1696633724362025e-05, + "loss": 0.1658, + "step": 1617 + }, + { + "epoch": 2.572337042925278, + "grad_norm": 1.1303299911350377, + "learning_rate": 3.1699877149046325e-05, + "loss": 0.1467, + "step": 1618 + }, + { + "epoch": 2.573926868044515, + "grad_norm": 1.272771800812583, + "learning_rate": 3.1703123383021666e-05, + "loss": 0.1791, + "step": 1619 + }, + { + "epoch": 2.575516693163752, + "grad_norm": 1.477385369713666, + "learning_rate": 3.170637242518931e-05, + "loss": 0.1206, + "step": 1620 + }, + { + "epoch": 2.5771065182829886, + "grad_norm": 1.1719991964586132, + "learning_rate": 3.170962427444958e-05, + "loss": 0.14, + "step": 1621 + }, + { + "epoch": 2.5786963434022256, + "grad_norm": 1.9668466686565567, + "learning_rate": 3.1712878929701844e-05, + "loss": 0.2481, + "step": 1622 + }, + { + "epoch": 2.5802861685214626, + "grad_norm": 0.7302787952789077, + "learning_rate": 3.171613638984451e-05, + "loss": 0.1544, + "step": 1623 + }, + { + "epoch": 2.5818759936406996, + "grad_norm": 0.9105475085363616, + "learning_rate": 3.171939665377506e-05, + "loss": 0.1962, + "step": 1624 + }, + { + "epoch": 2.5834658187599366, + "grad_norm": 1.2853315261327518, + "learning_rate": 3.172265972039e-05, + "loss": 0.1766, + "step": 1625 + }, + { + "epoch": 2.585055643879173, + "grad_norm": 0.7011321641667215, + "learning_rate": 3.17259255885849e-05, + "loss": 0.1593, + "step": 1626 + }, + { + "epoch": 2.58664546899841, + "grad_norm": 4.172950592456332, + "learning_rate": 3.172919425725438e-05, + "loss": 0.3516, + "step": 1627 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.6970200702320729, + "learning_rate": 3.1732465725292126e-05, + "loss": 0.1466, + "step": 1628 + }, + { + "epoch": 2.5898251192368837, + "grad_norm": 1.3745930167631675, + "learning_rate": 3.173573999159086e-05, + "loss": 0.2465, + "step": 1629 + }, + { + "epoch": 2.5914149443561207, + "grad_norm": 21.45415587211635, + "learning_rate": 3.1739017055042365e-05, + "loss": 12.3912, + "step": 1630 + }, + { + "epoch": 2.5930047694753577, + "grad_norm": 0.6469654376167354, + "learning_rate": 3.174229691453746e-05, + "loss": 0.1966, + "step": 1631 + }, + { + "epoch": 2.5945945945945947, + "grad_norm": 1.2012334629515304, + "learning_rate": 3.174557956896604e-05, + "loss": 0.1595, + "step": 1632 + }, + { + "epoch": 2.5961844197138317, + "grad_norm": 1.067169130312683, + "learning_rate": 3.174886501721705e-05, + "loss": 0.1718, + "step": 1633 + }, + { + "epoch": 2.5977742448330683, + "grad_norm": 1.1754927638047044, + "learning_rate": 3.175215325817848e-05, + "loss": 0.161, + "step": 1634 + }, + { + "epoch": 2.5993640699523053, + "grad_norm": 33.8228033276979, + "learning_rate": 3.1755444290737376e-05, + "loss": 23.8961, + "step": 1635 + }, + { + "epoch": 2.6009538950715423, + "grad_norm": 1.389730707345103, + "learning_rate": 3.175873811377985e-05, + "loss": 0.1651, + "step": 1636 + }, + { + "epoch": 2.602543720190779, + "grad_norm": 1.032629610400898, + "learning_rate": 3.176203472619105e-05, + "loss": 0.2002, + "step": 1637 + }, + { + "epoch": 2.604133545310016, + "grad_norm": 1.8994430286847295, + "learning_rate": 3.176533412685521e-05, + "loss": 0.1939, + "step": 1638 + }, + { + "epoch": 2.605723370429253, + "grad_norm": 1.3019931066620578, + "learning_rate": 3.176863631465559e-05, + "loss": 0.1492, + "step": 1639 + }, + { + "epoch": 2.60731319554849, + "grad_norm": 5.501726750465694, + "learning_rate": 3.177194128847451e-05, + "loss": 0.3948, + "step": 1640 + }, + { + "epoch": 2.6089030206677264, + "grad_norm": 0.8538889161897999, + "learning_rate": 3.177524904719337e-05, + "loss": 0.1582, + "step": 1641 + }, + { + "epoch": 2.6104928457869634, + "grad_norm": 0.6587350500729588, + "learning_rate": 3.177855958969263e-05, + "loss": 0.1789, + "step": 1642 + }, + { + "epoch": 2.6120826709062004, + "grad_norm": 2.3652042751342557, + "learning_rate": 3.1781872914851756e-05, + "loss": 0.2941, + "step": 1643 + }, + { + "epoch": 2.613672496025437, + "grad_norm": 1.486225739504669, + "learning_rate": 3.178518902154933e-05, + "loss": 0.1594, + "step": 1644 + }, + { + "epoch": 2.615262321144674, + "grad_norm": 1.8505554207598498, + "learning_rate": 3.178850790866296e-05, + "loss": 0.1976, + "step": 1645 + }, + { + "epoch": 2.616852146263911, + "grad_norm": 1.1723249654038448, + "learning_rate": 3.179182957506933e-05, + "loss": 0.1911, + "step": 1646 + }, + { + "epoch": 2.618441971383148, + "grad_norm": 1.3728987602444567, + "learning_rate": 3.179515401964417e-05, + "loss": 0.249, + "step": 1647 + }, + { + "epoch": 2.620031796502385, + "grad_norm": 1.0593486516109556, + "learning_rate": 3.1798481241262284e-05, + "loss": 0.1723, + "step": 1648 + }, + { + "epoch": 2.6216216216216215, + "grad_norm": 1.281494858602922, + "learning_rate": 3.1801811238797515e-05, + "loss": 0.2096, + "step": 1649 + }, + { + "epoch": 2.6232114467408585, + "grad_norm": 1.5552829966348858, + "learning_rate": 3.1805144011122795e-05, + "loss": 0.2733, + "step": 1650 + }, + { + "epoch": 2.6248012718600955, + "grad_norm": 0.8680567447248302, + "learning_rate": 3.180847955711008e-05, + "loss": 0.1482, + "step": 1651 + }, + { + "epoch": 2.626391096979332, + "grad_norm": 1.6052048022167813, + "learning_rate": 3.181181787563043e-05, + "loss": 0.1218, + "step": 1652 + }, + { + "epoch": 2.627980922098569, + "grad_norm": 1.5389332831609386, + "learning_rate": 3.181515896555394e-05, + "loss": 0.196, + "step": 1653 + }, + { + "epoch": 2.629570747217806, + "grad_norm": 1.4382613782415516, + "learning_rate": 3.181850282574977e-05, + "loss": 0.1883, + "step": 1654 + }, + { + "epoch": 2.631160572337043, + "grad_norm": 1.78131454821202, + "learning_rate": 3.182184945508613e-05, + "loss": 0.2171, + "step": 1655 + }, + { + "epoch": 2.63275039745628, + "grad_norm": 1.5959026050327163, + "learning_rate": 3.182519885243033e-05, + "loss": 0.167, + "step": 1656 + }, + { + "epoch": 2.6343402225755166, + "grad_norm": 1.4506501146841084, + "learning_rate": 3.182855101664872e-05, + "loss": 0.1826, + "step": 1657 + }, + { + "epoch": 2.6359300476947536, + "grad_norm": 1.2336988576159258, + "learning_rate": 3.183190594660669e-05, + "loss": 0.19, + "step": 1658 + }, + { + "epoch": 2.6375198728139906, + "grad_norm": 1.7568890220037732, + "learning_rate": 3.183526364116874e-05, + "loss": 0.2242, + "step": 1659 + }, + { + "epoch": 2.639109697933227, + "grad_norm": 1.9760264122780529, + "learning_rate": 3.1838624099198396e-05, + "loss": 0.2328, + "step": 1660 + }, + { + "epoch": 2.640699523052464, + "grad_norm": 2.0053734765637574, + "learning_rate": 3.184198731955827e-05, + "loss": 0.1501, + "step": 1661 + }, + { + "epoch": 2.642289348171701, + "grad_norm": 2.258285985704613, + "learning_rate": 3.184535330111005e-05, + "loss": 0.185, + "step": 1662 + }, + { + "epoch": 2.643879173290938, + "grad_norm": 2.3558932735456786, + "learning_rate": 3.1848722042714454e-05, + "loss": 0.2384, + "step": 1663 + }, + { + "epoch": 2.645468998410175, + "grad_norm": 0.8834694005323286, + "learning_rate": 3.185209354323129e-05, + "loss": 0.1489, + "step": 1664 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 1.0315247898130053, + "learning_rate": 3.185546780151943e-05, + "loss": 0.1556, + "step": 1665 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 1.414992440838794, + "learning_rate": 3.1858844816436804e-05, + "loss": 0.1532, + "step": 1666 + }, + { + "epoch": 2.6502384737678857, + "grad_norm": 1.543054409974727, + "learning_rate": 3.1862224586840425e-05, + "loss": 0.1717, + "step": 1667 + }, + { + "epoch": 2.6518282988871222, + "grad_norm": 1.2348687419858817, + "learning_rate": 3.186560711158635e-05, + "loss": 0.1859, + "step": 1668 + }, + { + "epoch": 2.6534181240063592, + "grad_norm": 1.0777269835329621, + "learning_rate": 3.186899238952972e-05, + "loss": 0.1785, + "step": 1669 + }, + { + "epoch": 2.6550079491255962, + "grad_norm": 1.650655142566561, + "learning_rate": 3.1872380419524755e-05, + "loss": 0.1778, + "step": 1670 + }, + { + "epoch": 2.6565977742448332, + "grad_norm": 1.7929541694054434, + "learning_rate": 3.187577120042471e-05, + "loss": 0.1782, + "step": 1671 + }, + { + "epoch": 2.65818759936407, + "grad_norm": 1.435896564322184, + "learning_rate": 3.1879164731081936e-05, + "loss": 0.1707, + "step": 1672 + }, + { + "epoch": 2.659777424483307, + "grad_norm": 1.7621159292367217, + "learning_rate": 3.188256101034785e-05, + "loss": 0.1856, + "step": 1673 + }, + { + "epoch": 2.661367249602544, + "grad_norm": 1.3976184816737527, + "learning_rate": 3.1885960037072935e-05, + "loss": 0.2045, + "step": 1674 + }, + { + "epoch": 2.6629570747217803, + "grad_norm": 1.2330924249145283, + "learning_rate": 3.1889361810106736e-05, + "loss": 0.1586, + "step": 1675 + }, + { + "epoch": 2.6645468998410173, + "grad_norm": 1.3158885034398211, + "learning_rate": 3.1892766328297876e-05, + "loss": 0.1507, + "step": 1676 + }, + { + "epoch": 2.6661367249602543, + "grad_norm": 1.1200144607379563, + "learning_rate": 3.189617359049406e-05, + "loss": 0.1689, + "step": 1677 + }, + { + "epoch": 2.6677265500794913, + "grad_norm": 1.7236185066275653, + "learning_rate": 3.189958359554204e-05, + "loss": 0.2984, + "step": 1678 + }, + { + "epoch": 2.6693163751987283, + "grad_norm": 1.355741214581686, + "learning_rate": 3.1902996342287666e-05, + "loss": 0.2031, + "step": 1679 + }, + { + "epoch": 2.670906200317965, + "grad_norm": 1.5073381933331391, + "learning_rate": 3.1906411829575835e-05, + "loss": 0.1832, + "step": 1680 + }, + { + "epoch": 2.672496025437202, + "grad_norm": 0.8702560004838332, + "learning_rate": 3.190983005625053e-05, + "loss": 0.1488, + "step": 1681 + }, + { + "epoch": 2.674085850556439, + "grad_norm": 1.03058745025465, + "learning_rate": 3.191325102115481e-05, + "loss": 0.1792, + "step": 1682 + }, + { + "epoch": 2.6756756756756754, + "grad_norm": 1.923291188689511, + "learning_rate": 3.19166747231308e-05, + "loss": 0.2635, + "step": 1683 + }, + { + "epoch": 2.6772655007949124, + "grad_norm": 1.6836533719738307, + "learning_rate": 3.19201011610197e-05, + "loss": 0.249, + "step": 1684 + }, + { + "epoch": 2.6788553259141494, + "grad_norm": 1.964818303051795, + "learning_rate": 3.1923530333661784e-05, + "loss": 0.1682, + "step": 1685 + }, + { + "epoch": 2.6804451510333864, + "grad_norm": 1.3778456952100422, + "learning_rate": 3.1926962239896404e-05, + "loss": 0.2093, + "step": 1686 + }, + { + "epoch": 2.6820349761526234, + "grad_norm": 1.27779326758411, + "learning_rate": 3.193039687856198e-05, + "loss": 0.1741, + "step": 1687 + }, + { + "epoch": 2.68362480127186, + "grad_norm": 1.4621100523109438, + "learning_rate": 3.1933834248496016e-05, + "loss": 0.1598, + "step": 1688 + }, + { + "epoch": 2.685214626391097, + "grad_norm": 0.8665263222298191, + "learning_rate": 3.193727434853508e-05, + "loss": 0.1512, + "step": 1689 + }, + { + "epoch": 2.686804451510334, + "grad_norm": 1.7393494446613103, + "learning_rate": 3.194071717751484e-05, + "loss": 0.2215, + "step": 1690 + }, + { + "epoch": 2.6883942766295705, + "grad_norm": 1.423530279081555, + "learning_rate": 3.1944162734270007e-05, + "loss": 0.1694, + "step": 1691 + }, + { + "epoch": 2.6899841017488075, + "grad_norm": 1.0113582197331754, + "learning_rate": 3.194761101763439e-05, + "loss": 0.1793, + "step": 1692 + }, + { + "epoch": 2.6915739268680445, + "grad_norm": 1.6972460013817485, + "learning_rate": 3.1951062026440854e-05, + "loss": 0.1699, + "step": 1693 + }, + { + "epoch": 2.6931637519872815, + "grad_norm": 1.4275489492514701, + "learning_rate": 3.195451575952138e-05, + "loss": 0.2191, + "step": 1694 + }, + { + "epoch": 2.6947535771065185, + "grad_norm": 1.122093489103576, + "learning_rate": 3.1957972215707e-05, + "loss": 0.2306, + "step": 1695 + }, + { + "epoch": 2.696343402225755, + "grad_norm": 0.7456333724225834, + "learning_rate": 3.196143139382783e-05, + "loss": 0.155, + "step": 1696 + }, + { + "epoch": 2.697933227344992, + "grad_norm": 1.477067741269703, + "learning_rate": 3.196489329271305e-05, + "loss": 0.1922, + "step": 1697 + }, + { + "epoch": 2.699523052464229, + "grad_norm": 1.355615214597007, + "learning_rate": 3.1968357911190936e-05, + "loss": 0.191, + "step": 1698 + }, + { + "epoch": 2.7011128775834656, + "grad_norm": 1.0408488873943331, + "learning_rate": 3.1971825248088855e-05, + "loss": 0.1966, + "step": 1699 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 1.13717420157594, + "learning_rate": 3.197529530223323e-05, + "loss": 0.1942, + "step": 1700 + }, + { + "epoch": 2.7042925278219396, + "grad_norm": 2.2414440722425377, + "learning_rate": 3.197876807244956e-05, + "loss": 0.2886, + "step": 1701 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 1.3631627310146235, + "learning_rate": 3.198224355756246e-05, + "loss": 0.2118, + "step": 1702 + }, + { + "epoch": 2.7074721780604136, + "grad_norm": 1.6438113179195133, + "learning_rate": 3.1985721756395596e-05, + "loss": 0.2801, + "step": 1703 + }, + { + "epoch": 2.70906200317965, + "grad_norm": 1.296013225608353, + "learning_rate": 3.198920266777171e-05, + "loss": 0.1591, + "step": 1704 + }, + { + "epoch": 2.710651828298887, + "grad_norm": 1.1902862599409112, + "learning_rate": 3.199268629051267e-05, + "loss": 0.2281, + "step": 1705 + }, + { + "epoch": 2.7122416534181237, + "grad_norm": 1.3033252061979184, + "learning_rate": 3.1996172623439363e-05, + "loss": 0.2097, + "step": 1706 + }, + { + "epoch": 2.7138314785373607, + "grad_norm": 1.3567354808550032, + "learning_rate": 3.1999661665371815e-05, + "loss": 0.1386, + "step": 1707 + }, + { + "epoch": 2.7154213036565977, + "grad_norm": 1.1980831364476576, + "learning_rate": 3.2003153415129096e-05, + "loss": 0.2669, + "step": 1708 + }, + { + "epoch": 2.7170111287758347, + "grad_norm": 0.8758886678648288, + "learning_rate": 3.2006647871529385e-05, + "loss": 0.1794, + "step": 1709 + }, + { + "epoch": 2.7186009538950717, + "grad_norm": 2.2770119920425924, + "learning_rate": 3.201014503338993e-05, + "loss": 0.1909, + "step": 1710 + }, + { + "epoch": 2.7201907790143083, + "grad_norm": 1.3578763903897242, + "learning_rate": 3.201364489952707e-05, + "loss": 0.2351, + "step": 1711 + }, + { + "epoch": 2.7217806041335453, + "grad_norm": 1.4609257473020034, + "learning_rate": 3.201714746875623e-05, + "loss": 0.2299, + "step": 1712 + }, + { + "epoch": 2.7233704292527823, + "grad_norm": 1.282090891499825, + "learning_rate": 3.202065273989192e-05, + "loss": 0.2211, + "step": 1713 + }, + { + "epoch": 2.724960254372019, + "grad_norm": 0.8559316511104884, + "learning_rate": 3.202416071174771e-05, + "loss": 0.1496, + "step": 1714 + }, + { + "epoch": 2.726550079491256, + "grad_norm": 32.53751570442245, + "learning_rate": 3.202767138313631e-05, + "loss": 23.3768, + "step": 1715 + }, + { + "epoch": 2.728139904610493, + "grad_norm": 1.3776538876684823, + "learning_rate": 3.203118475286947e-05, + "loss": 0.1701, + "step": 1716 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 1.187766910298497, + "learning_rate": 3.2034700819758046e-05, + "loss": 0.2013, + "step": 1717 + }, + { + "epoch": 2.731319554848967, + "grad_norm": 1.567732228678719, + "learning_rate": 3.203821958261196e-05, + "loss": 0.189, + "step": 1718 + }, + { + "epoch": 2.7329093799682034, + "grad_norm": 1.1703829067350333, + "learning_rate": 3.204174104024026e-05, + "loss": 0.1996, + "step": 1719 + }, + { + "epoch": 2.7344992050874404, + "grad_norm": 0.8792507982491055, + "learning_rate": 3.204526519145105e-05, + "loss": 0.1807, + "step": 1720 + }, + { + "epoch": 2.7360890302066774, + "grad_norm": 1.8268282056630911, + "learning_rate": 3.204879203505152e-05, + "loss": 0.1598, + "step": 1721 + }, + { + "epoch": 2.737678855325914, + "grad_norm": 1.6029457919985353, + "learning_rate": 3.205232156984799e-05, + "loss": 0.1829, + "step": 1722 + }, + { + "epoch": 2.739268680445151, + "grad_norm": 1.2319465015828268, + "learning_rate": 3.205585379464582e-05, + "loss": 0.1588, + "step": 1723 + }, + { + "epoch": 2.740858505564388, + "grad_norm": 1.7712858001605818, + "learning_rate": 3.2059388708249486e-05, + "loss": 0.2265, + "step": 1724 + }, + { + "epoch": 2.742448330683625, + "grad_norm": 0.8573729718573424, + "learning_rate": 3.206292630946255e-05, + "loss": 0.1771, + "step": 1725 + }, + { + "epoch": 2.744038155802862, + "grad_norm": 1.4809619002048446, + "learning_rate": 3.206646659708765e-05, + "loss": 0.1957, + "step": 1726 + }, + { + "epoch": 2.7456279809220985, + "grad_norm": 1.8338706312751636, + "learning_rate": 3.2070009569926546e-05, + "loss": 0.2097, + "step": 1727 + }, + { + "epoch": 2.7472178060413355, + "grad_norm": 1.039101427630297, + "learning_rate": 3.207355522678005e-05, + "loss": 0.1587, + "step": 1728 + }, + { + "epoch": 2.7488076311605725, + "grad_norm": 2.128916809261633, + "learning_rate": 3.207710356644809e-05, + "loss": 0.2538, + "step": 1729 + }, + { + "epoch": 2.750397456279809, + "grad_norm": 0.9104875019324283, + "learning_rate": 3.20806545877297e-05, + "loss": 0.1329, + "step": 1730 + }, + { + "epoch": 2.751987281399046, + "grad_norm": 1.2507427845750951, + "learning_rate": 3.208420828942297e-05, + "loss": 0.1729, + "step": 1731 + }, + { + "epoch": 2.753577106518283, + "grad_norm": 1.9635739107784789, + "learning_rate": 3.2087764670325106e-05, + "loss": 0.2143, + "step": 1732 + }, + { + "epoch": 2.75516693163752, + "grad_norm": 1.0835176712051497, + "learning_rate": 3.20913237292324e-05, + "loss": 0.2116, + "step": 1733 + }, + { + "epoch": 2.756756756756757, + "grad_norm": 2.5062558678885667, + "learning_rate": 3.2094885464940236e-05, + "loss": 0.2314, + "step": 1734 + }, + { + "epoch": 2.7583465818759936, + "grad_norm": 1.0641432499915056, + "learning_rate": 3.2098449876243093e-05, + "loss": 0.1471, + "step": 1735 + }, + { + "epoch": 2.7599364069952306, + "grad_norm": 1.27405110523491, + "learning_rate": 3.2102016961934575e-05, + "loss": 0.2091, + "step": 1736 + }, + { + "epoch": 2.7615262321144676, + "grad_norm": 2.477749790896029, + "learning_rate": 3.210558672080731e-05, + "loss": 0.1802, + "step": 1737 + }, + { + "epoch": 2.763116057233704, + "grad_norm": 0.7760285817555416, + "learning_rate": 3.21091591516531e-05, + "loss": 0.1575, + "step": 1738 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.9769955512861405, + "learning_rate": 3.211273425326278e-05, + "loss": 0.1832, + "step": 1739 + }, + { + "epoch": 2.766295707472178, + "grad_norm": 1.2731070411363863, + "learning_rate": 3.2116312024426325e-05, + "loss": 0.1565, + "step": 1740 + }, + { + "epoch": 2.767885532591415, + "grad_norm": 1.175387201377257, + "learning_rate": 3.211989246393278e-05, + "loss": 0.174, + "step": 1741 + }, + { + "epoch": 2.7694753577106517, + "grad_norm": 2.2529024620001, + "learning_rate": 3.21234755705703e-05, + "loss": 0.1835, + "step": 1742 + }, + { + "epoch": 2.7710651828298887, + "grad_norm": 1.176054148207597, + "learning_rate": 3.212706134312614e-05, + "loss": 0.3043, + "step": 1743 + }, + { + "epoch": 2.7726550079491257, + "grad_norm": 2.104039901756064, + "learning_rate": 3.213064978038662e-05, + "loss": 0.2276, + "step": 1744 + }, + { + "epoch": 2.7742448330683622, + "grad_norm": 0.7709259002556587, + "learning_rate": 3.2134240881137215e-05, + "loss": 0.1119, + "step": 1745 + }, + { + "epoch": 2.7758346581875992, + "grad_norm": 1.5939038386254845, + "learning_rate": 3.213783464416246e-05, + "loss": 0.2864, + "step": 1746 + }, + { + "epoch": 2.7774244833068362, + "grad_norm": 14.80254176290554, + "learning_rate": 3.2141431068245975e-05, + "loss": 12.0569, + "step": 1747 + }, + { + "epoch": 2.779014308426073, + "grad_norm": 1.0609416169674386, + "learning_rate": 3.214503015217053e-05, + "loss": 0.1465, + "step": 1748 + }, + { + "epoch": 2.78060413354531, + "grad_norm": 1.5027287422318525, + "learning_rate": 3.2148631894717945e-05, + "loss": 0.1965, + "step": 1749 + }, + { + "epoch": 2.7821939586645468, + "grad_norm": 0.7307606870505935, + "learning_rate": 3.215223629466917e-05, + "loss": 0.1381, + "step": 1750 + }, + { + "epoch": 2.7837837837837838, + "grad_norm": 1.714743212025667, + "learning_rate": 3.2155843350804244e-05, + "loss": 0.2377, + "step": 1751 + }, + { + "epoch": 2.7853736089030208, + "grad_norm": 34.97373885193613, + "learning_rate": 3.2159453061902314e-05, + "loss": 22.6905, + "step": 1752 + }, + { + "epoch": 2.7869634340222573, + "grad_norm": 1.7772171006352502, + "learning_rate": 3.2163065426741604e-05, + "loss": 0.1993, + "step": 1753 + }, + { + "epoch": 2.7885532591414943, + "grad_norm": 1.9736706194550975, + "learning_rate": 3.216668044409948e-05, + "loss": 0.1461, + "step": 1754 + }, + { + "epoch": 2.7901430842607313, + "grad_norm": 21.846105885135277, + "learning_rate": 3.217029811275238e-05, + "loss": 11.3971, + "step": 1755 + }, + { + "epoch": 2.7917329093799683, + "grad_norm": 3.6866855585417335, + "learning_rate": 3.217391843147587e-05, + "loss": 0.2437, + "step": 1756 + }, + { + "epoch": 2.7933227344992053, + "grad_norm": 1.8147222778311776, + "learning_rate": 3.2177541399044574e-05, + "loss": 0.198, + "step": 1757 + }, + { + "epoch": 2.794912559618442, + "grad_norm": 2.335280478843124, + "learning_rate": 3.218116701423227e-05, + "loss": 0.155, + "step": 1758 + }, + { + "epoch": 2.796502384737679, + "grad_norm": 2.1947574636326537, + "learning_rate": 3.218479527581182e-05, + "loss": 0.1701, + "step": 1759 + }, + { + "epoch": 2.798092209856916, + "grad_norm": 1.1251785657993423, + "learning_rate": 3.2188426182555166e-05, + "loss": 0.1652, + "step": 1760 + }, + { + "epoch": 2.7996820349761524, + "grad_norm": 2.699847610986689, + "learning_rate": 3.2192059733233414e-05, + "loss": 0.2165, + "step": 1761 + }, + { + "epoch": 2.8012718600953894, + "grad_norm": 4.306110244044309, + "learning_rate": 3.21956959266167e-05, + "loss": 0.2491, + "step": 1762 + }, + { + "epoch": 2.8028616852146264, + "grad_norm": 1.9944981895021419, + "learning_rate": 3.2199334761474334e-05, + "loss": 0.146, + "step": 1763 + }, + { + "epoch": 2.8044515103338634, + "grad_norm": 3.53157054045948, + "learning_rate": 3.220297623657469e-05, + "loss": 0.3417, + "step": 1764 + }, + { + "epoch": 2.8060413354531004, + "grad_norm": 1.6904104701396765, + "learning_rate": 3.220662035068526e-05, + "loss": 0.1897, + "step": 1765 + }, + { + "epoch": 2.807631160572337, + "grad_norm": 2.111251656890466, + "learning_rate": 3.221026710257264e-05, + "loss": 0.2737, + "step": 1766 + }, + { + "epoch": 2.809220985691574, + "grad_norm": 1.4197709979435877, + "learning_rate": 3.221391649100255e-05, + "loss": 0.1191, + "step": 1767 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 2.1680567222244953, + "learning_rate": 3.2217568514739795e-05, + "loss": 0.1694, + "step": 1768 + }, + { + "epoch": 2.8124006359300475, + "grad_norm": 3.7832067252603165, + "learning_rate": 3.222122317254829e-05, + "loss": 0.1965, + "step": 1769 + }, + { + "epoch": 2.8139904610492845, + "grad_norm": 2.000114122655783, + "learning_rate": 3.222488046319107e-05, + "loss": 0.2326, + "step": 1770 + }, + { + "epoch": 2.8155802861685215, + "grad_norm": 1.933208812867363, + "learning_rate": 3.222854038543029e-05, + "loss": 0.2298, + "step": 1771 + }, + { + "epoch": 2.8171701112877585, + "grad_norm": 0.6352925591220043, + "learning_rate": 3.2232202938027174e-05, + "loss": 0.1778, + "step": 1772 + }, + { + "epoch": 2.818759936406995, + "grad_norm": 2.598852653632586, + "learning_rate": 3.223586811974211e-05, + "loss": 0.3305, + "step": 1773 + }, + { + "epoch": 2.820349761526232, + "grad_norm": 2.1144912865314804, + "learning_rate": 3.2239535929334535e-05, + "loss": 0.1947, + "step": 1774 + }, + { + "epoch": 2.821939586645469, + "grad_norm": 2.4545852865496, + "learning_rate": 3.224320636556305e-05, + "loss": 0.1631, + "step": 1775 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 1.7121594275242755, + "learning_rate": 3.2246879427185346e-05, + "loss": 0.2234, + "step": 1776 + }, + { + "epoch": 2.8251192368839426, + "grad_norm": 2.3896269986296375, + "learning_rate": 3.225055511295821e-05, + "loss": 0.2006, + "step": 1777 + }, + { + "epoch": 2.8267090620031796, + "grad_norm": 1.8310456788085938, + "learning_rate": 3.2254233421637555e-05, + "loss": 0.1796, + "step": 1778 + }, + { + "epoch": 2.8282988871224166, + "grad_norm": 1.7401567196392833, + "learning_rate": 3.225791435197842e-05, + "loss": 0.1902, + "step": 1779 + }, + { + "epoch": 2.8298887122416536, + "grad_norm": 2.4531004194601413, + "learning_rate": 3.226159790273493e-05, + "loss": 0.1912, + "step": 1780 + }, + { + "epoch": 2.83147853736089, + "grad_norm": 2.832812464289055, + "learning_rate": 3.2265284072660364e-05, + "loss": 0.2752, + "step": 1781 + }, + { + "epoch": 2.833068362480127, + "grad_norm": 1.4158067284930005, + "learning_rate": 3.226897286050705e-05, + "loss": 0.1989, + "step": 1782 + }, + { + "epoch": 2.834658187599364, + "grad_norm": 0.9594148311195276, + "learning_rate": 3.2272664265026494e-05, + "loss": 0.0984, + "step": 1783 + }, + { + "epoch": 2.8362480127186007, + "grad_norm": 2.3192094925385525, + "learning_rate": 3.2276358284969266e-05, + "loss": 0.2225, + "step": 1784 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 1.234217590285289, + "learning_rate": 3.22800549190851e-05, + "loss": 0.1561, + "step": 1785 + }, + { + "epoch": 2.8394276629570747, + "grad_norm": 1.8018200270835174, + "learning_rate": 3.2283754166122804e-05, + "loss": 0.1796, + "step": 1786 + }, + { + "epoch": 2.8410174880763117, + "grad_norm": 1.8056419445386878, + "learning_rate": 3.228745602483032e-05, + "loss": 0.2091, + "step": 1787 + }, + { + "epoch": 2.8426073131955487, + "grad_norm": 1.9142642816844597, + "learning_rate": 3.2291160493954694e-05, + "loss": 0.1931, + "step": 1788 + }, + { + "epoch": 2.8441971383147853, + "grad_norm": 1.1120128883972513, + "learning_rate": 3.229486757224211e-05, + "loss": 0.13, + "step": 1789 + }, + { + "epoch": 2.8457869634340223, + "grad_norm": 1.2667038640437513, + "learning_rate": 3.229857725843785e-05, + "loss": 0.1931, + "step": 1790 + }, + { + "epoch": 2.8473767885532593, + "grad_norm": 3.200711906687901, + "learning_rate": 3.230228955128632e-05, + "loss": 0.1997, + "step": 1791 + }, + { + "epoch": 2.848966613672496, + "grad_norm": 1.6766016177150809, + "learning_rate": 3.230600444953105e-05, + "loss": 0.1777, + "step": 1792 + }, + { + "epoch": 2.850556438791733, + "grad_norm": 1.228569454723208, + "learning_rate": 3.230972195191467e-05, + "loss": 0.1608, + "step": 1793 + }, + { + "epoch": 2.85214626391097, + "grad_norm": 2.0462899069052356, + "learning_rate": 3.231344205717895e-05, + "loss": 0.2354, + "step": 1794 + }, + { + "epoch": 2.853736089030207, + "grad_norm": 8.921863436997553, + "learning_rate": 3.2317164764064773e-05, + "loss": 6.8511, + "step": 1795 + }, + { + "epoch": 2.855325914149444, + "grad_norm": 2.9048163074029616, + "learning_rate": 3.232089007131212e-05, + "loss": 0.1997, + "step": 1796 + }, + { + "epoch": 2.8569157392686804, + "grad_norm": 1.1714846662154808, + "learning_rate": 3.232461797766011e-05, + "loss": 0.1418, + "step": 1797 + }, + { + "epoch": 2.8585055643879174, + "grad_norm": 1.4078822473512569, + "learning_rate": 3.2328348481847e-05, + "loss": 0.2506, + "step": 1798 + }, + { + "epoch": 2.8600953895071544, + "grad_norm": 1.8884875908647951, + "learning_rate": 3.233208158261014e-05, + "loss": 0.184, + "step": 1799 + }, + { + "epoch": 2.861685214626391, + "grad_norm": 1.6350291491581712, + "learning_rate": 3.233581727868601e-05, + "loss": 0.1682, + "step": 1800 + }, + { + "epoch": 2.863275039745628, + "grad_norm": 1.3381182017079336, + "learning_rate": 3.2339555568810225e-05, + "loss": 0.156, + "step": 1801 + }, + { + "epoch": 2.864864864864865, + "grad_norm": 0.9944119796506846, + "learning_rate": 3.234329645171748e-05, + "loss": 0.1731, + "step": 1802 + }, + { + "epoch": 2.866454689984102, + "grad_norm": 1.6876037753574757, + "learning_rate": 3.234703992614165e-05, + "loss": 0.2124, + "step": 1803 + }, + { + "epoch": 2.868044515103339, + "grad_norm": 1.7791816064524173, + "learning_rate": 3.235078599081568e-05, + "loss": 0.1865, + "step": 1804 + }, + { + "epoch": 2.8696343402225755, + "grad_norm": 1.606348302069339, + "learning_rate": 3.235453464447169e-05, + "loss": 0.158, + "step": 1805 + }, + { + "epoch": 2.8712241653418125, + "grad_norm": 1.7817509217750394, + "learning_rate": 3.235828588584088e-05, + "loss": 0.1462, + "step": 1806 + }, + { + "epoch": 2.872813990461049, + "grad_norm": 1.1831815548218376, + "learning_rate": 3.2362039713653576e-05, + "loss": 0.1494, + "step": 1807 + }, + { + "epoch": 2.874403815580286, + "grad_norm": 1.3333737045906051, + "learning_rate": 3.236579612663928e-05, + "loss": 0.19, + "step": 1808 + }, + { + "epoch": 2.875993640699523, + "grad_norm": 33.6998921425052, + "learning_rate": 3.236955512352655e-05, + "loss": 24.7118, + "step": 1809 + }, + { + "epoch": 2.87758346581876, + "grad_norm": 1.6581471959725742, + "learning_rate": 3.237331670304312e-05, + "loss": 0.1311, + "step": 1810 + }, + { + "epoch": 2.879173290937997, + "grad_norm": 2.0458793441517122, + "learning_rate": 3.2377080863915816e-05, + "loss": 0.1345, + "step": 1811 + }, + { + "epoch": 2.8807631160572336, + "grad_norm": 1.8223951957318947, + "learning_rate": 3.238084760487063e-05, + "loss": 0.208, + "step": 1812 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 3.002320276478578, + "learning_rate": 3.2384616924632636e-05, + "loss": 0.3845, + "step": 1813 + }, + { + "epoch": 2.8839427662957076, + "grad_norm": 1.3747817159913949, + "learning_rate": 3.238838882192606e-05, + "loss": 0.1457, + "step": 1814 + }, + { + "epoch": 2.885532591414944, + "grad_norm": 1.298999626216307, + "learning_rate": 3.2392163295474254e-05, + "loss": 0.1486, + "step": 1815 + }, + { + "epoch": 2.887122416534181, + "grad_norm": 34.44194534170242, + "learning_rate": 3.239594034399969e-05, + "loss": 22.3721, + "step": 1816 + }, + { + "epoch": 2.888712241653418, + "grad_norm": 2.346599093155901, + "learning_rate": 3.239971996622398e-05, + "loss": 0.2209, + "step": 1817 + }, + { + "epoch": 2.890302066772655, + "grad_norm": 2.8246101554821577, + "learning_rate": 3.2403502160867855e-05, + "loss": 0.1743, + "step": 1818 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 2.1762712110482787, + "learning_rate": 3.24072869266512e-05, + "loss": 0.1789, + "step": 1819 + }, + { + "epoch": 2.8934817170111287, + "grad_norm": 1.1837876290290728, + "learning_rate": 3.241107426229296e-05, + "loss": 0.1827, + "step": 1820 + }, + { + "epoch": 2.8950715421303657, + "grad_norm": 2.092027378061255, + "learning_rate": 3.241486416651131e-05, + "loss": 0.2095, + "step": 1821 + }, + { + "epoch": 2.8966613672496027, + "grad_norm": 2.428722664628801, + "learning_rate": 3.2418656638023476e-05, + "loss": 0.2139, + "step": 1822 + }, + { + "epoch": 2.898251192368839, + "grad_norm": 2.312577280796555, + "learning_rate": 3.242245167554586e-05, + "loss": 0.1966, + "step": 1823 + }, + { + "epoch": 2.899841017488076, + "grad_norm": 1.5064922527069833, + "learning_rate": 3.2426249277793954e-05, + "loss": 0.1918, + "step": 1824 + }, + { + "epoch": 2.901430842607313, + "grad_norm": 2.1226484016189935, + "learning_rate": 3.243004944348244e-05, + "loss": 0.1931, + "step": 1825 + }, + { + "epoch": 2.90302066772655, + "grad_norm": 1.855599530271734, + "learning_rate": 3.243385217132507e-05, + "loss": 0.1476, + "step": 1826 + }, + { + "epoch": 2.904610492845787, + "grad_norm": 3.5829060471023113, + "learning_rate": 3.243765746003478e-05, + "loss": 0.2053, + "step": 1827 + }, + { + "epoch": 2.9062003179650238, + "grad_norm": 2.505957667632483, + "learning_rate": 3.244146530832361e-05, + "loss": 0.1501, + "step": 1828 + }, + { + "epoch": 2.9077901430842608, + "grad_norm": 2.1146517595911694, + "learning_rate": 3.2445275714902723e-05, + "loss": 0.1818, + "step": 1829 + }, + { + "epoch": 2.9093799682034978, + "grad_norm": 11.12380780935816, + "learning_rate": 3.2449088678482465e-05, + "loss": 0.4669, + "step": 1830 + }, + { + "epoch": 2.9109697933227343, + "grad_norm": 26.12362543291316, + "learning_rate": 3.245290419777228e-05, + "loss": 15.4261, + "step": 1831 + }, + { + "epoch": 2.9125596184419713, + "grad_norm": 3.780108650183511, + "learning_rate": 3.245672227148074e-05, + "loss": 0.1549, + "step": 1832 + }, + { + "epoch": 2.9141494435612083, + "grad_norm": 36.54774952299517, + "learning_rate": 3.246054289831557e-05, + "loss": 21.6537, + "step": 1833 + }, + { + "epoch": 2.9157392686804453, + "grad_norm": 5.580079307167936, + "learning_rate": 3.246436607698361e-05, + "loss": 0.3568, + "step": 1834 + }, + { + "epoch": 2.9173290937996823, + "grad_norm": 2.4919037854233874, + "learning_rate": 3.24681918061909e-05, + "loss": 0.2161, + "step": 1835 + }, + { + "epoch": 2.918918918918919, + "grad_norm": 3.1653913542509926, + "learning_rate": 3.2472020084642517e-05, + "loss": 0.1501, + "step": 1836 + }, + { + "epoch": 2.920508744038156, + "grad_norm": 1.7680860458764274, + "learning_rate": 3.247585091104276e-05, + "loss": 0.2124, + "step": 1837 + }, + { + "epoch": 2.9220985691573924, + "grad_norm": 2.1511797601739184, + "learning_rate": 3.2479684284095016e-05, + "loss": 0.1816, + "step": 1838 + }, + { + "epoch": 2.9236883942766294, + "grad_norm": 3.940517095945787, + "learning_rate": 3.248352020250184e-05, + "loss": 0.207, + "step": 1839 + }, + { + "epoch": 2.9252782193958664, + "grad_norm": 2.2781744442566954, + "learning_rate": 3.248735866496489e-05, + "loss": 0.19, + "step": 1840 + }, + { + "epoch": 2.9268680445151034, + "grad_norm": 2.4352795233961544, + "learning_rate": 3.249119967018501e-05, + "loss": 0.2036, + "step": 1841 + }, + { + "epoch": 2.9284578696343404, + "grad_norm": 1.590662277056064, + "learning_rate": 3.249504321686215e-05, + "loss": 0.16, + "step": 1842 + }, + { + "epoch": 2.930047694753577, + "grad_norm": 4.0913943175004714, + "learning_rate": 3.249888930369541e-05, + "loss": 0.2709, + "step": 1843 + }, + { + "epoch": 2.931637519872814, + "grad_norm": 2.932441191850693, + "learning_rate": 3.250273792938302e-05, + "loss": 0.28, + "step": 1844 + }, + { + "epoch": 2.933227344992051, + "grad_norm": 2.1297286474151433, + "learning_rate": 3.250658909262237e-05, + "loss": 0.2142, + "step": 1845 + }, + { + "epoch": 2.9348171701112875, + "grad_norm": 3.326055317374116, + "learning_rate": 3.2510442792109984e-05, + "loss": 0.2444, + "step": 1846 + }, + { + "epoch": 2.9364069952305245, + "grad_norm": 1.6009156602104764, + "learning_rate": 3.2514299026541505e-05, + "loss": 0.1215, + "step": 1847 + }, + { + "epoch": 2.9379968203497615, + "grad_norm": 2.966842325325179, + "learning_rate": 3.251815779461175e-05, + "loss": 0.1817, + "step": 1848 + }, + { + "epoch": 2.9395866454689985, + "grad_norm": 2.262328560679966, + "learning_rate": 3.252201909501468e-05, + "loss": 0.2007, + "step": 1849 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 2.4362391296064163, + "learning_rate": 3.252588292644337e-05, + "loss": 0.238, + "step": 1850 + }, + { + "epoch": 2.942766295707472, + "grad_norm": 5.024061021007884, + "learning_rate": 3.2529749287590036e-05, + "loss": 0.3082, + "step": 1851 + }, + { + "epoch": 2.944356120826709, + "grad_norm": 3.8806305999463158, + "learning_rate": 3.2533618177146084e-05, + "loss": 0.2192, + "step": 1852 + }, + { + "epoch": 2.945945945945946, + "grad_norm": 5.6499319890107484, + "learning_rate": 3.253748959380203e-05, + "loss": 0.3267, + "step": 1853 + }, + { + "epoch": 2.9475357710651826, + "grad_norm": 1.7469900567131358, + "learning_rate": 3.254136353624751e-05, + "loss": 0.2042, + "step": 1854 + }, + { + "epoch": 2.9491255961844196, + "grad_norm": 2.2586329249031385, + "learning_rate": 3.2545240003171384e-05, + "loss": 0.2078, + "step": 1855 + }, + { + "epoch": 2.9507154213036566, + "grad_norm": 2.993807115656151, + "learning_rate": 3.2549118993261557e-05, + "loss": 0.212, + "step": 1856 + }, + { + "epoch": 2.9523052464228936, + "grad_norm": 3.5747388179410136, + "learning_rate": 3.2553000505205176e-05, + "loss": 0.295, + "step": 1857 + }, + { + "epoch": 2.9538950715421306, + "grad_norm": 3.0406701446229776, + "learning_rate": 3.255688453768846e-05, + "loss": 0.241, + "step": 1858 + }, + { + "epoch": 2.955484896661367, + "grad_norm": 2.2974389292643727, + "learning_rate": 3.2560771089396815e-05, + "loss": 0.2194, + "step": 1859 + }, + { + "epoch": 2.957074721780604, + "grad_norm": 2.173118821995272, + "learning_rate": 3.256466015901478e-05, + "loss": 0.1373, + "step": 1860 + }, + { + "epoch": 2.958664546899841, + "grad_norm": 2.993814847201125, + "learning_rate": 3.2568551745226056e-05, + "loss": 0.1772, + "step": 1861 + }, + { + "epoch": 2.9602543720190777, + "grad_norm": 1.6363405578787704, + "learning_rate": 3.257244584671348e-05, + "loss": 0.2132, + "step": 1862 + }, + { + "epoch": 2.9618441971383147, + "grad_norm": 1.0565829447716102, + "learning_rate": 3.257634246215903e-05, + "loss": 0.1624, + "step": 1863 + }, + { + "epoch": 2.9634340222575517, + "grad_norm": 1.7515221152713383, + "learning_rate": 3.258024159024383e-05, + "loss": 0.1915, + "step": 1864 + }, + { + "epoch": 2.9650238473767887, + "grad_norm": 77.3194986407281, + "learning_rate": 3.2584143229648206e-05, + "loss": 25.2183, + "step": 1865 + }, + { + "epoch": 2.9666136724960257, + "grad_norm": 2.2122275501316495, + "learning_rate": 3.258804737905156e-05, + "loss": 0.2118, + "step": 1866 + }, + { + "epoch": 2.9682034976152623, + "grad_norm": 1.4136148223961897, + "learning_rate": 3.25919540371325e-05, + "loss": 0.2109, + "step": 1867 + }, + { + "epoch": 2.9697933227344993, + "grad_norm": 2.988176151334641, + "learning_rate": 3.2595863202568745e-05, + "loss": 0.242, + "step": 1868 + }, + { + "epoch": 2.9713831478537363, + "grad_norm": 2.0233876787700744, + "learning_rate": 3.25997748740372e-05, + "loss": 0.2042, + "step": 1869 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 1.0966639433763725, + "learning_rate": 3.26036890502139e-05, + "loss": 0.1939, + "step": 1870 + }, + { + "epoch": 2.97456279809221, + "grad_norm": 2.1742903420702597, + "learning_rate": 3.2607605729774046e-05, + "loss": 0.1475, + "step": 1871 + }, + { + "epoch": 2.976152623211447, + "grad_norm": 1.5714201251870261, + "learning_rate": 3.2611524911391964e-05, + "loss": 0.1983, + "step": 1872 + }, + { + "epoch": 2.977742448330684, + "grad_norm": 1.281674395154899, + "learning_rate": 3.261544659374116e-05, + "loss": 0.1106, + "step": 1873 + }, + { + "epoch": 2.9793322734499204, + "grad_norm": 2.273856068176308, + "learning_rate": 3.2619370775494306e-05, + "loss": 0.1639, + "step": 1874 + }, + { + "epoch": 2.9809220985691574, + "grad_norm": 1.3458060703066936, + "learning_rate": 3.2623297455323186e-05, + "loss": 0.1455, + "step": 1875 + }, + { + "epoch": 2.9825119236883944, + "grad_norm": 2.459342440348716, + "learning_rate": 3.2627226631898765e-05, + "loss": 0.195, + "step": 1876 + }, + { + "epoch": 2.984101748807631, + "grad_norm": 1.679834309306734, + "learning_rate": 3.263115830389117e-05, + "loss": 0.1801, + "step": 1877 + }, + { + "epoch": 2.985691573926868, + "grad_norm": 2.3654151029644366, + "learning_rate": 3.263509246996965e-05, + "loss": 0.1819, + "step": 1878 + }, + { + "epoch": 2.987281399046105, + "grad_norm": 1.4450223452783846, + "learning_rate": 3.263902912880266e-05, + "loss": 0.1797, + "step": 1879 + }, + { + "epoch": 2.988871224165342, + "grad_norm": 1.2896168223404074, + "learning_rate": 3.264296827905776e-05, + "loss": 0.1561, + "step": 1880 + }, + { + "epoch": 2.990461049284579, + "grad_norm": 1.1028702388559217, + "learning_rate": 3.2646909919401706e-05, + "loss": 0.1731, + "step": 1881 + }, + { + "epoch": 2.9920508744038155, + "grad_norm": 3.222497754783046, + "learning_rate": 3.2650854048500405e-05, + "loss": 0.2544, + "step": 1882 + }, + { + "epoch": 2.9936406995230525, + "grad_norm": 1.314969257056666, + "learning_rate": 3.265480066501889e-05, + "loss": 0.1414, + "step": 1883 + }, + { + "epoch": 2.9952305246422894, + "grad_norm": 1.2013236612060652, + "learning_rate": 3.265874976762138e-05, + "loss": 0.2029, + "step": 1884 + }, + { + "epoch": 2.996820349761526, + "grad_norm": 1.218399788036494, + "learning_rate": 3.266270135497123e-05, + "loss": 0.1361, + "step": 1885 + }, + { + "epoch": 2.998410174880763, + "grad_norm": 1.2500087718695136, + "learning_rate": 3.266665542573101e-05, + "loss": 0.1551, + "step": 1886 + }, + { + "epoch": 3.0, + "grad_norm": 0.9800135750224278, + "learning_rate": 3.267061197856239e-05, + "loss": 0.1434, + "step": 1887 + }, + { + "epoch": 3.001589825119237, + "grad_norm": 1.1281258284629547, + "learning_rate": 3.2674571012126206e-05, + "loss": 0.2176, + "step": 1888 + }, + { + "epoch": 3.0031796502384736, + "grad_norm": 0.8377961012707515, + "learning_rate": 3.26785325250825e-05, + "loss": 0.2095, + "step": 1889 + }, + { + "epoch": 3.0047694753577106, + "grad_norm": 1.3870899681171056, + "learning_rate": 3.268249651609041e-05, + "loss": 0.2162, + "step": 1890 + }, + { + "epoch": 3.0063593004769475, + "grad_norm": 1.0624178646401679, + "learning_rate": 3.26864629838083e-05, + "loss": 0.1484, + "step": 1891 + }, + { + "epoch": 3.0079491255961845, + "grad_norm": 1.0696249564716236, + "learning_rate": 3.269043192689364e-05, + "loss": 0.1469, + "step": 1892 + }, + { + "epoch": 3.009538950715421, + "grad_norm": 0.7928740121309226, + "learning_rate": 3.269440334400309e-05, + "loss": 0.2311, + "step": 1893 + }, + { + "epoch": 3.011128775834658, + "grad_norm": 1.8902743625030656, + "learning_rate": 3.269837723379248e-05, + "loss": 0.1727, + "step": 1894 + }, + { + "epoch": 3.012718600953895, + "grad_norm": 0.9340447720099906, + "learning_rate": 3.270235359491678e-05, + "loss": 0.1757, + "step": 1895 + }, + { + "epoch": 3.014308426073132, + "grad_norm": 1.5324306122089544, + "learning_rate": 3.270633242603015e-05, + "loss": 0.1661, + "step": 1896 + }, + { + "epoch": 3.0158982511923687, + "grad_norm": 1.202390005942924, + "learning_rate": 3.2710313725785886e-05, + "loss": 0.2412, + "step": 1897 + }, + { + "epoch": 3.0174880763116056, + "grad_norm": 1.2100063713448455, + "learning_rate": 3.271429749283647e-05, + "loss": 0.1556, + "step": 1898 + }, + { + "epoch": 3.0190779014308426, + "grad_norm": 2.1708437656270934, + "learning_rate": 3.271828372583354e-05, + "loss": 0.1829, + "step": 1899 + }, + { + "epoch": 3.0206677265500796, + "grad_norm": 0.7740077062503045, + "learning_rate": 3.272227242342789e-05, + "loss": 0.1491, + "step": 1900 + }, + { + "epoch": 3.022257551669316, + "grad_norm": 2.279173217250167, + "learning_rate": 3.2726263584269514e-05, + "loss": 0.2551, + "step": 1901 + }, + { + "epoch": 3.023847376788553, + "grad_norm": 2.3654667747412863, + "learning_rate": 3.2730257207007523e-05, + "loss": 0.2887, + "step": 1902 + }, + { + "epoch": 3.02543720190779, + "grad_norm": 2.313704747315006, + "learning_rate": 3.273425329029024e-05, + "loss": 0.1808, + "step": 1903 + }, + { + "epoch": 3.027027027027027, + "grad_norm": 1.6213307420266159, + "learning_rate": 3.273825183276513e-05, + "loss": 0.1549, + "step": 1904 + }, + { + "epoch": 3.0286168521462637, + "grad_norm": 2.011158551051491, + "learning_rate": 3.274225283307881e-05, + "loss": 0.2066, + "step": 1905 + }, + { + "epoch": 3.0302066772655007, + "grad_norm": 2.027395998585125, + "learning_rate": 3.2746256289877126e-05, + "loss": 0.2191, + "step": 1906 + }, + { + "epoch": 3.0317965023847377, + "grad_norm": 1.692920427575308, + "learning_rate": 3.275026220180502e-05, + "loss": 0.1531, + "step": 1907 + }, + { + "epoch": 3.0333863275039747, + "grad_norm": 2.029454715546095, + "learning_rate": 3.275427056750665e-05, + "loss": 0.2158, + "step": 1908 + }, + { + "epoch": 3.0349761526232113, + "grad_norm": 1.7330464379784225, + "learning_rate": 3.2758281385625325e-05, + "loss": 0.188, + "step": 1909 + }, + { + "epoch": 3.0365659777424483, + "grad_norm": 1.893798129977745, + "learning_rate": 3.2762294654803536e-05, + "loss": 0.2665, + "step": 1910 + }, + { + "epoch": 3.0381558028616853, + "grad_norm": 1.0591379262273728, + "learning_rate": 3.2766310373682915e-05, + "loss": 0.1771, + "step": 1911 + }, + { + "epoch": 3.0397456279809223, + "grad_norm": 1.0126405391549058, + "learning_rate": 3.277032854090433e-05, + "loss": 0.186, + "step": 1912 + }, + { + "epoch": 3.041335453100159, + "grad_norm": 1.1712462392496072, + "learning_rate": 3.277434915510772e-05, + "loss": 0.1999, + "step": 1913 + }, + { + "epoch": 3.042925278219396, + "grad_norm": 0.8382379370089657, + "learning_rate": 3.277837221493229e-05, + "loss": 0.1686, + "step": 1914 + }, + { + "epoch": 3.044515103338633, + "grad_norm": 1.2011902536329309, + "learning_rate": 3.278239771901638e-05, + "loss": 0.2315, + "step": 1915 + }, + { + "epoch": 3.04610492845787, + "grad_norm": 2.661871345898783, + "learning_rate": 3.278642566599749e-05, + "loss": 0.1994, + "step": 1916 + }, + { + "epoch": 3.0476947535771064, + "grad_norm": 1.324634890354758, + "learning_rate": 3.27904560545123e-05, + "loss": 0.1362, + "step": 1917 + }, + { + "epoch": 3.0492845786963434, + "grad_norm": 1.842924550395593, + "learning_rate": 3.27944888831967e-05, + "loss": 0.1642, + "step": 1918 + }, + { + "epoch": 3.0508744038155804, + "grad_norm": 144.16157827376344, + "learning_rate": 3.279852415068569e-05, + "loss": 29.0972, + "step": 1919 + }, + { + "epoch": 3.0524642289348174, + "grad_norm": 1.9624553295275307, + "learning_rate": 3.280256185561349e-05, + "loss": 0.1752, + "step": 1920 + }, + { + "epoch": 3.054054054054054, + "grad_norm": 1.3747680421603894, + "learning_rate": 3.280660199661349e-05, + "loss": 0.183, + "step": 1921 + }, + { + "epoch": 3.055643879173291, + "grad_norm": 2.4370877991044524, + "learning_rate": 3.2810644572318235e-05, + "loss": 0.2116, + "step": 1922 + }, + { + "epoch": 3.057233704292528, + "grad_norm": 2.2911024601773864, + "learning_rate": 3.281468958135948e-05, + "loss": 0.1644, + "step": 1923 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 1.7023827045778943, + "learning_rate": 3.281873702236811e-05, + "loss": 0.153, + "step": 1924 + }, + { + "epoch": 3.0604133545310015, + "grad_norm": 2.209047208715941, + "learning_rate": 3.282278689397423e-05, + "loss": 0.2076, + "step": 1925 + }, + { + "epoch": 3.0620031796502385, + "grad_norm": 2.3500151430837874, + "learning_rate": 3.282683919480711e-05, + "loss": 0.1726, + "step": 1926 + }, + { + "epoch": 3.0635930047694755, + "grad_norm": 2.0006679935322467, + "learning_rate": 3.2830893923495166e-05, + "loss": 0.1594, + "step": 1927 + }, + { + "epoch": 3.065182829888712, + "grad_norm": 1.5272442036748977, + "learning_rate": 3.2834951078666056e-05, + "loss": 0.168, + "step": 1928 + }, + { + "epoch": 3.066772655007949, + "grad_norm": 0.9167213066631827, + "learning_rate": 3.283901065894655e-05, + "loss": 0.1116, + "step": 1929 + }, + { + "epoch": 3.068362480127186, + "grad_norm": 39.3013823353338, + "learning_rate": 3.2843072662962646e-05, + "loss": 25.4797, + "step": 1930 + }, + { + "epoch": 3.069952305246423, + "grad_norm": 1.3991004367595645, + "learning_rate": 3.284713708933948e-05, + "loss": 0.2264, + "step": 1931 + }, + { + "epoch": 3.0715421303656596, + "grad_norm": 1.6586053197123771, + "learning_rate": 3.285120393670142e-05, + "loss": 0.156, + "step": 1932 + }, + { + "epoch": 3.0731319554848966, + "grad_norm": 1.195665037480131, + "learning_rate": 3.285527320367196e-05, + "loss": 0.1298, + "step": 1933 + }, + { + "epoch": 3.0747217806041336, + "grad_norm": 1.1040711858605634, + "learning_rate": 3.285934488887382e-05, + "loss": 0.1526, + "step": 1934 + }, + { + "epoch": 3.0763116057233706, + "grad_norm": 1.782315852979005, + "learning_rate": 3.286341899092887e-05, + "loss": 0.1779, + "step": 1935 + }, + { + "epoch": 3.077901430842607, + "grad_norm": 1.6819257787270387, + "learning_rate": 3.286749550845818e-05, + "loss": 0.2027, + "step": 1936 + }, + { + "epoch": 3.079491255961844, + "grad_norm": 1.5149116774749574, + "learning_rate": 3.287157444008199e-05, + "loss": 0.176, + "step": 1937 + }, + { + "epoch": 3.081081081081081, + "grad_norm": 1.4941675057717916, + "learning_rate": 3.287565578441974e-05, + "loss": 0.1745, + "step": 1938 + }, + { + "epoch": 3.082670906200318, + "grad_norm": 1.0002696983956485, + "learning_rate": 3.287973954009003e-05, + "loss": 0.1972, + "step": 1939 + }, + { + "epoch": 3.0842607313195547, + "grad_norm": 0.9454272741350396, + "learning_rate": 3.288382570571067e-05, + "loss": 0.1629, + "step": 1940 + }, + { + "epoch": 3.0858505564387917, + "grad_norm": 1.1817530758763106, + "learning_rate": 3.288791427989863e-05, + "loss": 0.1738, + "step": 1941 + }, + { + "epoch": 3.0874403815580287, + "grad_norm": 1.2163507639562412, + "learning_rate": 3.2892005261270074e-05, + "loss": 0.1684, + "step": 1942 + }, + { + "epoch": 3.0890302066772657, + "grad_norm": 1.10392308378242, + "learning_rate": 3.289609864844037e-05, + "loss": 0.1927, + "step": 1943 + }, + { + "epoch": 3.0906200317965022, + "grad_norm": 1.263396106097779, + "learning_rate": 3.290019444002403e-05, + "loss": 0.1577, + "step": 1944 + }, + { + "epoch": 3.0922098569157392, + "grad_norm": 0.9845783198187106, + "learning_rate": 3.2904292634634795e-05, + "loss": 0.1886, + "step": 1945 + }, + { + "epoch": 3.0937996820349762, + "grad_norm": 1.0762094068529975, + "learning_rate": 3.290839323088556e-05, + "loss": 0.1507, + "step": 1946 + }, + { + "epoch": 3.0953895071542132, + "grad_norm": 1.3904239721622995, + "learning_rate": 3.2912496227388444e-05, + "loss": 0.1701, + "step": 1947 + }, + { + "epoch": 3.09697933227345, + "grad_norm": 1.398537309104411, + "learning_rate": 3.291660162275471e-05, + "loss": 0.1722, + "step": 1948 + }, + { + "epoch": 3.098569157392687, + "grad_norm": 1.344297316706831, + "learning_rate": 3.292070941559484e-05, + "loss": 0.1552, + "step": 1949 + }, + { + "epoch": 3.100158982511924, + "grad_norm": 0.841262345763392, + "learning_rate": 3.292481960451849e-05, + "loss": 0.1535, + "step": 1950 + }, + { + "epoch": 3.101748807631161, + "grad_norm": 0.7620259295472986, + "learning_rate": 3.2928932188134525e-05, + "loss": 0.1569, + "step": 1951 + }, + { + "epoch": 3.1033386327503973, + "grad_norm": 1.417824005812314, + "learning_rate": 3.293304716505096e-05, + "loss": 0.2298, + "step": 1952 + }, + { + "epoch": 3.1049284578696343, + "grad_norm": 0.8344711142697316, + "learning_rate": 3.293716453387505e-05, + "loss": 0.1687, + "step": 1953 + }, + { + "epoch": 3.1065182829888713, + "grad_norm": 1.8677102889037265, + "learning_rate": 3.2941284293213186e-05, + "loss": 0.2357, + "step": 1954 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 1.3798876719295738, + "learning_rate": 3.2945406441671e-05, + "loss": 0.1447, + "step": 1955 + }, + { + "epoch": 3.109697933227345, + "grad_norm": 3.6113097669741996, + "learning_rate": 3.29495309778533e-05, + "loss": 0.1674, + "step": 1956 + }, + { + "epoch": 3.111287758346582, + "grad_norm": 1.4186979959943251, + "learning_rate": 3.295365790036406e-05, + "loss": 0.2354, + "step": 1957 + }, + { + "epoch": 3.112877583465819, + "grad_norm": 1.2580283664386984, + "learning_rate": 3.2957787207806465e-05, + "loss": 0.1565, + "step": 1958 + }, + { + "epoch": 3.1144674085850554, + "grad_norm": 37.18545167642314, + "learning_rate": 3.29619188987829e-05, + "loss": 22.002, + "step": 1959 + }, + { + "epoch": 3.1160572337042924, + "grad_norm": 1.1559754608869024, + "learning_rate": 3.296605297189496e-05, + "loss": 0.1891, + "step": 1960 + }, + { + "epoch": 3.1176470588235294, + "grad_norm": 1.2714999828438585, + "learning_rate": 3.297018942574338e-05, + "loss": 0.2014, + "step": 1961 + }, + { + "epoch": 3.1192368839427664, + "grad_norm": 1.6043778443584225, + "learning_rate": 3.2974328258928137e-05, + "loss": 0.1208, + "step": 1962 + }, + { + "epoch": 3.120826709062003, + "grad_norm": 1.651638929725782, + "learning_rate": 3.2978469470048376e-05, + "loss": 0.195, + "step": 1963 + }, + { + "epoch": 3.12241653418124, + "grad_norm": 1.7351014190374314, + "learning_rate": 3.2982613057702446e-05, + "loss": 0.1747, + "step": 1964 + }, + { + "epoch": 3.124006359300477, + "grad_norm": 0.6126506364431229, + "learning_rate": 3.2986759020487906e-05, + "loss": 0.0996, + "step": 1965 + }, + { + "epoch": 3.125596184419714, + "grad_norm": 1.609898128216873, + "learning_rate": 3.299090735700149e-05, + "loss": 0.175, + "step": 1966 + }, + { + "epoch": 3.1271860095389505, + "grad_norm": 1.6780783515860238, + "learning_rate": 3.2995058065839136e-05, + "loss": 0.1567, + "step": 1967 + }, + { + "epoch": 3.1287758346581875, + "grad_norm": 1.2847573814103512, + "learning_rate": 3.2999211145595976e-05, + "loss": 0.1419, + "step": 1968 + }, + { + "epoch": 3.1303656597774245, + "grad_norm": 0.7533568806556772, + "learning_rate": 3.300336659486635e-05, + "loss": 0.105, + "step": 1969 + }, + { + "epoch": 3.1319554848966615, + "grad_norm": 0.9644498780308932, + "learning_rate": 3.300752441224378e-05, + "loss": 0.2062, + "step": 1970 + }, + { + "epoch": 3.133545310015898, + "grad_norm": 0.8185427540073015, + "learning_rate": 3.3011684596321004e-05, + "loss": 0.1449, + "step": 1971 + }, + { + "epoch": 3.135135135135135, + "grad_norm": 1.0768584128413883, + "learning_rate": 3.3015847145689936e-05, + "loss": 0.1707, + "step": 1972 + }, + { + "epoch": 3.136724960254372, + "grad_norm": 1.1263664526600041, + "learning_rate": 3.302001205894173e-05, + "loss": 0.1974, + "step": 1973 + }, + { + "epoch": 3.138314785373609, + "grad_norm": 0.9768191162392208, + "learning_rate": 3.302417933466669e-05, + "loss": 0.2151, + "step": 1974 + }, + { + "epoch": 3.1399046104928456, + "grad_norm": 1.3755838816110673, + "learning_rate": 3.302834897145436e-05, + "loss": 0.1743, + "step": 1975 + }, + { + "epoch": 3.1414944356120826, + "grad_norm": 0.6798102871890868, + "learning_rate": 3.303252096789345e-05, + "loss": 0.1561, + "step": 1976 + }, + { + "epoch": 3.1430842607313196, + "grad_norm": 0.781133820601312, + "learning_rate": 3.3036695322571906e-05, + "loss": 0.1196, + "step": 1977 + }, + { + "epoch": 3.1446740858505566, + "grad_norm": 2.072769586559616, + "learning_rate": 3.3040872034076855e-05, + "loss": 0.2347, + "step": 1978 + }, + { + "epoch": 3.146263910969793, + "grad_norm": 1.6849357484997305, + "learning_rate": 3.3045051100994644e-05, + "loss": 0.1685, + "step": 1979 + }, + { + "epoch": 3.14785373608903, + "grad_norm": 2.692333163987331, + "learning_rate": 3.3049232521910785e-05, + "loss": 0.2578, + "step": 1980 + }, + { + "epoch": 3.149443561208267, + "grad_norm": 0.9627159800236059, + "learning_rate": 3.3053416295410026e-05, + "loss": 0.0988, + "step": 1981 + }, + { + "epoch": 3.151033386327504, + "grad_norm": 0.9776701422595108, + "learning_rate": 3.3057602420076326e-05, + "loss": 0.1665, + "step": 1982 + }, + { + "epoch": 3.1526232114467407, + "grad_norm": 2.0385540158621125, + "learning_rate": 3.306179089449282e-05, + "loss": 0.1802, + "step": 1983 + }, + { + "epoch": 3.1542130365659777, + "grad_norm": 1.4048896694502067, + "learning_rate": 3.306598171724188e-05, + "loss": 0.2071, + "step": 1984 + }, + { + "epoch": 3.1558028616852147, + "grad_norm": 1.501249936428329, + "learning_rate": 3.3070174886905034e-05, + "loss": 0.2386, + "step": 1985 + }, + { + "epoch": 3.1573926868044513, + "grad_norm": 1.587898068718713, + "learning_rate": 3.3074370402063054e-05, + "loss": 0.1827, + "step": 1986 + }, + { + "epoch": 3.1589825119236883, + "grad_norm": 1.0132545643530786, + "learning_rate": 3.307856826129593e-05, + "loss": 0.1432, + "step": 1987 + }, + { + "epoch": 3.1605723370429253, + "grad_norm": 1.1635612617034106, + "learning_rate": 3.308276846318283e-05, + "loss": 0.1548, + "step": 1988 + }, + { + "epoch": 3.1621621621621623, + "grad_norm": 1.8734635262946138, + "learning_rate": 3.3086971006302126e-05, + "loss": 0.1398, + "step": 1989 + }, + { + "epoch": 3.1637519872813993, + "grad_norm": 0.9747597540983771, + "learning_rate": 3.309117588923142e-05, + "loss": 0.136, + "step": 1990 + }, + { + "epoch": 3.165341812400636, + "grad_norm": 32.13137658526767, + "learning_rate": 3.30953831105475e-05, + "loss": 20.4109, + "step": 1991 + }, + { + "epoch": 3.166931637519873, + "grad_norm": 1.4277880209360654, + "learning_rate": 3.3099592668826386e-05, + "loss": 0.125, + "step": 1992 + }, + { + "epoch": 3.16852146263911, + "grad_norm": 1.524782026029901, + "learning_rate": 3.3103804562643306e-05, + "loss": 0.1585, + "step": 1993 + }, + { + "epoch": 3.1701112877583464, + "grad_norm": 0.861977347980029, + "learning_rate": 3.310801879057266e-05, + "loss": 0.1378, + "step": 1994 + }, + { + "epoch": 3.1717011128775834, + "grad_norm": 1.0077426223162675, + "learning_rate": 3.3112235351188087e-05, + "loss": 0.1943, + "step": 1995 + }, + { + "epoch": 3.1732909379968204, + "grad_norm": 2.2943093068787683, + "learning_rate": 3.311645424306246e-05, + "loss": 0.1592, + "step": 1996 + }, + { + "epoch": 3.1748807631160574, + "grad_norm": 1.6875214584713865, + "learning_rate": 3.312067546476781e-05, + "loss": 0.1556, + "step": 1997 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 1.1806329465463816, + "learning_rate": 3.3124899014875426e-05, + "loss": 0.1388, + "step": 1998 + }, + { + "epoch": 3.178060413354531, + "grad_norm": 56.409258678504905, + "learning_rate": 3.312912489195577e-05, + "loss": 33.0671, + "step": 1999 + }, + { + "epoch": 3.179650238473768, + "grad_norm": 2.141471304691906, + "learning_rate": 3.313335309457854e-05, + "loss": 0.1528, + "step": 2000 + }, + { + "epoch": 3.181240063593005, + "grad_norm": 1.6639295605976259, + "learning_rate": 3.313758362131266e-05, + "loss": 0.1539, + "step": 2001 + }, + { + "epoch": 3.1828298887122415, + "grad_norm": 2.2856800720902926, + "learning_rate": 3.314181647072623e-05, + "loss": 0.255, + "step": 2002 + }, + { + "epoch": 3.1844197138314785, + "grad_norm": 1.3118614331928724, + "learning_rate": 3.3146051641386606e-05, + "loss": 0.1373, + "step": 2003 + }, + { + "epoch": 3.1860095389507155, + "grad_norm": 1.2733814165098716, + "learning_rate": 3.3150289131860306e-05, + "loss": 0.156, + "step": 2004 + }, + { + "epoch": 3.1875993640699525, + "grad_norm": 1.508294259693326, + "learning_rate": 3.315452894071311e-05, + "loss": 0.2343, + "step": 2005 + }, + { + "epoch": 3.189189189189189, + "grad_norm": 2.162535711209585, + "learning_rate": 3.315877106651e-05, + "loss": 0.1781, + "step": 2006 + }, + { + "epoch": 3.190779014308426, + "grad_norm": 2.3836983581709577, + "learning_rate": 3.316301550781516e-05, + "loss": 0.2031, + "step": 2007 + }, + { + "epoch": 3.192368839427663, + "grad_norm": 1.5693348330991597, + "learning_rate": 3.316726226319201e-05, + "loss": 0.1899, + "step": 2008 + }, + { + "epoch": 3.1939586645469, + "grad_norm": 2.046961823376242, + "learning_rate": 3.317151133120317e-05, + "loss": 0.1282, + "step": 2009 + }, + { + "epoch": 3.1955484896661366, + "grad_norm": 4.313264951099641, + "learning_rate": 3.317576271041049e-05, + "loss": 0.2294, + "step": 2010 + }, + { + "epoch": 3.1971383147853736, + "grad_norm": 2.6025331577579225, + "learning_rate": 3.318001639937501e-05, + "loss": 0.1757, + "step": 2011 + }, + { + "epoch": 3.1987281399046106, + "grad_norm": 1.8316778137434928, + "learning_rate": 3.318427239665705e-05, + "loss": 0.2137, + "step": 2012 + }, + { + "epoch": 3.2003179650238476, + "grad_norm": 4.0184933933952225, + "learning_rate": 3.318853070081608e-05, + "loss": 0.2147, + "step": 2013 + }, + { + "epoch": 3.201907790143084, + "grad_norm": 3.180541743830729, + "learning_rate": 3.3192791310410816e-05, + "loss": 0.2239, + "step": 2014 + }, + { + "epoch": 3.203497615262321, + "grad_norm": 3.547151935416979, + "learning_rate": 3.319705422399923e-05, + "loss": 0.1764, + "step": 2015 + }, + { + "epoch": 3.205087440381558, + "grad_norm": 2.708890069858936, + "learning_rate": 3.3201319440138433e-05, + "loss": 0.1673, + "step": 2016 + }, + { + "epoch": 3.2066772655007947, + "grad_norm": 2.810242563409344, + "learning_rate": 3.320558695738483e-05, + "loss": 0.1769, + "step": 2017 + }, + { + "epoch": 3.2082670906200317, + "grad_norm": 4.5665891109311305, + "learning_rate": 3.320985677429403e-05, + "loss": 0.2431, + "step": 2018 + }, + { + "epoch": 3.2098569157392687, + "grad_norm": 1.6811383945656806, + "learning_rate": 3.3214128889420835e-05, + "loss": 0.1495, + "step": 2019 + }, + { + "epoch": 3.2114467408585057, + "grad_norm": 1.5516439642875695, + "learning_rate": 3.3218403301319294e-05, + "loss": 0.1747, + "step": 2020 + }, + { + "epoch": 3.2130365659777427, + "grad_norm": 2.1251807156694613, + "learning_rate": 3.322268000854268e-05, + "loss": 0.1572, + "step": 2021 + }, + { + "epoch": 3.2146263910969792, + "grad_norm": 1.5052606934218604, + "learning_rate": 3.322695900964348e-05, + "loss": 0.1531, + "step": 2022 + }, + { + "epoch": 3.2162162162162162, + "grad_norm": 2.2431616919677904, + "learning_rate": 3.32312403031734e-05, + "loss": 0.1463, + "step": 2023 + }, + { + "epoch": 3.2178060413354532, + "grad_norm": 1.298867478561596, + "learning_rate": 3.323552388768338e-05, + "loss": 0.1616, + "step": 2024 + }, + { + "epoch": 3.21939586645469, + "grad_norm": 1.5018859916598755, + "learning_rate": 3.323980976172358e-05, + "loss": 0.1349, + "step": 2025 + }, + { + "epoch": 3.220985691573927, + "grad_norm": 2.982556851255493, + "learning_rate": 3.32440979238434e-05, + "loss": 0.1618, + "step": 2026 + }, + { + "epoch": 3.2225755166931638, + "grad_norm": 1.7714178314345896, + "learning_rate": 3.3248388372591435e-05, + "loss": 0.1617, + "step": 2027 + }, + { + "epoch": 3.2241653418124008, + "grad_norm": 2.2655010972965943, + "learning_rate": 3.3252681106515534e-05, + "loss": 0.1871, + "step": 2028 + }, + { + "epoch": 3.2257551669316373, + "grad_norm": 2.7541267080238323, + "learning_rate": 3.325697612416277e-05, + "loss": 0.1438, + "step": 2029 + }, + { + "epoch": 3.2273449920508743, + "grad_norm": 3.2554128941818172, + "learning_rate": 3.326127342407941e-05, + "loss": 0.2001, + "step": 2030 + }, + { + "epoch": 3.2289348171701113, + "grad_norm": 3.0231568447528194, + "learning_rate": 3.326557300481099e-05, + "loss": 0.1624, + "step": 2031 + }, + { + "epoch": 3.2305246422893483, + "grad_norm": 1.597346330492968, + "learning_rate": 3.3269874864902266e-05, + "loss": 0.183, + "step": 2032 + }, + { + "epoch": 3.232114467408585, + "grad_norm": 2.208834066194717, + "learning_rate": 3.32741790028972e-05, + "loss": 0.1846, + "step": 2033 + }, + { + "epoch": 3.233704292527822, + "grad_norm": 3.4048356114629046, + "learning_rate": 3.3278485417339004e-05, + "loss": 0.2179, + "step": 2034 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 2.3164545945181265, + "learning_rate": 3.32827941067701e-05, + "loss": 0.1463, + "step": 2035 + }, + { + "epoch": 3.236883942766296, + "grad_norm": 1.7994079248617014, + "learning_rate": 3.328710506973216e-05, + "loss": 0.1761, + "step": 2036 + }, + { + "epoch": 3.2384737678855324, + "grad_norm": 1.8010778234783207, + "learning_rate": 3.3291418304766094e-05, + "loss": 0.1471, + "step": 2037 + }, + { + "epoch": 3.2400635930047694, + "grad_norm": 1.3559843917733438, + "learning_rate": 3.329573381041201e-05, + "loss": 0.1789, + "step": 2038 + }, + { + "epoch": 3.2416534181240064, + "grad_norm": 4.1962914123268185, + "learning_rate": 3.330005158520927e-05, + "loss": 0.3014, + "step": 2039 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 47.58509618502178, + "learning_rate": 3.330437162769647e-05, + "loss": 22.9623, + "step": 2040 + }, + { + "epoch": 3.24483306836248, + "grad_norm": 3.7935446218238686, + "learning_rate": 3.3308693936411426e-05, + "loss": 0.2144, + "step": 2041 + }, + { + "epoch": 3.246422893481717, + "grad_norm": 2.2487300045613847, + "learning_rate": 3.331301850989118e-05, + "loss": 0.1711, + "step": 2042 + }, + { + "epoch": 3.248012718600954, + "grad_norm": 1.9941419530698707, + "learning_rate": 3.331734534667205e-05, + "loss": 0.1675, + "step": 2043 + }, + { + "epoch": 3.249602543720191, + "grad_norm": 2.1261608900167204, + "learning_rate": 3.3321674445289536e-05, + "loss": 0.1439, + "step": 2044 + }, + { + "epoch": 3.2511923688394275, + "grad_norm": 3.0045998227534962, + "learning_rate": 3.3326005804278396e-05, + "loss": 0.2156, + "step": 2045 + }, + { + "epoch": 3.2527821939586645, + "grad_norm": 4.213304807376894, + "learning_rate": 3.333033942217264e-05, + "loss": 0.2483, + "step": 2046 + }, + { + "epoch": 3.2543720190779015, + "grad_norm": 3.9534928376186027, + "learning_rate": 3.333467529750548e-05, + "loss": 0.2007, + "step": 2047 + }, + { + "epoch": 3.255961844197138, + "grad_norm": 3.5995097561911384, + "learning_rate": 3.333901342880937e-05, + "loss": 0.2193, + "step": 2048 + }, + { + "epoch": 3.257551669316375, + "grad_norm": 3.7608789956159767, + "learning_rate": 3.334335381461603e-05, + "loss": 0.1536, + "step": 2049 + }, + { + "epoch": 3.259141494435612, + "grad_norm": 4.140493716440672, + "learning_rate": 3.33476964534564e-05, + "loss": 0.1823, + "step": 2050 + }, + { + "epoch": 3.260731319554849, + "grad_norm": 3.2947050907811355, + "learning_rate": 3.335204134386062e-05, + "loss": 0.21, + "step": 2051 + }, + { + "epoch": 3.262321144674086, + "grad_norm": 3.724432855304805, + "learning_rate": 3.335638848435814e-05, + "loss": 0.1665, + "step": 2052 + }, + { + "epoch": 3.2639109697933226, + "grad_norm": 1.4390746557597143, + "learning_rate": 3.336073787347759e-05, + "loss": 0.2013, + "step": 2053 + }, + { + "epoch": 3.2655007949125596, + "grad_norm": 2.138549521034436, + "learning_rate": 3.3365089509746854e-05, + "loss": 0.1581, + "step": 2054 + }, + { + "epoch": 3.2670906200317966, + "grad_norm": 2.0356620517618595, + "learning_rate": 3.336944339169308e-05, + "loss": 0.1841, + "step": 2055 + }, + { + "epoch": 3.268680445151033, + "grad_norm": 2.1871357520909402, + "learning_rate": 3.337379951784262e-05, + "loss": 0.1487, + "step": 2056 + }, + { + "epoch": 3.27027027027027, + "grad_norm": 3.3897997812299923, + "learning_rate": 3.33781578867211e-05, + "loss": 0.1627, + "step": 2057 + }, + { + "epoch": 3.271860095389507, + "grad_norm": 2.812923655334617, + "learning_rate": 3.338251849685336e-05, + "loss": 0.2008, + "step": 2058 + }, + { + "epoch": 3.273449920508744, + "grad_norm": 2.1589961141789766, + "learning_rate": 3.3386881346763476e-05, + "loss": 0.1448, + "step": 2059 + }, + { + "epoch": 3.275039745627981, + "grad_norm": 3.274878608170008, + "learning_rate": 3.339124643497481e-05, + "loss": 0.2327, + "step": 2060 + }, + { + "epoch": 3.2766295707472177, + "grad_norm": 19.682556968853664, + "learning_rate": 3.3395613760009926e-05, + "loss": 0.5019, + "step": 2061 + }, + { + "epoch": 3.2782193958664547, + "grad_norm": 2.375560645116687, + "learning_rate": 3.339998332039063e-05, + "loss": 0.175, + "step": 2062 + }, + { + "epoch": 3.2798092209856917, + "grad_norm": 1.9781578974665686, + "learning_rate": 3.3404355114638e-05, + "loss": 0.1498, + "step": 2063 + }, + { + "epoch": 3.2813990461049283, + "grad_norm": 1.3596175622607638, + "learning_rate": 3.3408729141272346e-05, + "loss": 0.217, + "step": 2064 + }, + { + "epoch": 3.2829888712241653, + "grad_norm": 3.934068218806691, + "learning_rate": 3.34131053988132e-05, + "loss": 0.2102, + "step": 2065 + }, + { + "epoch": 3.2845786963434023, + "grad_norm": 1.8627148598428587, + "learning_rate": 3.341748388577936e-05, + "loss": 0.1771, + "step": 2066 + }, + { + "epoch": 3.2861685214626393, + "grad_norm": 1.5054654612110512, + "learning_rate": 3.3421864600688886e-05, + "loss": 0.1943, + "step": 2067 + }, + { + "epoch": 3.287758346581876, + "grad_norm": 1.0810595631648718, + "learning_rate": 3.342624754205905e-05, + "loss": 0.1479, + "step": 2068 + }, + { + "epoch": 3.289348171701113, + "grad_norm": 2.3677587201051287, + "learning_rate": 3.343063270840637e-05, + "loss": 0.1579, + "step": 2069 + }, + { + "epoch": 3.29093799682035, + "grad_norm": 2.0453085968861364, + "learning_rate": 3.3435020098246656e-05, + "loss": 0.1811, + "step": 2070 + }, + { + "epoch": 3.292527821939587, + "grad_norm": 1.323429309382073, + "learning_rate": 3.3439409710094935e-05, + "loss": 0.149, + "step": 2071 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 9.7879780388433, + "learning_rate": 3.3443801542465455e-05, + "loss": 39.6278, + "step": 2072 + }, + { + "epoch": 3.2957074721780604, + "grad_norm": 18.584709452989777, + "learning_rate": 3.344819559387175e-05, + "loss": 7.2133, + "step": 2073 + }, + { + "epoch": 3.2972972972972974, + "grad_norm": 3.014184130755303, + "learning_rate": 3.345259186282661e-05, + "loss": 0.1924, + "step": 2074 + }, + { + "epoch": 3.2988871224165344, + "grad_norm": 4.150033421339138, + "learning_rate": 3.3456990347842036e-05, + "loss": 0.1853, + "step": 2075 + }, + { + "epoch": 3.300476947535771, + "grad_norm": 2.6950937769825067, + "learning_rate": 3.3461391047429305e-05, + "loss": 0.2621, + "step": 2076 + }, + { + "epoch": 3.302066772655008, + "grad_norm": 1.845369806123608, + "learning_rate": 3.3465793960098945e-05, + "loss": 0.2772, + "step": 2077 + }, + { + "epoch": 3.303656597774245, + "grad_norm": 4.434643034512295, + "learning_rate": 3.3470199084360735e-05, + "loss": 0.173, + "step": 2078 + }, + { + "epoch": 3.3052464228934815, + "grad_norm": 3.9911423063317724, + "learning_rate": 3.347460641872368e-05, + "loss": 0.2178, + "step": 2079 + }, + { + "epoch": 3.3068362480127185, + "grad_norm": 1.3368786895459137, + "learning_rate": 3.3479015961696085e-05, + "loss": 0.2259, + "step": 2080 + }, + { + "epoch": 3.3084260731319555, + "grad_norm": 2.300668625990305, + "learning_rate": 3.3483427711785454e-05, + "loss": 0.1603, + "step": 2081 + }, + { + "epoch": 3.3100158982511925, + "grad_norm": 2.328607631336273, + "learning_rate": 3.3487841667498575e-05, + "loss": 0.1426, + "step": 2082 + }, + { + "epoch": 3.3116057233704295, + "grad_norm": 2.2766915190712584, + "learning_rate": 3.349225782734149e-05, + "loss": 0.2161, + "step": 2083 + }, + { + "epoch": 3.313195548489666, + "grad_norm": 3.6611936435477963, + "learning_rate": 3.349667618981949e-05, + "loss": 0.1419, + "step": 2084 + }, + { + "epoch": 3.314785373608903, + "grad_norm": 2.9980482234918946, + "learning_rate": 3.3501096753437114e-05, + "loss": 0.1203, + "step": 2085 + }, + { + "epoch": 3.31637519872814, + "grad_norm": 1.7111955270348578, + "learning_rate": 3.350551951669816e-05, + "loss": 0.2149, + "step": 2086 + }, + { + "epoch": 3.3179650238473766, + "grad_norm": 37.303963807717935, + "learning_rate": 3.350994447810569e-05, + "loss": 16.3468, + "step": 2087 + }, + { + "epoch": 3.3195548489666136, + "grad_norm": 2.5257303017998574, + "learning_rate": 3.351437163616202e-05, + "loss": 0.2372, + "step": 2088 + }, + { + "epoch": 3.3211446740858506, + "grad_norm": 2.959088143267741, + "learning_rate": 3.351880098936869e-05, + "loss": 0.2107, + "step": 2089 + }, + { + "epoch": 3.3227344992050876, + "grad_norm": 1.2716736817045031, + "learning_rate": 3.3523232536226546e-05, + "loss": 0.1768, + "step": 2090 + }, + { + "epoch": 3.3243243243243246, + "grad_norm": 1.9471410134252394, + "learning_rate": 3.352766627523568e-05, + "loss": 0.2063, + "step": 2091 + }, + { + "epoch": 3.325914149443561, + "grad_norm": 1.2465839795045928, + "learning_rate": 3.3532102204895395e-05, + "loss": 0.1965, + "step": 2092 + }, + { + "epoch": 3.327503974562798, + "grad_norm": 2.2432255659009672, + "learning_rate": 3.3536540323704336e-05, + "loss": 0.1735, + "step": 2093 + }, + { + "epoch": 3.329093799682035, + "grad_norm": 20.537803716846273, + "learning_rate": 3.354098063016033e-05, + "loss": 9.2448, + "step": 2094 + }, + { + "epoch": 3.3306836248012717, + "grad_norm": 2.557327523773593, + "learning_rate": 3.35454231227605e-05, + "loss": 0.1609, + "step": 2095 + }, + { + "epoch": 3.3322734499205087, + "grad_norm": 1.7436180161122463, + "learning_rate": 3.3549867800001224e-05, + "loss": 0.1877, + "step": 2096 + }, + { + "epoch": 3.3338632750397457, + "grad_norm": 3.0693422701988884, + "learning_rate": 3.3554314660378134e-05, + "loss": 0.1568, + "step": 2097 + }, + { + "epoch": 3.3354531001589827, + "grad_norm": 1.81735322159398, + "learning_rate": 3.355876370238614e-05, + "loss": 0.2079, + "step": 2098 + }, + { + "epoch": 3.337042925278219, + "grad_norm": 4.166750498568972, + "learning_rate": 3.3563214924519395e-05, + "loss": 0.1545, + "step": 2099 + }, + { + "epoch": 3.338632750397456, + "grad_norm": 1.570116510322975, + "learning_rate": 3.3567668325271324e-05, + "loss": 0.1559, + "step": 2100 + }, + { + "epoch": 3.340222575516693, + "grad_norm": 2.6392027116524024, + "learning_rate": 3.3572123903134616e-05, + "loss": 0.1922, + "step": 2101 + }, + { + "epoch": 3.34181240063593, + "grad_norm": 1.5507919497905496, + "learning_rate": 3.35765816566012e-05, + "loss": 0.1611, + "step": 2102 + }, + { + "epoch": 3.3434022257551668, + "grad_norm": 2.9069591648755027, + "learning_rate": 3.358104158416231e-05, + "loss": 0.1856, + "step": 2103 + }, + { + "epoch": 3.3449920508744038, + "grad_norm": 3.1764079695465677, + "learning_rate": 3.358550368430842e-05, + "loss": 0.2452, + "step": 2104 + }, + { + "epoch": 3.3465818759936408, + "grad_norm": 3.1109162341619863, + "learning_rate": 3.358996795552926e-05, + "loss": 0.1811, + "step": 2105 + }, + { + "epoch": 3.3481717011128778, + "grad_norm": 2.0614628653725657, + "learning_rate": 3.3594434396313846e-05, + "loss": 0.2231, + "step": 2106 + }, + { + "epoch": 3.3497615262321143, + "grad_norm": 3.150357181803421, + "learning_rate": 3.3598903005150444e-05, + "loss": 0.1274, + "step": 2107 + }, + { + "epoch": 3.3513513513513513, + "grad_norm": 4.41304512253067, + "learning_rate": 3.3603373780526594e-05, + "loss": 0.3273, + "step": 2108 + }, + { + "epoch": 3.3529411764705883, + "grad_norm": 3.1773168358969333, + "learning_rate": 3.36078467209291e-05, + "loss": 0.1698, + "step": 2109 + }, + { + "epoch": 3.3545310015898253, + "grad_norm": 2.7264620764431924, + "learning_rate": 3.3612321824844026e-05, + "loss": 0.1793, + "step": 2110 + }, + { + "epoch": 3.356120826709062, + "grad_norm": 3.3388799761897965, + "learning_rate": 3.361679909075671e-05, + "loss": 0.172, + "step": 2111 + }, + { + "epoch": 3.357710651828299, + "grad_norm": 8.132202886736007, + "learning_rate": 3.362127851715179e-05, + "loss": 0.2245, + "step": 2112 + }, + { + "epoch": 3.359300476947536, + "grad_norm": 3.0471120913808574, + "learning_rate": 3.36257601025131e-05, + "loss": 0.1993, + "step": 2113 + }, + { + "epoch": 3.360890302066773, + "grad_norm": 38.9031732410398, + "learning_rate": 3.363024384532381e-05, + "loss": 14.8347, + "step": 2114 + }, + { + "epoch": 3.3624801271860094, + "grad_norm": 3.9451946228445536, + "learning_rate": 3.363472974406633e-05, + "loss": 0.1653, + "step": 2115 + }, + { + "epoch": 3.3640699523052464, + "grad_norm": 4.974225852814844, + "learning_rate": 3.3639217797222356e-05, + "loss": 0.2454, + "step": 2116 + }, + { + "epoch": 3.3656597774244834, + "grad_norm": 2.943610320647188, + "learning_rate": 3.3643708003272827e-05, + "loss": 0.1967, + "step": 2117 + }, + { + "epoch": 3.36724960254372, + "grad_norm": 6.199498369217142, + "learning_rate": 3.364820036069799e-05, + "loss": 0.294, + "step": 2118 + }, + { + "epoch": 3.368839427662957, + "grad_norm": 3.876403373560721, + "learning_rate": 3.365269486797733e-05, + "loss": 0.1655, + "step": 2119 + }, + { + "epoch": 3.370429252782194, + "grad_norm": 4.450210813779476, + "learning_rate": 3.365719152358962e-05, + "loss": 0.1666, + "step": 2120 + }, + { + "epoch": 3.372019077901431, + "grad_norm": 6.199273332228174, + "learning_rate": 3.36616903260129e-05, + "loss": 0.2147, + "step": 2121 + }, + { + "epoch": 3.373608903020668, + "grad_norm": 5.461239816934954, + "learning_rate": 3.36661912737245e-05, + "loss": 0.1692, + "step": 2122 + }, + { + "epoch": 3.3751987281399045, + "grad_norm": 3.828740174271409, + "learning_rate": 3.367069436520101e-05, + "loss": 0.9425, + "step": 2123 + }, + { + "epoch": 3.3767885532591415, + "grad_norm": 7.3116931172107735, + "learning_rate": 3.367519959891829e-05, + "loss": 0.3303, + "step": 2124 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 8.853360203021568, + "learning_rate": 3.367970697335149e-05, + "loss": 0.2298, + "step": 2125 + }, + { + "epoch": 3.379968203497615, + "grad_norm": 5.031469142603491, + "learning_rate": 3.368421648697502e-05, + "loss": 0.195, + "step": 2126 + }, + { + "epoch": 3.381558028616852, + "grad_norm": 2.146405048242195, + "learning_rate": 3.368872813826259e-05, + "loss": 0.1756, + "step": 2127 + }, + { + "epoch": 3.383147853736089, + "grad_norm": 5.034097998991767, + "learning_rate": 3.3693241925687136e-05, + "loss": 0.1971, + "step": 2128 + }, + { + "epoch": 3.384737678855326, + "grad_norm": 3.611530120724988, + "learning_rate": 3.369775784772094e-05, + "loss": 0.1811, + "step": 2129 + }, + { + "epoch": 3.3863275039745626, + "grad_norm": 4.111126576492514, + "learning_rate": 3.3702275902835494e-05, + "loss": 0.1922, + "step": 2130 + }, + { + "epoch": 3.3879173290937996, + "grad_norm": 5.157071650324187, + "learning_rate": 3.3706796089501624e-05, + "loss": 0.1734, + "step": 2131 + }, + { + "epoch": 3.3895071542130366, + "grad_norm": 1.7201039368085091, + "learning_rate": 3.37113184061894e-05, + "loss": 0.2418, + "step": 2132 + }, + { + "epoch": 3.3910969793322736, + "grad_norm": 4.340771580040756, + "learning_rate": 3.371584285136819e-05, + "loss": 0.172, + "step": 2133 + }, + { + "epoch": 3.39268680445151, + "grad_norm": 2.337969475780816, + "learning_rate": 3.372036942350662e-05, + "loss": 0.2358, + "step": 2134 + }, + { + "epoch": 3.394276629570747, + "grad_norm": 3.088486372529841, + "learning_rate": 3.372489812107262e-05, + "loss": 0.1656, + "step": 2135 + }, + { + "epoch": 3.395866454689984, + "grad_norm": 4.311901676947814, + "learning_rate": 3.3729428942533384e-05, + "loss": 0.2293, + "step": 2136 + }, + { + "epoch": 3.397456279809221, + "grad_norm": 1.7915664512944196, + "learning_rate": 3.3733961886355394e-05, + "loss": 0.1557, + "step": 2137 + }, + { + "epoch": 3.3990461049284577, + "grad_norm": 1.2885339333863737, + "learning_rate": 3.373849695100442e-05, + "loss": 0.1144, + "step": 2138 + }, + { + "epoch": 3.4006359300476947, + "grad_norm": 41.737273747148684, + "learning_rate": 3.374303413494549e-05, + "loss": 15.6859, + "step": 2139 + }, + { + "epoch": 3.4022257551669317, + "grad_norm": 3.6992944174720863, + "learning_rate": 3.374757343664295e-05, + "loss": 0.1643, + "step": 2140 + }, + { + "epoch": 3.4038155802861687, + "grad_norm": 4.110467230295227, + "learning_rate": 3.37521148545604e-05, + "loss": 0.1788, + "step": 2141 + }, + { + "epoch": 3.4054054054054053, + "grad_norm": 2.9958323299699128, + "learning_rate": 3.3756658387160735e-05, + "loss": 0.2365, + "step": 2142 + }, + { + "epoch": 3.4069952305246423, + "grad_norm": 2.7046787458429256, + "learning_rate": 3.3761204032906134e-05, + "loss": 0.1948, + "step": 2143 + }, + { + "epoch": 3.4085850556438793, + "grad_norm": 3.3276913456507016, + "learning_rate": 3.3765751790258064e-05, + "loss": 0.1543, + "step": 2144 + }, + { + "epoch": 3.4101748807631163, + "grad_norm": 3.070162684081112, + "learning_rate": 3.3770301657677275e-05, + "loss": 0.1783, + "step": 2145 + }, + { + "epoch": 3.411764705882353, + "grad_norm": 3.1458464718347794, + "learning_rate": 3.37748536336238e-05, + "loss": 0.2202, + "step": 2146 + }, + { + "epoch": 3.41335453100159, + "grad_norm": 3.4623554352067476, + "learning_rate": 3.377940771655696e-05, + "loss": 0.1631, + "step": 2147 + }, + { + "epoch": 3.414944356120827, + "grad_norm": 2.775281363278864, + "learning_rate": 3.3783963904935367e-05, + "loss": 0.1884, + "step": 2148 + }, + { + "epoch": 3.4165341812400634, + "grad_norm": 2.8078079690465465, + "learning_rate": 3.37885221972169e-05, + "loss": 0.2201, + "step": 2149 + }, + { + "epoch": 3.4181240063593004, + "grad_norm": 4.107895412851775, + "learning_rate": 3.3793082591858753e-05, + "loss": 0.2434, + "step": 2150 + }, + { + "epoch": 3.4197138314785374, + "grad_norm": 3.1525698129921427, + "learning_rate": 3.379764508731741e-05, + "loss": 0.1979, + "step": 2151 + }, + { + "epoch": 3.4213036565977744, + "grad_norm": 4.579972422764068, + "learning_rate": 3.38022096820486e-05, + "loss": 0.2237, + "step": 2152 + }, + { + "epoch": 3.4228934817170114, + "grad_norm": 3.2064368093115676, + "learning_rate": 3.3806776374507395e-05, + "loss": 0.1749, + "step": 2153 + }, + { + "epoch": 3.424483306836248, + "grad_norm": 3.7168287919867002, + "learning_rate": 3.381134516314814e-05, + "loss": 0.1725, + "step": 2154 + }, + { + "epoch": 3.426073131955485, + "grad_norm": 2.9683114496060545, + "learning_rate": 3.381591604642446e-05, + "loss": 0.2521, + "step": 2155 + }, + { + "epoch": 3.427662957074722, + "grad_norm": 3.8528097786993465, + "learning_rate": 3.382048902278927e-05, + "loss": 0.1841, + "step": 2156 + }, + { + "epoch": 3.4292527821939585, + "grad_norm": 6.870375394785343, + "learning_rate": 3.382506409069479e-05, + "loss": 0.7296, + "step": 2157 + }, + { + "epoch": 3.4308426073131955, + "grad_norm": 2.1184206237802536, + "learning_rate": 3.382964124859252e-05, + "loss": 0.1378, + "step": 2158 + }, + { + "epoch": 3.4324324324324325, + "grad_norm": 3.000731446186443, + "learning_rate": 3.383422049493325e-05, + "loss": 0.1442, + "step": 2159 + }, + { + "epoch": 3.4340222575516695, + "grad_norm": 3.2782961136649056, + "learning_rate": 3.383880182816709e-05, + "loss": 0.1615, + "step": 2160 + }, + { + "epoch": 3.435612082670906, + "grad_norm": 4.054326351046822, + "learning_rate": 3.384338524674342e-05, + "loss": 0.2425, + "step": 2161 + }, + { + "epoch": 3.437201907790143, + "grad_norm": 2.8977262837324553, + "learning_rate": 3.384797074911091e-05, + "loss": 0.1867, + "step": 2162 + }, + { + "epoch": 3.43879173290938, + "grad_norm": 3.9481861133530023, + "learning_rate": 3.385255833371753e-05, + "loss": 0.1844, + "step": 2163 + }, + { + "epoch": 3.440381558028617, + "grad_norm": 2.00942043841239, + "learning_rate": 3.385714799901057e-05, + "loss": 0.1502, + "step": 2164 + }, + { + "epoch": 3.4419713831478536, + "grad_norm": 2.6894135580287992, + "learning_rate": 3.386173974343657e-05, + "loss": 0.1824, + "step": 2165 + }, + { + "epoch": 3.4435612082670906, + "grad_norm": 1.5990469325129022, + "learning_rate": 3.3866333565441406e-05, + "loss": 0.1805, + "step": 2166 + }, + { + "epoch": 3.4451510333863276, + "grad_norm": 3.2768738952473324, + "learning_rate": 3.387092946347023e-05, + "loss": 0.2358, + "step": 2167 + }, + { + "epoch": 3.4467408585055646, + "grad_norm": 2.2199109289611307, + "learning_rate": 3.387552743596751e-05, + "loss": 0.1938, + "step": 2168 + }, + { + "epoch": 3.448330683624801, + "grad_norm": 2.097812273139215, + "learning_rate": 3.388012748137698e-05, + "loss": 0.1297, + "step": 2169 + }, + { + "epoch": 3.449920508744038, + "grad_norm": 3.212255973167637, + "learning_rate": 3.388472959814169e-05, + "loss": 0.1765, + "step": 2170 + }, + { + "epoch": 3.451510333863275, + "grad_norm": 2.39076475124453, + "learning_rate": 3.3889333784704e-05, + "loss": 0.1489, + "step": 2171 + }, + { + "epoch": 3.453100158982512, + "grad_norm": 2.781727952033124, + "learning_rate": 3.389394003950556e-05, + "loss": 0.2081, + "step": 2172 + }, + { + "epoch": 3.4546899841017487, + "grad_norm": 2.7356780126644265, + "learning_rate": 3.389854836098732e-05, + "loss": 0.1744, + "step": 2173 + }, + { + "epoch": 3.4562798092209857, + "grad_norm": 2.111571839276, + "learning_rate": 3.3903158747589534e-05, + "loss": 0.1709, + "step": 2174 + }, + { + "epoch": 3.4578696343402227, + "grad_norm": 1.8040690783986737, + "learning_rate": 3.390777119775174e-05, + "loss": 0.1915, + "step": 2175 + }, + { + "epoch": 3.4594594594594597, + "grad_norm": 1.3262928303365507, + "learning_rate": 3.391238570991279e-05, + "loss": 0.176, + "step": 2176 + }, + { + "epoch": 3.461049284578696, + "grad_norm": 3.2926337738916747, + "learning_rate": 3.3917002282510864e-05, + "loss": 0.1997, + "step": 2177 + }, + { + "epoch": 3.462639109697933, + "grad_norm": 2.123643365442141, + "learning_rate": 3.3921620913983385e-05, + "loss": 0.1991, + "step": 2178 + }, + { + "epoch": 3.46422893481717, + "grad_norm": 1.6456511998951413, + "learning_rate": 3.392624160276714e-05, + "loss": 0.1589, + "step": 2179 + }, + { + "epoch": 3.4658187599364068, + "grad_norm": 55.64065635317461, + "learning_rate": 3.393086434729817e-05, + "loss": 24.0611, + "step": 2180 + }, + { + "epoch": 3.4674085850556438, + "grad_norm": 5.5795563955335465, + "learning_rate": 3.393548914601187e-05, + "loss": 0.2211, + "step": 2181 + }, + { + "epoch": 3.4689984101748808, + "grad_norm": 4.4981944211527365, + "learning_rate": 3.394011599734289e-05, + "loss": 0.2543, + "step": 2182 + }, + { + "epoch": 3.4705882352941178, + "grad_norm": 2.579640088456395, + "learning_rate": 3.394474489972522e-05, + "loss": 0.2024, + "step": 2183 + }, + { + "epoch": 3.4721780604133547, + "grad_norm": 2.4271961898846834, + "learning_rate": 3.394937585159214e-05, + "loss": 0.1729, + "step": 2184 + }, + { + "epoch": 3.4737678855325913, + "grad_norm": 3.5809252323718077, + "learning_rate": 3.395400885137625e-05, + "loss": 0.1688, + "step": 2185 + }, + { + "epoch": 3.4753577106518283, + "grad_norm": 3.3666480906915566, + "learning_rate": 3.395864389750944e-05, + "loss": 0.1815, + "step": 2186 + }, + { + "epoch": 3.4769475357710653, + "grad_norm": 1.414983950788271, + "learning_rate": 3.396328098842291e-05, + "loss": 0.1424, + "step": 2187 + }, + { + "epoch": 3.478537360890302, + "grad_norm": 2.7099769763544432, + "learning_rate": 3.396792012254718e-05, + "loss": 0.2029, + "step": 2188 + }, + { + "epoch": 3.480127186009539, + "grad_norm": 3.2846008613747064, + "learning_rate": 3.397256129831206e-05, + "loss": 0.1836, + "step": 2189 + }, + { + "epoch": 3.481717011128776, + "grad_norm": 2.3755013789416926, + "learning_rate": 3.3977204514146697e-05, + "loss": 0.1599, + "step": 2190 + }, + { + "epoch": 3.483306836248013, + "grad_norm": 3.576185000003846, + "learning_rate": 3.398184976847951e-05, + "loss": 0.1516, + "step": 2191 + }, + { + "epoch": 3.48489666136725, + "grad_norm": 1.3608002899041598, + "learning_rate": 3.3986497059738275e-05, + "loss": 0.166, + "step": 2192 + }, + { + "epoch": 3.4864864864864864, + "grad_norm": 2.7348729164966614, + "learning_rate": 3.3991146386350036e-05, + "loss": 0.2067, + "step": 2193 + }, + { + "epoch": 3.4880763116057234, + "grad_norm": 3.860754630372321, + "learning_rate": 3.399579774674116e-05, + "loss": 0.149, + "step": 2194 + }, + { + "epoch": 3.4896661367249604, + "grad_norm": 2.9614610542801074, + "learning_rate": 3.400045113933734e-05, + "loss": 0.234, + "step": 2195 + }, + { + "epoch": 3.491255961844197, + "grad_norm": 3.950100658150459, + "learning_rate": 3.4005106562563566e-05, + "loss": 0.2657, + "step": 2196 + }, + { + "epoch": 3.492845786963434, + "grad_norm": 1.981735627359607, + "learning_rate": 3.400976401484414e-05, + "loss": 0.1482, + "step": 2197 + }, + { + "epoch": 3.494435612082671, + "grad_norm": 2.5275298011325527, + "learning_rate": 3.40144234946027e-05, + "loss": 0.199, + "step": 2198 + }, + { + "epoch": 3.496025437201908, + "grad_norm": 4.411399450783077, + "learning_rate": 3.401908500026217e-05, + "loss": 0.1697, + "step": 2199 + }, + { + "epoch": 3.4976152623211445, + "grad_norm": 2.4092006239734753, + "learning_rate": 3.402374853024479e-05, + "loss": 0.1184, + "step": 2200 + }, + { + "epoch": 3.4992050874403815, + "grad_norm": 2.957952762723262, + "learning_rate": 3.4028414082972135e-05, + "loss": 0.1889, + "step": 2201 + }, + { + "epoch": 3.5007949125596185, + "grad_norm": 2.788155962104228, + "learning_rate": 3.4033081656865085e-05, + "loss": 0.1552, + "step": 2202 + }, + { + "epoch": 3.502384737678855, + "grad_norm": 2.438776921587329, + "learning_rate": 3.403775125034384e-05, + "loss": 0.206, + "step": 2203 + }, + { + "epoch": 3.503974562798092, + "grad_norm": 3.876453117979367, + "learning_rate": 3.404242286182791e-05, + "loss": 0.154, + "step": 2204 + }, + { + "epoch": 3.505564387917329, + "grad_norm": 3.239506735201599, + "learning_rate": 3.404709648973611e-05, + "loss": 0.2192, + "step": 2205 + }, + { + "epoch": 3.507154213036566, + "grad_norm": 2.5948439312648413, + "learning_rate": 3.4051772132486586e-05, + "loss": 0.2253, + "step": 2206 + }, + { + "epoch": 3.508744038155803, + "grad_norm": 4.030055172956011, + "learning_rate": 3.405644978849682e-05, + "loss": 0.2134, + "step": 2207 + }, + { + "epoch": 3.5103338632750396, + "grad_norm": 4.175105502326585, + "learning_rate": 3.4061129456183584e-05, + "loss": 0.1504, + "step": 2208 + }, + { + "epoch": 3.5119236883942766, + "grad_norm": 3.3917770467861166, + "learning_rate": 3.406581113396298e-05, + "loss": 0.21, + "step": 2209 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 2.5649197187696853, + "learning_rate": 3.4070494820250445e-05, + "loss": 0.1793, + "step": 2210 + }, + { + "epoch": 3.51510333863275, + "grad_norm": 3.4291171760907537, + "learning_rate": 3.40751805134607e-05, + "loss": 0.1714, + "step": 2211 + }, + { + "epoch": 3.516693163751987, + "grad_norm": 2.3631258905262, + "learning_rate": 3.4079868212007804e-05, + "loss": 0.2271, + "step": 2212 + }, + { + "epoch": 3.518282988871224, + "grad_norm": 3.3463495731738706, + "learning_rate": 3.4084557914305156e-05, + "loss": 0.1449, + "step": 2213 + }, + { + "epoch": 3.519872813990461, + "grad_norm": 5.134986789482722, + "learning_rate": 3.408924961876547e-05, + "loss": 0.2584, + "step": 2214 + }, + { + "epoch": 3.521462639109698, + "grad_norm": 3.363341197135916, + "learning_rate": 3.4093943323800746e-05, + "loss": 0.1694, + "step": 2215 + }, + { + "epoch": 3.5230524642289347, + "grad_norm": 2.0588947012531795, + "learning_rate": 3.4098639027822355e-05, + "loss": 0.2065, + "step": 2216 + }, + { + "epoch": 3.5246422893481717, + "grad_norm": 2.710768330059068, + "learning_rate": 3.410333672924097e-05, + "loss": 0.2022, + "step": 2217 + }, + { + "epoch": 3.5262321144674087, + "grad_norm": 2.778468558206064, + "learning_rate": 3.410803642646658e-05, + "loss": 0.1645, + "step": 2218 + }, + { + "epoch": 3.5278219395866453, + "grad_norm": 2.588865704206046, + "learning_rate": 3.411273811790852e-05, + "loss": 0.1556, + "step": 2219 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 2.1734756855357906, + "learning_rate": 3.411744180197542e-05, + "loss": 0.1738, + "step": 2220 + }, + { + "epoch": 3.5310015898251192, + "grad_norm": 1.859857625810375, + "learning_rate": 3.412214747707527e-05, + "loss": 0.2157, + "step": 2221 + }, + { + "epoch": 3.5325914149443562, + "grad_norm": 2.2348037869973583, + "learning_rate": 3.412685514161536e-05, + "loss": 0.2166, + "step": 2222 + }, + { + "epoch": 3.5341812400635932, + "grad_norm": 2.7988995354255506, + "learning_rate": 3.413156479400232e-05, + "loss": 0.1786, + "step": 2223 + }, + { + "epoch": 3.53577106518283, + "grad_norm": 2.5668557554612383, + "learning_rate": 3.413627643264211e-05, + "loss": 0.1877, + "step": 2224 + }, + { + "epoch": 3.537360890302067, + "grad_norm": 2.5269583639974114, + "learning_rate": 3.414099005594e-05, + "loss": 0.1769, + "step": 2225 + }, + { + "epoch": 3.538950715421304, + "grad_norm": 4.06429581804545, + "learning_rate": 3.41457056623006e-05, + "loss": 0.2037, + "step": 2226 + }, + { + "epoch": 3.5405405405405403, + "grad_norm": 2.518892455939632, + "learning_rate": 3.415042325012785e-05, + "loss": 0.1494, + "step": 2227 + }, + { + "epoch": 3.5421303656597773, + "grad_norm": 1.7214389669068981, + "learning_rate": 3.415514281782501e-05, + "loss": 0.1276, + "step": 2228 + }, + { + "epoch": 3.5437201907790143, + "grad_norm": 2.8492626005164525, + "learning_rate": 3.41598643637947e-05, + "loss": 0.1359, + "step": 2229 + }, + { + "epoch": 3.5453100158982513, + "grad_norm": 1.7768946683979763, + "learning_rate": 3.416458788643883e-05, + "loss": 0.1348, + "step": 2230 + }, + { + "epoch": 3.5468998410174883, + "grad_norm": 2.576159912904376, + "learning_rate": 3.4169313384158655e-05, + "loss": 0.2295, + "step": 2231 + }, + { + "epoch": 3.548489666136725, + "grad_norm": 36.9665246530489, + "learning_rate": 3.417404085535477e-05, + "loss": 15.4191, + "step": 2232 + }, + { + "epoch": 3.550079491255962, + "grad_norm": 3.525256989766491, + "learning_rate": 3.4178770298427105e-05, + "loss": 0.2494, + "step": 2233 + }, + { + "epoch": 3.551669316375199, + "grad_norm": 2.859370599697808, + "learning_rate": 3.418350171177491e-05, + "loss": 0.1658, + "step": 2234 + }, + { + "epoch": 3.5532591414944354, + "grad_norm": 1.9042829320072436, + "learning_rate": 3.418823509379677e-05, + "loss": 0.2296, + "step": 2235 + }, + { + "epoch": 3.5548489666136724, + "grad_norm": 4.016433547259195, + "learning_rate": 3.41929704428906e-05, + "loss": 0.2644, + "step": 2236 + }, + { + "epoch": 3.5564387917329094, + "grad_norm": 4.357613434026333, + "learning_rate": 3.419770775745367e-05, + "loss": 0.1314, + "step": 2237 + }, + { + "epoch": 3.5580286168521464, + "grad_norm": 14.057584097209778, + "learning_rate": 3.420244703588257e-05, + "loss": 5.4259, + "step": 2238 + }, + { + "epoch": 3.559618441971383, + "grad_norm": 2.253003561372279, + "learning_rate": 3.420718827657321e-05, + "loss": 0.1598, + "step": 2239 + }, + { + "epoch": 3.56120826709062, + "grad_norm": 5.751152761592095, + "learning_rate": 3.421193147792087e-05, + "loss": 0.2441, + "step": 2240 + }, + { + "epoch": 3.562798092209857, + "grad_norm": 2.6784110215551897, + "learning_rate": 3.4216676638320134e-05, + "loss": 0.1235, + "step": 2241 + }, + { + "epoch": 3.5643879173290935, + "grad_norm": 2.114680599685505, + "learning_rate": 3.422142375616495e-05, + "loss": 0.1323, + "step": 2242 + }, + { + "epoch": 3.5659777424483305, + "grad_norm": 4.075348934786634, + "learning_rate": 3.422617282984858e-05, + "loss": 0.2429, + "step": 2243 + }, + { + "epoch": 3.5675675675675675, + "grad_norm": 2.409238601758043, + "learning_rate": 3.4230923857763636e-05, + "loss": 0.2051, + "step": 2244 + }, + { + "epoch": 3.5691573926868045, + "grad_norm": 2.3893800850148454, + "learning_rate": 3.4235676838302066e-05, + "loss": 0.1243, + "step": 2245 + }, + { + "epoch": 3.5707472178060415, + "grad_norm": 2.5854761109997764, + "learning_rate": 3.4240431769855164e-05, + "loss": 0.1967, + "step": 2246 + }, + { + "epoch": 3.572337042925278, + "grad_norm": 2.9506156239410637, + "learning_rate": 3.4245188650813566e-05, + "loss": 0.1435, + "step": 2247 + }, + { + "epoch": 3.573926868044515, + "grad_norm": 3.301166821257964, + "learning_rate": 3.424994747956721e-05, + "loss": 0.1946, + "step": 2248 + }, + { + "epoch": 3.575516693163752, + "grad_norm": 3.320171001047837, + "learning_rate": 3.425470825450544e-05, + "loss": 0.2228, + "step": 2249 + }, + { + "epoch": 3.5771065182829886, + "grad_norm": 4.566361161712173, + "learning_rate": 3.4259470974016885e-05, + "loss": 0.22, + "step": 2250 + }, + { + "epoch": 3.5786963434022256, + "grad_norm": 3.029054064315815, + "learning_rate": 3.4264235636489544e-05, + "loss": 0.2434, + "step": 2251 + }, + { + "epoch": 3.5802861685214626, + "grad_norm": 1.8838927540230304, + "learning_rate": 3.426900224031074e-05, + "loss": 0.1651, + "step": 2252 + }, + { + "epoch": 3.5818759936406996, + "grad_norm": 4.596458434587626, + "learning_rate": 3.427377078386716e-05, + "loss": 0.228, + "step": 2253 + }, + { + "epoch": 3.5834658187599366, + "grad_norm": 3.8577389660577066, + "learning_rate": 3.4278541265544835e-05, + "loss": 0.2014, + "step": 2254 + }, + { + "epoch": 3.585055643879173, + "grad_norm": 6.338771202530857, + "learning_rate": 3.4283313683729115e-05, + "loss": 0.4152, + "step": 2255 + }, + { + "epoch": 3.58664546899841, + "grad_norm": 1.841274035894927, + "learning_rate": 3.4288088036804715e-05, + "loss": 0.1563, + "step": 2256 + }, + { + "epoch": 3.588235294117647, + "grad_norm": 5.6116912356050035, + "learning_rate": 3.429286432315568e-05, + "loss": 0.181, + "step": 2257 + }, + { + "epoch": 3.5898251192368837, + "grad_norm": 4.214938153688392, + "learning_rate": 3.429764254116542e-05, + "loss": 0.2211, + "step": 2258 + }, + { + "epoch": 3.5914149443561207, + "grad_norm": 2.6001930171720353, + "learning_rate": 3.430242268921669e-05, + "loss": 0.1626, + "step": 2259 + }, + { + "epoch": 3.5930047694753577, + "grad_norm": 5.918990854283879, + "learning_rate": 3.430720476569156e-05, + "loss": 0.4128, + "step": 2260 + }, + { + "epoch": 3.5945945945945947, + "grad_norm": 5.576818058686819, + "learning_rate": 3.431198876897148e-05, + "loss": 0.1494, + "step": 2261 + }, + { + "epoch": 3.5961844197138317, + "grad_norm": 5.907947288151928, + "learning_rate": 3.4316774697437244e-05, + "loss": 0.1731, + "step": 2262 + }, + { + "epoch": 3.5977742448330683, + "grad_norm": 2.327019084078932, + "learning_rate": 3.4321562549468995e-05, + "loss": 0.2101, + "step": 2263 + }, + { + "epoch": 3.5993640699523053, + "grad_norm": 2.7985626720709877, + "learning_rate": 3.43263523234462e-05, + "loss": 0.1317, + "step": 2264 + }, + { + "epoch": 3.6009538950715423, + "grad_norm": 4.072661424485802, + "learning_rate": 3.433114401774769e-05, + "loss": 0.1615, + "step": 2265 + }, + { + "epoch": 3.602543720190779, + "grad_norm": 2.8231238512874657, + "learning_rate": 3.4335937630751675e-05, + "loss": 0.1673, + "step": 2266 + }, + { + "epoch": 3.604133545310016, + "grad_norm": 3.3355505102611733, + "learning_rate": 3.434073316083567e-05, + "loss": 0.2443, + "step": 2267 + }, + { + "epoch": 3.605723370429253, + "grad_norm": 3.562202793634764, + "learning_rate": 3.4345530606376576e-05, + "loss": 0.1669, + "step": 2268 + }, + { + "epoch": 3.60731319554849, + "grad_norm": 1.7682861462217057, + "learning_rate": 3.435032996575062e-05, + "loss": 0.1754, + "step": 2269 + }, + { + "epoch": 3.6089030206677264, + "grad_norm": 2.447206838622675, + "learning_rate": 3.43551312373334e-05, + "loss": 0.1885, + "step": 2270 + }, + { + "epoch": 3.6104928457869634, + "grad_norm": 2.86177788923834, + "learning_rate": 3.435993441949985e-05, + "loss": 0.1619, + "step": 2271 + }, + { + "epoch": 3.6120826709062004, + "grad_norm": 2.686715163840309, + "learning_rate": 3.4364739510624285e-05, + "loss": 0.1933, + "step": 2272 + }, + { + "epoch": 3.613672496025437, + "grad_norm": 37.41382241954652, + "learning_rate": 3.436954650908034e-05, + "loss": 13.8121, + "step": 2273 + }, + { + "epoch": 3.615262321144674, + "grad_norm": 1.9255833882516404, + "learning_rate": 3.4374355413241026e-05, + "loss": 0.1871, + "step": 2274 + }, + { + "epoch": 3.616852146263911, + "grad_norm": 3.5383493640689485, + "learning_rate": 3.437916622147869e-05, + "loss": 0.1361, + "step": 2275 + }, + { + "epoch": 3.618441971383148, + "grad_norm": 2.1782744586869316, + "learning_rate": 3.4383978932165066e-05, + "loss": 0.1045, + "step": 2276 + }, + { + "epoch": 3.620031796502385, + "grad_norm": 2.9120566940800257, + "learning_rate": 3.438879354367123e-05, + "loss": 0.1437, + "step": 2277 + }, + { + "epoch": 3.6216216216216215, + "grad_norm": 2.2983772759256205, + "learning_rate": 3.4393610054367585e-05, + "loss": 0.202, + "step": 2278 + }, + { + "epoch": 3.6232114467408585, + "grad_norm": 1.8067785838437427, + "learning_rate": 3.439842846262394e-05, + "loss": 0.1745, + "step": 2279 + }, + { + "epoch": 3.6248012718600955, + "grad_norm": 3.4078253803416394, + "learning_rate": 3.4403248766809414e-05, + "loss": 0.1872, + "step": 2280 + }, + { + "epoch": 3.626391096979332, + "grad_norm": 2.5226845007078063, + "learning_rate": 3.440807096529253e-05, + "loss": 0.159, + "step": 2281 + }, + { + "epoch": 3.627980922098569, + "grad_norm": 2.5203270088636724, + "learning_rate": 3.441289505644114e-05, + "loss": 0.2339, + "step": 2282 + }, + { + "epoch": 3.629570747217806, + "grad_norm": 2.626330713440023, + "learning_rate": 3.441772103862248e-05, + "loss": 0.2438, + "step": 2283 + }, + { + "epoch": 3.631160572337043, + "grad_norm": 2.0502242323045494, + "learning_rate": 3.4422548910203095e-05, + "loss": 0.1861, + "step": 2284 + }, + { + "epoch": 3.63275039745628, + "grad_norm": 2.8756284595594077, + "learning_rate": 3.442737866954896e-05, + "loss": 0.2557, + "step": 2285 + }, + { + "epoch": 3.6343402225755166, + "grad_norm": 1.7745984119176768, + "learning_rate": 3.443221031502536e-05, + "loss": 0.2023, + "step": 2286 + }, + { + "epoch": 3.6359300476947536, + "grad_norm": 2.554434937357879, + "learning_rate": 3.443704384499695e-05, + "loss": 0.1187, + "step": 2287 + }, + { + "epoch": 3.6375198728139906, + "grad_norm": 2.620617746737545, + "learning_rate": 3.444187925782777e-05, + "loss": 0.1723, + "step": 2288 + }, + { + "epoch": 3.639109697933227, + "grad_norm": 2.922435096343134, + "learning_rate": 3.444671655188121e-05, + "loss": 0.2132, + "step": 2289 + }, + { + "epoch": 3.640699523052464, + "grad_norm": 2.694029556211865, + "learning_rate": 3.445155572552001e-05, + "loss": 0.1747, + "step": 2290 + }, + { + "epoch": 3.642289348171701, + "grad_norm": 4.540787233347911, + "learning_rate": 3.445639677710628e-05, + "loss": 0.235, + "step": 2291 + }, + { + "epoch": 3.643879173290938, + "grad_norm": 4.964047278314403, + "learning_rate": 3.446123970500152e-05, + "loss": 0.3568, + "step": 2292 + }, + { + "epoch": 3.645468998410175, + "grad_norm": 4.481992082432244, + "learning_rate": 3.446608450756656e-05, + "loss": 0.1436, + "step": 2293 + }, + { + "epoch": 3.6470588235294117, + "grad_norm": 3.22670473643323, + "learning_rate": 3.4470931183161605e-05, + "loss": 0.1988, + "step": 2294 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 3.671710213258525, + "learning_rate": 3.4475779730146245e-05, + "loss": 0.1582, + "step": 2295 + }, + { + "epoch": 3.6502384737678857, + "grad_norm": 4.2184587667245035, + "learning_rate": 3.448063014687942e-05, + "loss": 0.1759, + "step": 2296 + }, + { + "epoch": 3.6518282988871222, + "grad_norm": 3.106020188299759, + "learning_rate": 3.448548243171943e-05, + "loss": 0.1351, + "step": 2297 + }, + { + "epoch": 3.6534181240063592, + "grad_norm": 2.66723789554461, + "learning_rate": 3.449033658302396e-05, + "loss": 0.1831, + "step": 2298 + }, + { + "epoch": 3.6550079491255962, + "grad_norm": 1.9433657046836006, + "learning_rate": 3.449519259915005e-05, + "loss": 0.1679, + "step": 2299 + }, + { + "epoch": 3.6565977742448332, + "grad_norm": 2.4516316541226852, + "learning_rate": 3.450005047845411e-05, + "loss": 0.1844, + "step": 2300 + }, + { + "epoch": 3.65818759936407, + "grad_norm": 5.049219009301103, + "learning_rate": 3.4504910219291934e-05, + "loss": 0.183, + "step": 2301 + }, + { + "epoch": 3.659777424483307, + "grad_norm": 5.49481646203679, + "learning_rate": 3.450977182001869e-05, + "loss": 0.1974, + "step": 2302 + }, + { + "epoch": 3.661367249602544, + "grad_norm": 1.3940032561960058, + "learning_rate": 3.451463527898887e-05, + "loss": 0.1403, + "step": 2303 + }, + { + "epoch": 3.6629570747217803, + "grad_norm": 4.562428681671952, + "learning_rate": 3.451950059455638e-05, + "loss": 0.1869, + "step": 2304 + }, + { + "epoch": 3.6645468998410173, + "grad_norm": 4.936971297269469, + "learning_rate": 3.4524367765074494e-05, + "loss": 0.212, + "step": 2305 + }, + { + "epoch": 3.6661367249602543, + "grad_norm": 2.6060978318861237, + "learning_rate": 3.452923678889585e-05, + "loss": 0.1422, + "step": 2306 + }, + { + "epoch": 3.6677265500794913, + "grad_norm": 3.155356818348308, + "learning_rate": 3.4534107664372465e-05, + "loss": 0.239, + "step": 2307 + }, + { + "epoch": 3.6693163751987283, + "grad_norm": 11.813544448137588, + "learning_rate": 3.4538980389855704e-05, + "loss": 0.6999, + "step": 2308 + }, + { + "epoch": 3.670906200317965, + "grad_norm": 3.6389087904011648, + "learning_rate": 3.454385496369635e-05, + "loss": 0.1703, + "step": 2309 + }, + { + "epoch": 3.672496025437202, + "grad_norm": 2.8944301972882336, + "learning_rate": 3.454873138424452e-05, + "loss": 0.1967, + "step": 2310 + }, + { + "epoch": 3.674085850556439, + "grad_norm": 2.0656070003349427, + "learning_rate": 3.455360964984973e-05, + "loss": 0.1834, + "step": 2311 + }, + { + "epoch": 3.6756756756756754, + "grad_norm": 2.477057663255453, + "learning_rate": 3.455848975886086e-05, + "loss": 0.1726, + "step": 2312 + }, + { + "epoch": 3.6772655007949124, + "grad_norm": 42.26123804523394, + "learning_rate": 3.456337170962617e-05, + "loss": 12.6692, + "step": 2313 + }, + { + "epoch": 3.6788553259141494, + "grad_norm": 2.4892692291877654, + "learning_rate": 3.45682555004933e-05, + "loss": 0.1478, + "step": 2314 + }, + { + "epoch": 3.6804451510333864, + "grad_norm": 1.5452380338837028, + "learning_rate": 3.457314112980925e-05, + "loss": 0.145, + "step": 2315 + }, + { + "epoch": 3.6820349761526234, + "grad_norm": 3.28085964751207, + "learning_rate": 3.457802859592043e-05, + "loss": 0.2478, + "step": 2316 + }, + { + "epoch": 3.68362480127186, + "grad_norm": 1.8619416725266886, + "learning_rate": 3.4582917897172606e-05, + "loss": 0.1933, + "step": 2317 + }, + { + "epoch": 3.685214626391097, + "grad_norm": 3.0318040634766428, + "learning_rate": 3.4587809031910915e-05, + "loss": 0.1959, + "step": 2318 + }, + { + "epoch": 3.686804451510334, + "grad_norm": 6.527467021135294, + "learning_rate": 3.459270199847989e-05, + "loss": 0.2098, + "step": 2319 + }, + { + "epoch": 3.6883942766295705, + "grad_norm": 3.1986261677412853, + "learning_rate": 3.459759679522345e-05, + "loss": 0.149, + "step": 2320 + }, + { + "epoch": 3.6899841017488075, + "grad_norm": 1.0413873174843853, + "learning_rate": 3.460249342048487e-05, + "loss": 0.1505, + "step": 2321 + }, + { + "epoch": 3.6915739268680445, + "grad_norm": 2.435611173089463, + "learning_rate": 3.460739187260682e-05, + "loss": 0.1677, + "step": 2322 + }, + { + "epoch": 3.6931637519872815, + "grad_norm": 4.413936484957647, + "learning_rate": 3.461229214993136e-05, + "loss": 0.2017, + "step": 2323 + }, + { + "epoch": 3.6947535771065185, + "grad_norm": 2.7072352061062515, + "learning_rate": 3.461719425079993e-05, + "loss": 0.2308, + "step": 2324 + }, + { + "epoch": 3.696343402225755, + "grad_norm": 2.3537338098702585, + "learning_rate": 3.462209817355333e-05, + "loss": 0.1907, + "step": 2325 + }, + { + "epoch": 3.697933227344992, + "grad_norm": 2.066459671956203, + "learning_rate": 3.462700391653176e-05, + "loss": 0.2215, + "step": 2326 + }, + { + "epoch": 3.699523052464229, + "grad_norm": 2.0519213228534143, + "learning_rate": 3.463191147807482e-05, + "loss": 0.1468, + "step": 2327 + }, + { + "epoch": 3.7011128775834656, + "grad_norm": 2.452532896603811, + "learning_rate": 3.463682085652146e-05, + "loss": 0.1486, + "step": 2328 + }, + { + "epoch": 3.7027027027027026, + "grad_norm": 1.8992498150981318, + "learning_rate": 3.464173205021004e-05, + "loss": 0.1275, + "step": 2329 + }, + { + "epoch": 3.7042925278219396, + "grad_norm": 2.4716633414403417, + "learning_rate": 3.464664505747829e-05, + "loss": 0.2015, + "step": 2330 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 3.3707798231168296, + "learning_rate": 3.465155987666335e-05, + "loss": 0.1621, + "step": 2331 + }, + { + "epoch": 3.7074721780604136, + "grad_norm": 2.2108990507123565, + "learning_rate": 3.465647650610173e-05, + "loss": 0.2192, + "step": 2332 + }, + { + "epoch": 3.70906200317965, + "grad_norm": 56.21615648325044, + "learning_rate": 3.4661394944129334e-05, + "loss": 1.647, + "step": 2333 + }, + { + "epoch": 3.710651828298887, + "grad_norm": 3.8275669427765107, + "learning_rate": 3.466631518908143e-05, + "loss": 0.2487, + "step": 2334 + }, + { + "epoch": 3.7122416534181237, + "grad_norm": 6.050164351756658, + "learning_rate": 3.4671237239292705e-05, + "loss": 0.2014, + "step": 2335 + }, + { + "epoch": 3.7138314785373607, + "grad_norm": 3.109243734928668, + "learning_rate": 3.4676161093097213e-05, + "loss": 0.2037, + "step": 2336 + }, + { + "epoch": 3.7154213036565977, + "grad_norm": 2.8354605919419926, + "learning_rate": 3.4681086748828426e-05, + "loss": 0.2031, + "step": 2337 + }, + { + "epoch": 3.7170111287758347, + "grad_norm": 2.4843078172652246, + "learning_rate": 3.468601420481917e-05, + "loss": 0.2555, + "step": 2338 + }, + { + "epoch": 3.7186009538950717, + "grad_norm": 2.786951390503813, + "learning_rate": 3.469094345940169e-05, + "loss": 0.2061, + "step": 2339 + }, + { + "epoch": 3.7201907790143083, + "grad_norm": 1.6697304009388219, + "learning_rate": 3.469587451090761e-05, + "loss": 0.2119, + "step": 2340 + }, + { + "epoch": 3.7217806041335453, + "grad_norm": 3.5796718797231724, + "learning_rate": 3.470080735766795e-05, + "loss": 0.234, + "step": 2341 + }, + { + "epoch": 3.7233704292527823, + "grad_norm": 2.0991303793579568, + "learning_rate": 3.470574199801312e-05, + "loss": 0.1727, + "step": 2342 + }, + { + "epoch": 3.724960254372019, + "grad_norm": 1.6365875398144263, + "learning_rate": 3.471067843027291e-05, + "loss": 0.2046, + "step": 2343 + }, + { + "epoch": 3.726550079491256, + "grad_norm": 1.68264238342068, + "learning_rate": 3.471561665277653e-05, + "loss": 0.1954, + "step": 2344 + }, + { + "epoch": 3.728139904610493, + "grad_norm": 3.3875991538796857, + "learning_rate": 3.472055666385256e-05, + "loss": 0.18, + "step": 2345 + }, + { + "epoch": 3.72972972972973, + "grad_norm": 1.3583845518927988, + "learning_rate": 3.4725498461829006e-05, + "loss": 0.1499, + "step": 2346 + }, + { + "epoch": 3.731319554848967, + "grad_norm": 19.37809957544923, + "learning_rate": 3.473044204503322e-05, + "loss": 7.4579, + "step": 2347 + }, + { + "epoch": 3.7329093799682034, + "grad_norm": 0.7892535760689335, + "learning_rate": 3.4735387411792e-05, + "loss": 0.1631, + "step": 2348 + }, + { + "epoch": 3.7344992050874404, + "grad_norm": 2.4268763169751413, + "learning_rate": 3.474033456043152e-05, + "loss": 0.1397, + "step": 2349 + }, + { + "epoch": 3.7360890302066774, + "grad_norm": 1.5980243894926773, + "learning_rate": 3.474528348927732e-05, + "loss": 0.162, + "step": 2350 + }, + { + "epoch": 3.737678855325914, + "grad_norm": 3.6777773768793787, + "learning_rate": 3.47502341966544e-05, + "loss": 0.251, + "step": 2351 + }, + { + "epoch": 3.739268680445151, + "grad_norm": 1.3088722355668778, + "learning_rate": 3.475518668088711e-05, + "loss": 0.1674, + "step": 2352 + }, + { + "epoch": 3.740858505564388, + "grad_norm": 2.5353819230558408, + "learning_rate": 3.4760140940299205e-05, + "loss": 0.1111, + "step": 2353 + }, + { + "epoch": 3.742448330683625, + "grad_norm": 1.8682585379053294, + "learning_rate": 3.476509697321387e-05, + "loss": 0.1761, + "step": 2354 + }, + { + "epoch": 3.744038155802862, + "grad_norm": 2.6145514988595107, + "learning_rate": 3.477005477795365e-05, + "loss": 0.1363, + "step": 2355 + }, + { + "epoch": 3.7456279809220985, + "grad_norm": 2.0331056246104007, + "learning_rate": 3.4775014352840515e-05, + "loss": 0.2026, + "step": 2356 + }, + { + "epoch": 3.7472178060413355, + "grad_norm": 1.6606349979753297, + "learning_rate": 3.477997569619583e-05, + "loss": 0.1588, + "step": 2357 + }, + { + "epoch": 3.7488076311605725, + "grad_norm": 1.6941984181243817, + "learning_rate": 3.478493880634034e-05, + "loss": 0.1302, + "step": 2358 + }, + { + "epoch": 3.750397456279809, + "grad_norm": 2.694086276526322, + "learning_rate": 3.478990368159424e-05, + "loss": 0.2274, + "step": 2359 + }, + { + "epoch": 3.751987281399046, + "grad_norm": 1.429161015228244, + "learning_rate": 3.479487032027708e-05, + "loss": 0.1837, + "step": 2360 + }, + { + "epoch": 3.753577106518283, + "grad_norm": 2.3091023582664514, + "learning_rate": 3.4799838720707845e-05, + "loss": 0.166, + "step": 2361 + }, + { + "epoch": 3.75516693163752, + "grad_norm": 1.738031921692634, + "learning_rate": 3.4804808881204904e-05, + "loss": 0.1375, + "step": 2362 + }, + { + "epoch": 3.756756756756757, + "grad_norm": 1.0666717618053219, + "learning_rate": 3.480978080008605e-05, + "loss": 0.1521, + "step": 2363 + }, + { + "epoch": 3.7583465818759936, + "grad_norm": 1.446082465316099, + "learning_rate": 3.481475447566845e-05, + "loss": 0.1797, + "step": 2364 + }, + { + "epoch": 3.7599364069952306, + "grad_norm": 2.393680174347454, + "learning_rate": 3.48197299062687e-05, + "loss": 0.2275, + "step": 2365 + }, + { + "epoch": 3.7615262321144676, + "grad_norm": 2.750729858337788, + "learning_rate": 3.4824707090202807e-05, + "loss": 0.1902, + "step": 2366 + }, + { + "epoch": 3.763116057233704, + "grad_norm": 2.09719966026055, + "learning_rate": 3.482968602578616e-05, + "loss": 0.1771, + "step": 2367 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.8901560440594336, + "learning_rate": 3.483466671133358e-05, + "loss": 0.1709, + "step": 2368 + }, + { + "epoch": 3.766295707472178, + "grad_norm": 2.832412908842704, + "learning_rate": 3.483964914515929e-05, + "loss": 0.1774, + "step": 2369 + }, + { + "epoch": 3.767885532591415, + "grad_norm": 3.072405062063487, + "learning_rate": 3.4844633325576905e-05, + "loss": 0.1632, + "step": 2370 + }, + { + "epoch": 3.7694753577106517, + "grad_norm": 1.3775188252973474, + "learning_rate": 3.484961925089946e-05, + "loss": 0.1798, + "step": 2371 + }, + { + "epoch": 3.7710651828298887, + "grad_norm": 2.6656984173681115, + "learning_rate": 3.485460691943941e-05, + "loss": 0.2401, + "step": 2372 + }, + { + "epoch": 3.7726550079491257, + "grad_norm": 2.8410475481407405, + "learning_rate": 3.485959632950859e-05, + "loss": 0.1532, + "step": 2373 + }, + { + "epoch": 3.7742448330683622, + "grad_norm": 1.2800953904056138, + "learning_rate": 3.48645874794183e-05, + "loss": 0.1876, + "step": 2374 + }, + { + "epoch": 3.7758346581875992, + "grad_norm": 3.651196357860561, + "learning_rate": 3.486958036747919e-05, + "loss": 0.1661, + "step": 2375 + }, + { + "epoch": 3.7774244833068362, + "grad_norm": 1.967634903761419, + "learning_rate": 3.487457499200135e-05, + "loss": 0.1366, + "step": 2376 + }, + { + "epoch": 3.779014308426073, + "grad_norm": 57.894450147851586, + "learning_rate": 3.487957135129429e-05, + "loss": 23.0071, + "step": 2377 + }, + { + "epoch": 3.78060413354531, + "grad_norm": 1.9366433387358164, + "learning_rate": 3.488456944366691e-05, + "loss": 0.156, + "step": 2378 + }, + { + "epoch": 3.7821939586645468, + "grad_norm": 1.9819929713986302, + "learning_rate": 3.488956926742755e-05, + "loss": 0.1209, + "step": 2379 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 2.581085864964594, + "learning_rate": 3.489457082088394e-05, + "loss": 0.1926, + "step": 2380 + }, + { + "epoch": 3.7853736089030208, + "grad_norm": 4.493207814518764, + "learning_rate": 3.489957410234325e-05, + "loss": 0.2476, + "step": 2381 + }, + { + "epoch": 3.7869634340222573, + "grad_norm": 1.114143972082843, + "learning_rate": 3.4904579110112034e-05, + "loss": 0.1929, + "step": 2382 + }, + { + "epoch": 3.7885532591414943, + "grad_norm": 2.1586913375810783, + "learning_rate": 3.490958584249629e-05, + "loss": 0.1605, + "step": 2383 + }, + { + "epoch": 3.7901430842607313, + "grad_norm": 3.111469634313678, + "learning_rate": 3.491459429780141e-05, + "loss": 0.1804, + "step": 2384 + }, + { + "epoch": 3.7917329093799683, + "grad_norm": 2.1194095132951283, + "learning_rate": 3.4919604474332224e-05, + "loss": 0.2098, + "step": 2385 + }, + { + "epoch": 3.7933227344992053, + "grad_norm": 1.9405234973877377, + "learning_rate": 3.492461637039296e-05, + "loss": 0.164, + "step": 2386 + }, + { + "epoch": 3.794912559618442, + "grad_norm": 2.5648742357407484, + "learning_rate": 3.4929629984287275e-05, + "loss": 0.2008, + "step": 2387 + }, + { + "epoch": 3.796502384737679, + "grad_norm": 4.4408873963457545, + "learning_rate": 3.4934645314318245e-05, + "loss": 0.1842, + "step": 2388 + }, + { + "epoch": 3.798092209856916, + "grad_norm": 2.48413781072719, + "learning_rate": 3.493966235878836e-05, + "loss": 0.1586, + "step": 2389 + }, + { + "epoch": 3.7996820349761524, + "grad_norm": 2.386148581839202, + "learning_rate": 3.4944681115999535e-05, + "loss": 0.2059, + "step": 2390 + }, + { + "epoch": 3.8012718600953894, + "grad_norm": 1.5817612876378282, + "learning_rate": 3.49497015842531e-05, + "loss": 0.1519, + "step": 2391 + }, + { + "epoch": 3.8028616852146264, + "grad_norm": 1.9317169464537525, + "learning_rate": 3.4954723761849814e-05, + "loss": 0.1376, + "step": 2392 + }, + { + "epoch": 3.8044515103338634, + "grad_norm": 3.1973302093237743, + "learning_rate": 3.495974764708983e-05, + "loss": 0.201, + "step": 2393 + }, + { + "epoch": 3.8060413354531004, + "grad_norm": 3.0405341932530234, + "learning_rate": 3.4964773238272774e-05, + "loss": 0.1956, + "step": 2394 + }, + { + "epoch": 3.807631160572337, + "grad_norm": 1.7686012740430275, + "learning_rate": 3.4969800533697644e-05, + "loss": 0.1453, + "step": 2395 + }, + { + "epoch": 3.809220985691574, + "grad_norm": 5.717205893084337, + "learning_rate": 3.4974829531662905e-05, + "loss": 0.2696, + "step": 2396 + }, + { + "epoch": 3.810810810810811, + "grad_norm": 1.8058259312810452, + "learning_rate": 3.49798602304664e-05, + "loss": 0.1395, + "step": 2397 + }, + { + "epoch": 3.8124006359300475, + "grad_norm": 2.441482058096391, + "learning_rate": 3.498489262840543e-05, + "loss": 0.1405, + "step": 2398 + }, + { + "epoch": 3.8139904610492845, + "grad_norm": 2.9756814243122442, + "learning_rate": 3.498992672377671e-05, + "loss": 0.1775, + "step": 2399 + }, + { + "epoch": 3.8155802861685215, + "grad_norm": 1.756883062229637, + "learning_rate": 3.499496251487637e-05, + "loss": 0.1553, + "step": 2400 + }, + { + "epoch": 3.8171701112877585, + "grad_norm": 1.656680457274463, + "learning_rate": 3.5000000000000004e-05, + "loss": 0.1677, + "step": 2401 + }, + { + "epoch": 3.818759936406995, + "grad_norm": 12.710179507984915, + "learning_rate": 3.500503917744258e-05, + "loss": 0.4823, + "step": 2402 + }, + { + "epoch": 3.820349761526232, + "grad_norm": 2.649124846644406, + "learning_rate": 3.5010080045498535e-05, + "loss": 0.2023, + "step": 2403 + }, + { + "epoch": 3.821939586645469, + "grad_norm": 3.3603041894553916, + "learning_rate": 3.50151226024617e-05, + "loss": 0.2355, + "step": 2404 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 1.8062759740131256, + "learning_rate": 3.502016684662536e-05, + "loss": 0.1368, + "step": 2405 + }, + { + "epoch": 3.8251192368839426, + "grad_norm": 51.03220242009722, + "learning_rate": 3.5025212776282234e-05, + "loss": 20.0733, + "step": 2406 + }, + { + "epoch": 3.8267090620031796, + "grad_norm": 3.760107257941648, + "learning_rate": 3.5030260389724446e-05, + "loss": 0.2466, + "step": 2407 + }, + { + "epoch": 3.8282988871224166, + "grad_norm": 2.063777922408905, + "learning_rate": 3.503530968524356e-05, + "loss": 0.163, + "step": 2408 + }, + { + "epoch": 3.8298887122416536, + "grad_norm": 2.7472765168197246, + "learning_rate": 3.504036066113058e-05, + "loss": 0.1665, + "step": 2409 + }, + { + "epoch": 3.83147853736089, + "grad_norm": 14.259954166086972, + "learning_rate": 3.504541331567592e-05, + "loss": 0.2014, + "step": 2410 + }, + { + "epoch": 3.833068362480127, + "grad_norm": 1.9234807647109433, + "learning_rate": 3.505046764716946e-05, + "loss": 0.1582, + "step": 2411 + }, + { + "epoch": 3.834658187599364, + "grad_norm": 3.423325038728435, + "learning_rate": 3.505552365390048e-05, + "loss": 0.1668, + "step": 2412 + }, + { + "epoch": 3.8362480127186007, + "grad_norm": 4.090884866253771, + "learning_rate": 3.50605813341577e-05, + "loss": 0.2612, + "step": 2413 + }, + { + "epoch": 3.8378378378378377, + "grad_norm": 2.86284835758412, + "learning_rate": 3.506564068622927e-05, + "loss": 0.2451, + "step": 2414 + }, + { + "epoch": 3.8394276629570747, + "grad_norm": 6.745436587247749, + "learning_rate": 3.507070170840281e-05, + "loss": 0.2984, + "step": 2415 + }, + { + "epoch": 3.8410174880763117, + "grad_norm": 3.0543069602422723, + "learning_rate": 3.5075764398965334e-05, + "loss": 0.2088, + "step": 2416 + }, + { + "epoch": 3.8426073131955487, + "grad_norm": 1.9401446800764404, + "learning_rate": 3.5080828756203295e-05, + "loss": 0.1326, + "step": 2417 + }, + { + "epoch": 3.8441971383147853, + "grad_norm": 3.3092215033239722, + "learning_rate": 3.50858947784026e-05, + "loss": 0.1811, + "step": 2418 + }, + { + "epoch": 3.8457869634340223, + "grad_norm": 5.095733416555802, + "learning_rate": 3.5090962463848594e-05, + "loss": 0.2737, + "step": 2419 + }, + { + "epoch": 3.8473767885532593, + "grad_norm": 2.851306681658006, + "learning_rate": 3.509603181082603e-05, + "loss": 0.2068, + "step": 2420 + }, + { + "epoch": 3.848966613672496, + "grad_norm": 4.305893085543297, + "learning_rate": 3.510110281761913e-05, + "loss": 0.2362, + "step": 2421 + }, + { + "epoch": 3.850556438791733, + "grad_norm": 3.9469952522170533, + "learning_rate": 3.510617548251154e-05, + "loss": 0.183, + "step": 2422 + }, + { + "epoch": 3.85214626391097, + "grad_norm": 2.2271723987204792, + "learning_rate": 3.511124980378634e-05, + "loss": 0.2052, + "step": 2423 + }, + { + "epoch": 3.853736089030207, + "grad_norm": 2.6914982073820446, + "learning_rate": 3.5116325779726066e-05, + "loss": 0.2303, + "step": 2424 + }, + { + "epoch": 3.855325914149444, + "grad_norm": 2.2197683212908053, + "learning_rate": 3.512140340861268e-05, + "loss": 0.2001, + "step": 2425 + }, + { + "epoch": 3.8569157392686804, + "grad_norm": 2.9430444680745884, + "learning_rate": 3.512648268872758e-05, + "loss": 0.2329, + "step": 2426 + }, + { + "epoch": 3.8585055643879174, + "grad_norm": 3.514802912130992, + "learning_rate": 3.513156361835162e-05, + "loss": 0.209, + "step": 2427 + }, + { + "epoch": 3.8600953895071544, + "grad_norm": 3.6130586054933898, + "learning_rate": 3.51366461957651e-05, + "loss": 0.1501, + "step": 2428 + }, + { + "epoch": 3.861685214626391, + "grad_norm": 2.7654406475294158, + "learning_rate": 3.514173041924773e-05, + "loss": 0.2438, + "step": 2429 + }, + { + "epoch": 3.863275039745628, + "grad_norm": 2.608737201712087, + "learning_rate": 3.514681628707871e-05, + "loss": 0.173, + "step": 2430 + }, + { + "epoch": 3.864864864864865, + "grad_norm": 2.3006024229888844, + "learning_rate": 3.515190379753663e-05, + "loss": 0.182, + "step": 2431 + }, + { + "epoch": 3.866454689984102, + "grad_norm": 3.484628928285874, + "learning_rate": 3.5156992948899576e-05, + "loss": 0.1638, + "step": 2432 + }, + { + "epoch": 3.868044515103339, + "grad_norm": 1.9459528299051319, + "learning_rate": 3.516208373944504e-05, + "loss": 0.1814, + "step": 2433 + }, + { + "epoch": 3.8696343402225755, + "grad_norm": 1.839963229647296, + "learning_rate": 3.5167176167449976e-05, + "loss": 0.1166, + "step": 2434 + }, + { + "epoch": 3.8712241653418125, + "grad_norm": 3.504119171750761, + "learning_rate": 3.5172270231190785e-05, + "loss": 0.1875, + "step": 2435 + }, + { + "epoch": 3.872813990461049, + "grad_norm": 2.662756725082136, + "learning_rate": 3.5177365928943314e-05, + "loss": 0.1704, + "step": 2436 + }, + { + "epoch": 3.874403815580286, + "grad_norm": 2.248953447929612, + "learning_rate": 3.5182463258982854e-05, + "loss": 0.1584, + "step": 2437 + }, + { + "epoch": 3.875993640699523, + "grad_norm": 3.379535349683668, + "learning_rate": 3.518756221958412e-05, + "loss": 0.165, + "step": 2438 + }, + { + "epoch": 3.87758346581876, + "grad_norm": 40.1916363439513, + "learning_rate": 3.5192662809021336e-05, + "loss": 14.5069, + "step": 2439 + }, + { + "epoch": 3.879173290937997, + "grad_norm": 2.15921144500605, + "learning_rate": 3.519776502556812e-05, + "loss": 0.1903, + "step": 2440 + }, + { + "epoch": 3.8807631160572336, + "grad_norm": 3.3737463743479674, + "learning_rate": 3.5202868867497535e-05, + "loss": 0.1614, + "step": 2441 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 2.6087143641687014, + "learning_rate": 3.520797433308215e-05, + "loss": 0.1999, + "step": 2442 + }, + { + "epoch": 3.8839427662957076, + "grad_norm": 2.282769125786468, + "learning_rate": 3.521308142059393e-05, + "loss": 0.1909, + "step": 2443 + }, + { + "epoch": 3.885532591414944, + "grad_norm": 2.4181488796289083, + "learning_rate": 3.521819012830432e-05, + "loss": 0.2073, + "step": 2444 + }, + { + "epoch": 3.887122416534181, + "grad_norm": 3.525508575349125, + "learning_rate": 3.522330045448421e-05, + "loss": 0.1132, + "step": 2445 + }, + { + "epoch": 3.888712241653418, + "grad_norm": 2.245257200779694, + "learning_rate": 3.5228412397403914e-05, + "loss": 0.1297, + "step": 2446 + }, + { + "epoch": 3.890302066772655, + "grad_norm": 3.960690661743815, + "learning_rate": 3.5233525955333254e-05, + "loss": 0.1479, + "step": 2447 + }, + { + "epoch": 3.891891891891892, + "grad_norm": 2.607574382611782, + "learning_rate": 3.523864112654147e-05, + "loss": 0.1997, + "step": 2448 + }, + { + "epoch": 3.8934817170111287, + "grad_norm": 1.3603443321277813, + "learning_rate": 3.524375790929725e-05, + "loss": 0.1728, + "step": 2449 + }, + { + "epoch": 3.8950715421303657, + "grad_norm": 2.398928494582779, + "learning_rate": 3.5248876301868754e-05, + "loss": 0.1807, + "step": 2450 + }, + { + "epoch": 3.8966613672496027, + "grad_norm": 2.072211532077674, + "learning_rate": 3.52539963025236e-05, + "loss": 0.2199, + "step": 2451 + }, + { + "epoch": 3.898251192368839, + "grad_norm": 23.75057433890675, + "learning_rate": 3.525911790952884e-05, + "loss": 4.3345, + "step": 2452 + }, + { + "epoch": 3.899841017488076, + "grad_norm": 4.151208819237776, + "learning_rate": 3.5264241121151e-05, + "loss": 0.1527, + "step": 2453 + }, + { + "epoch": 3.901430842607313, + "grad_norm": 2.52612688840614, + "learning_rate": 3.526936593565606e-05, + "loss": 0.1967, + "step": 2454 + }, + { + "epoch": 3.90302066772655, + "grad_norm": 3.0246190616549344, + "learning_rate": 3.527449235130946e-05, + "loss": 0.184, + "step": 2455 + }, + { + "epoch": 3.904610492845787, + "grad_norm": 4.271881579103825, + "learning_rate": 3.5279620366376087e-05, + "loss": 0.1454, + "step": 2456 + }, + { + "epoch": 3.9062003179650238, + "grad_norm": 3.5670224221087174, + "learning_rate": 3.52847499791203e-05, + "loss": 0.1479, + "step": 2457 + }, + { + "epoch": 3.9077901430842608, + "grad_norm": 2.806537248601152, + "learning_rate": 3.5289881187805904e-05, + "loss": 0.165, + "step": 2458 + }, + { + "epoch": 3.9093799682034978, + "grad_norm": 2.2702142539930974, + "learning_rate": 3.5295013990696175e-05, + "loss": 0.1798, + "step": 2459 + }, + { + "epoch": 3.9109697933227343, + "grad_norm": 2.1196379847661917, + "learning_rate": 3.5300148386053835e-05, + "loss": 0.1762, + "step": 2460 + }, + { + "epoch": 3.9125596184419713, + "grad_norm": 3.0816845470778316, + "learning_rate": 3.53052843721411e-05, + "loss": 0.1546, + "step": 2461 + }, + { + "epoch": 3.9141494435612083, + "grad_norm": 3.1863201638618643, + "learning_rate": 3.5310421947219595e-05, + "loss": 0.1619, + "step": 2462 + }, + { + "epoch": 3.9157392686804453, + "grad_norm": 3.1392011699340094, + "learning_rate": 3.5315561109550455e-05, + "loss": 0.2251, + "step": 2463 + }, + { + "epoch": 3.9173290937996823, + "grad_norm": 2.6968733323538197, + "learning_rate": 3.532070185739427e-05, + "loss": 0.2102, + "step": 2464 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 2.531282954476646, + "learning_rate": 3.5325844189011066e-05, + "loss": 0.1847, + "step": 2465 + }, + { + "epoch": 3.920508744038156, + "grad_norm": 3.1937133073499737, + "learning_rate": 3.5330988102660344e-05, + "loss": 0.1467, + "step": 2466 + }, + { + "epoch": 3.9220985691573924, + "grad_norm": 1.9510489840910583, + "learning_rate": 3.533613359660109e-05, + "loss": 0.1676, + "step": 2467 + }, + { + "epoch": 3.9236883942766294, + "grad_norm": 4.015000107746218, + "learning_rate": 3.5341280669091734e-05, + "loss": 0.1471, + "step": 2468 + }, + { + "epoch": 3.9252782193958664, + "grad_norm": 2.393296239677172, + "learning_rate": 3.534642931839018e-05, + "loss": 0.1383, + "step": 2469 + }, + { + "epoch": 3.9268680445151034, + "grad_norm": 3.7303861039752433, + "learning_rate": 3.535157954275381e-05, + "loss": 0.2361, + "step": 2470 + }, + { + "epoch": 3.9284578696343404, + "grad_norm": 3.0680192836164806, + "learning_rate": 3.535673134043943e-05, + "loss": 0.1495, + "step": 2471 + }, + { + "epoch": 3.930047694753577, + "grad_norm": 4.098729265569174, + "learning_rate": 3.536188470970337e-05, + "loss": 0.2483, + "step": 2472 + }, + { + "epoch": 3.931637519872814, + "grad_norm": 3.07416289221877, + "learning_rate": 3.536703964880138e-05, + "loss": 0.2466, + "step": 2473 + }, + { + "epoch": 3.933227344992051, + "grad_norm": 1.3144657546088647, + "learning_rate": 3.537219615598872e-05, + "loss": 0.1792, + "step": 2474 + }, + { + "epoch": 3.9348171701112875, + "grad_norm": 2.7890898563836126, + "learning_rate": 3.537735422952009e-05, + "loss": 0.1722, + "step": 2475 + }, + { + "epoch": 3.9364069952305245, + "grad_norm": 3.101523806384929, + "learning_rate": 3.538251386764966e-05, + "loss": 0.1842, + "step": 2476 + }, + { + "epoch": 3.9379968203497615, + "grad_norm": 3.1351566854918356, + "learning_rate": 3.5387675068631094e-05, + "loss": 0.1326, + "step": 2477 + }, + { + "epoch": 3.9395866454689985, + "grad_norm": 2.5568469699974385, + "learning_rate": 3.5392837830717506e-05, + "loss": 0.2199, + "step": 2478 + }, + { + "epoch": 3.9411764705882355, + "grad_norm": 3.152698562235238, + "learning_rate": 3.539800215216148e-05, + "loss": 0.2167, + "step": 2479 + }, + { + "epoch": 3.942766295707472, + "grad_norm": 2.3233547599794155, + "learning_rate": 3.54031680312151e-05, + "loss": 0.1961, + "step": 2480 + }, + { + "epoch": 3.944356120826709, + "grad_norm": 4.776217524395933, + "learning_rate": 3.540833546612989e-05, + "loss": 0.1808, + "step": 2481 + }, + { + "epoch": 3.945945945945946, + "grad_norm": 2.7665568807803105, + "learning_rate": 3.5413504455156854e-05, + "loss": 0.2102, + "step": 2482 + }, + { + "epoch": 3.9475357710651826, + "grad_norm": 2.2106151605749798, + "learning_rate": 3.541867499654649e-05, + "loss": 0.1709, + "step": 2483 + }, + { + "epoch": 3.9491255961844196, + "grad_norm": 2.1384481747260944, + "learning_rate": 3.542384708854874e-05, + "loss": 0.1429, + "step": 2484 + }, + { + "epoch": 3.9507154213036566, + "grad_norm": 4.0496993514586235, + "learning_rate": 3.542902072941306e-05, + "loss": 0.2919, + "step": 2485 + }, + { + "epoch": 3.9523052464228936, + "grad_norm": 2.609633301174429, + "learning_rate": 3.543419591738835e-05, + "loss": 0.2099, + "step": 2486 + }, + { + "epoch": 3.9538950715421306, + "grad_norm": 1.5510065122093692, + "learning_rate": 3.543937265072299e-05, + "loss": 0.1418, + "step": 2487 + }, + { + "epoch": 3.955484896661367, + "grad_norm": 3.036603358895255, + "learning_rate": 3.544455092766485e-05, + "loss": 0.182, + "step": 2488 + }, + { + "epoch": 3.957074721780604, + "grad_norm": 1.989447320850289, + "learning_rate": 3.5449730746461265e-05, + "loss": 0.1752, + "step": 2489 + }, + { + "epoch": 3.958664546899841, + "grad_norm": 3.0522344712938927, + "learning_rate": 3.545491210535906e-05, + "loss": 0.2401, + "step": 2490 + }, + { + "epoch": 3.9602543720190777, + "grad_norm": 2.5157826879854817, + "learning_rate": 3.5460095002604534e-05, + "loss": 0.2001, + "step": 2491 + }, + { + "epoch": 3.9618441971383147, + "grad_norm": 2.6362953045053654, + "learning_rate": 3.546527943644345e-05, + "loss": 0.2016, + "step": 2492 + }, + { + "epoch": 3.9634340222575517, + "grad_norm": 2.7176922938212016, + "learning_rate": 3.5470465405121096e-05, + "loss": 0.1552, + "step": 2493 + }, + { + "epoch": 3.9650238473767887, + "grad_norm": 1.352395017307062, + "learning_rate": 3.547565290688218e-05, + "loss": 0.185, + "step": 2494 + }, + { + "epoch": 3.9666136724960257, + "grad_norm": 2.9172143704607008, + "learning_rate": 3.5480841939970927e-05, + "loss": 0.1893, + "step": 2495 + }, + { + "epoch": 3.9682034976152623, + "grad_norm": 3.114537081421992, + "learning_rate": 3.548603250263104e-05, + "loss": 0.1486, + "step": 2496 + }, + { + "epoch": 3.9697933227344993, + "grad_norm": 2.4737064533545254, + "learning_rate": 3.549122459310569e-05, + "loss": 0.1496, + "step": 2497 + }, + { + "epoch": 3.9713831478537363, + "grad_norm": 1.6321133872125866, + "learning_rate": 3.549641820963757e-05, + "loss": 0.201, + "step": 2498 + }, + { + "epoch": 3.972972972972973, + "grad_norm": 1.8528968206770604, + "learning_rate": 3.55016133504688e-05, + "loss": 0.1783, + "step": 2499 + }, + { + "epoch": 3.97456279809221, + "grad_norm": 1.920757501079606, + "learning_rate": 3.550681001384104e-05, + "loss": 0.1023, + "step": 2500 + }, + { + "epoch": 3.976152623211447, + "grad_norm": 1.5605398462428834, + "learning_rate": 3.5512008197995385e-05, + "loss": 0.1576, + "step": 2501 + }, + { + "epoch": 3.977742448330684, + "grad_norm": 1.4653499470889104, + "learning_rate": 3.551720790117244e-05, + "loss": 0.1717, + "step": 2502 + }, + { + "epoch": 3.9793322734499204, + "grad_norm": 3.149158580056034, + "learning_rate": 3.552240912161231e-05, + "loss": 0.2491, + "step": 2503 + }, + { + "epoch": 3.9809220985691574, + "grad_norm": 1.8675693978738612, + "learning_rate": 3.552761185755455e-05, + "loss": 0.1301, + "step": 2504 + }, + { + "epoch": 3.9825119236883944, + "grad_norm": 1.269456174649296, + "learning_rate": 3.553281610723823e-05, + "loss": 0.137, + "step": 2505 + }, + { + "epoch": 3.984101748807631, + "grad_norm": 1.2907197159785364, + "learning_rate": 3.553802186890191e-05, + "loss": 0.1679, + "step": 2506 + }, + { + "epoch": 3.985691573926868, + "grad_norm": 4.382660389061243, + "learning_rate": 3.5543229140783626e-05, + "loss": 0.121, + "step": 2507 + }, + { + "epoch": 3.987281399046105, + "grad_norm": 3.9124623768642253, + "learning_rate": 3.554843792112089e-05, + "loss": 0.1956, + "step": 2508 + }, + { + "epoch": 3.988871224165342, + "grad_norm": 3.784450163471156, + "learning_rate": 3.5553648208150726e-05, + "loss": 0.1748, + "step": 2509 + }, + { + "epoch": 3.990461049284579, + "grad_norm": 3.260134734577807, + "learning_rate": 3.555886000010965e-05, + "loss": 0.1567, + "step": 2510 + }, + { + "epoch": 3.9920508744038155, + "grad_norm": 2.377956461241035, + "learning_rate": 3.5564073295233646e-05, + "loss": 0.1498, + "step": 2511 + }, + { + "epoch": 3.9936406995230525, + "grad_norm": 4.67320743724486, + "learning_rate": 3.5569288091758204e-05, + "loss": 0.2018, + "step": 2512 + }, + { + "epoch": 3.9952305246422894, + "grad_norm": 3.3038487149362137, + "learning_rate": 3.557450438791831e-05, + "loss": 0.1526, + "step": 2513 + }, + { + "epoch": 3.996820349761526, + "grad_norm": 2.4694900236450534, + "learning_rate": 3.557972218194844e-05, + "loss": 0.1414, + "step": 2514 + }, + { + "epoch": 3.998410174880763, + "grad_norm": 3.528485949937308, + "learning_rate": 3.558494147208255e-05, + "loss": 0.1758, + "step": 2515 + }, + { + "epoch": 4.0, + "grad_norm": 2.917378654821198, + "learning_rate": 3.55901622565541e-05, + "loss": 0.1948, + "step": 2516 + }, + { + "epoch": 4.001589825119237, + "grad_norm": 3.5125347488278544, + "learning_rate": 3.5595384533596046e-05, + "loss": 0.1839, + "step": 2517 + }, + { + "epoch": 4.003179650238474, + "grad_norm": 2.865393217103114, + "learning_rate": 3.5600608301440846e-05, + "loss": 0.196, + "step": 2518 + }, + { + "epoch": 4.004769475357711, + "grad_norm": 2.091703164723003, + "learning_rate": 3.560583355832044e-05, + "loss": 0.1191, + "step": 2519 + }, + { + "epoch": 4.006359300476947, + "grad_norm": 2.859144838286252, + "learning_rate": 3.561106030246625e-05, + "loss": 0.1672, + "step": 2520 + }, + { + "epoch": 4.007949125596184, + "grad_norm": 5.409872828541667, + "learning_rate": 3.561628853210923e-05, + "loss": 0.2442, + "step": 2521 + }, + { + "epoch": 4.009538950715421, + "grad_norm": 2.715933085530968, + "learning_rate": 3.5621518245479805e-05, + "loss": 0.1996, + "step": 2522 + }, + { + "epoch": 4.011128775834658, + "grad_norm": 2.2312917411922806, + "learning_rate": 3.5626749440807916e-05, + "loss": 0.1718, + "step": 2523 + }, + { + "epoch": 4.012718600953895, + "grad_norm": 1.6730027999192127, + "learning_rate": 3.563198211632298e-05, + "loss": 0.1261, + "step": 2524 + }, + { + "epoch": 4.014308426073132, + "grad_norm": 3.730518253755824, + "learning_rate": 3.5637216270253934e-05, + "loss": 0.2291, + "step": 2525 + }, + { + "epoch": 4.015898251192369, + "grad_norm": 2.4137992285459267, + "learning_rate": 3.564245190082921e-05, + "loss": 0.1388, + "step": 2526 + }, + { + "epoch": 4.017488076311606, + "grad_norm": 3.1883868886845823, + "learning_rate": 3.564768900627672e-05, + "loss": 0.2168, + "step": 2527 + }, + { + "epoch": 4.019077901430842, + "grad_norm": 1.937624566264173, + "learning_rate": 3.565292758482392e-05, + "loss": 0.2081, + "step": 2528 + }, + { + "epoch": 4.020667726550079, + "grad_norm": 2.146416869766526, + "learning_rate": 3.565816763469772e-05, + "loss": 0.2658, + "step": 2529 + }, + { + "epoch": 4.022257551669316, + "grad_norm": 2.8357864381839346, + "learning_rate": 3.5663409154124556e-05, + "loss": 0.1801, + "step": 2530 + }, + { + "epoch": 4.023847376788553, + "grad_norm": 2.9829409511608813, + "learning_rate": 3.5668652141330376e-05, + "loss": 0.1805, + "step": 2531 + }, + { + "epoch": 4.02543720190779, + "grad_norm": 2.3801137221671045, + "learning_rate": 3.567389659454059e-05, + "loss": 0.1914, + "step": 2532 + }, + { + "epoch": 4.027027027027027, + "grad_norm": 1.5228563979544558, + "learning_rate": 3.567914251198018e-05, + "loss": 0.1264, + "step": 2533 + }, + { + "epoch": 4.028616852146264, + "grad_norm": 1.9259917632666435, + "learning_rate": 3.568438989187356e-05, + "loss": 0.1804, + "step": 2534 + }, + { + "epoch": 4.030206677265501, + "grad_norm": 12.676223011378253, + "learning_rate": 3.5689638732444706e-05, + "loss": 3.5049, + "step": 2535 + }, + { + "epoch": 4.031796502384737, + "grad_norm": 2.8182668902691823, + "learning_rate": 3.569488903191705e-05, + "loss": 0.148, + "step": 2536 + }, + { + "epoch": 4.033386327503974, + "grad_norm": 2.7453346182150247, + "learning_rate": 3.570014078851357e-05, + "loss": 0.1654, + "step": 2537 + }, + { + "epoch": 4.034976152623211, + "grad_norm": 2.148787479534688, + "learning_rate": 3.570539400045674e-05, + "loss": 0.1551, + "step": 2538 + }, + { + "epoch": 4.036565977742448, + "grad_norm": 2.745298960248963, + "learning_rate": 3.571064866596854e-05, + "loss": 0.1961, + "step": 2539 + }, + { + "epoch": 4.038155802861685, + "grad_norm": 2.2169181664290076, + "learning_rate": 3.571590478327045e-05, + "loss": 0.2213, + "step": 2540 + }, + { + "epoch": 4.039745627980922, + "grad_norm": 3.435458225096928, + "learning_rate": 3.572116235058346e-05, + "loss": 0.2095, + "step": 2541 + }, + { + "epoch": 4.041335453100159, + "grad_norm": 32.04824081369827, + "learning_rate": 3.572642136612808e-05, + "loss": 8.5681, + "step": 2542 + }, + { + "epoch": 4.042925278219396, + "grad_norm": 2.4283899406764275, + "learning_rate": 3.573168182812432e-05, + "loss": 0.1958, + "step": 2543 + }, + { + "epoch": 4.044515103338632, + "grad_norm": 2.177852217748721, + "learning_rate": 3.573694373479171e-05, + "loss": 0.1666, + "step": 2544 + }, + { + "epoch": 4.046104928457869, + "grad_norm": 2.960537239514045, + "learning_rate": 3.574220708434928e-05, + "loss": 0.1965, + "step": 2545 + }, + { + "epoch": 4.047694753577106, + "grad_norm": 2.325024155037367, + "learning_rate": 3.574747187501557e-05, + "loss": 0.1652, + "step": 2546 + }, + { + "epoch": 4.049284578696343, + "grad_norm": 2.196849574016487, + "learning_rate": 3.575273810500866e-05, + "loss": 0.1584, + "step": 2547 + }, + { + "epoch": 4.05087440381558, + "grad_norm": 158.66865002808538, + "learning_rate": 3.57580057725461e-05, + "loss": 16.2171, + "step": 2548 + }, + { + "epoch": 4.052464228934817, + "grad_norm": 2.0229446352077405, + "learning_rate": 3.576327487584499e-05, + "loss": 0.1986, + "step": 2549 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 3.504939202939681, + "learning_rate": 3.576854541312192e-05, + "loss": 0.1752, + "step": 2550 + }, + { + "epoch": 4.0556438791732905, + "grad_norm": 2.703221637896262, + "learning_rate": 3.577381738259301e-05, + "loss": 0.1841, + "step": 2551 + }, + { + "epoch": 4.0572337042925275, + "grad_norm": 9.235755329496932, + "learning_rate": 3.577909078247388e-05, + "loss": 1.0857, + "step": 2552 + }, + { + "epoch": 4.0588235294117645, + "grad_norm": 3.083185840362384, + "learning_rate": 3.5784365610979685e-05, + "loss": 0.1658, + "step": 2553 + }, + { + "epoch": 4.0604133545310015, + "grad_norm": 3.044260922276935, + "learning_rate": 3.578964186632509e-05, + "loss": 0.1748, + "step": 2554 + }, + { + "epoch": 4.0620031796502385, + "grad_norm": 2.5188173064263477, + "learning_rate": 3.5794919546724264e-05, + "loss": 0.2696, + "step": 2555 + }, + { + "epoch": 4.0635930047694755, + "grad_norm": 2.799516264975812, + "learning_rate": 3.5800198650390906e-05, + "loss": 0.1868, + "step": 2556 + }, + { + "epoch": 4.0651828298887125, + "grad_norm": 2.8983494403645684, + "learning_rate": 3.580547917553823e-05, + "loss": 0.1993, + "step": 2557 + }, + { + "epoch": 4.0667726550079495, + "grad_norm": 3.4830796317011052, + "learning_rate": 3.5810761120378967e-05, + "loss": 0.2489, + "step": 2558 + }, + { + "epoch": 4.068362480127186, + "grad_norm": 2.4298914203302737, + "learning_rate": 3.5816044483125385e-05, + "loss": 0.1786, + "step": 2559 + }, + { + "epoch": 4.069952305246423, + "grad_norm": 2.7659351926985205, + "learning_rate": 3.582132926198923e-05, + "loss": 0.1875, + "step": 2560 + }, + { + "epoch": 4.07154213036566, + "grad_norm": 3.1983703432363644, + "learning_rate": 3.5826615455181826e-05, + "loss": 0.1711, + "step": 2561 + }, + { + "epoch": 4.073131955484897, + "grad_norm": 3.868628043510097, + "learning_rate": 3.583190306091396e-05, + "loss": 0.1592, + "step": 2562 + }, + { + "epoch": 4.074721780604134, + "grad_norm": 2.632769843890157, + "learning_rate": 3.583719207739599e-05, + "loss": 0.1719, + "step": 2563 + }, + { + "epoch": 4.076311605723371, + "grad_norm": 2.426780028206715, + "learning_rate": 3.584248250283777e-05, + "loss": 0.1788, + "step": 2564 + }, + { + "epoch": 4.077901430842608, + "grad_norm": 2.3334647190219706, + "learning_rate": 3.584777433544867e-05, + "loss": 0.1999, + "step": 2565 + }, + { + "epoch": 4.079491255961845, + "grad_norm": 2.681549399032922, + "learning_rate": 3.5853067573437606e-05, + "loss": 0.1617, + "step": 2566 + }, + { + "epoch": 4.081081081081081, + "grad_norm": 3.8034447868384347, + "learning_rate": 3.585836221501302e-05, + "loss": 0.1353, + "step": 2567 + }, + { + "epoch": 4.082670906200318, + "grad_norm": 1.8680499685002874, + "learning_rate": 3.586365825838285e-05, + "loss": 0.1611, + "step": 2568 + }, + { + "epoch": 4.084260731319555, + "grad_norm": 2.1274135878496065, + "learning_rate": 3.586895570175458e-05, + "loss": 0.1933, + "step": 2569 + }, + { + "epoch": 4.085850556438792, + "grad_norm": 1.7237852665809665, + "learning_rate": 3.587425454333523e-05, + "loss": 0.1104, + "step": 2570 + }, + { + "epoch": 4.087440381558029, + "grad_norm": 1.9171173271630355, + "learning_rate": 3.5879554781331314e-05, + "loss": 0.1657, + "step": 2571 + }, + { + "epoch": 4.089030206677266, + "grad_norm": 2.698412939090953, + "learning_rate": 3.588485641394891e-05, + "loss": 0.1956, + "step": 2572 + }, + { + "epoch": 4.090620031796503, + "grad_norm": 2.1542359336191095, + "learning_rate": 3.58901594393936e-05, + "loss": 0.1636, + "step": 2573 + }, + { + "epoch": 4.09220985691574, + "grad_norm": 1.6697561519569541, + "learning_rate": 3.589546385587051e-05, + "loss": 0.205, + "step": 2574 + }, + { + "epoch": 4.093799682034976, + "grad_norm": 1.5583387036635463, + "learning_rate": 3.5900769661584274e-05, + "loss": 0.206, + "step": 2575 + }, + { + "epoch": 4.095389507154213, + "grad_norm": 3.214191276768939, + "learning_rate": 3.5906076854739076e-05, + "loss": 0.1108, + "step": 2576 + }, + { + "epoch": 4.09697933227345, + "grad_norm": 1.7274702957364836, + "learning_rate": 3.5911385433538624e-05, + "loss": 0.1686, + "step": 2577 + }, + { + "epoch": 4.098569157392687, + "grad_norm": 1.8914262922436411, + "learning_rate": 3.591669539618616e-05, + "loss": 0.2483, + "step": 2578 + }, + { + "epoch": 4.100158982511924, + "grad_norm": 3.484272830076163, + "learning_rate": 3.592200674088444e-05, + "loss": 0.2068, + "step": 2579 + }, + { + "epoch": 4.101748807631161, + "grad_norm": 2.834870843716858, + "learning_rate": 3.5927319465835774e-05, + "loss": 0.2352, + "step": 2580 + }, + { + "epoch": 4.103338632750398, + "grad_norm": 2.731273231540694, + "learning_rate": 3.5932633569242e-05, + "loss": 0.183, + "step": 2581 + }, + { + "epoch": 4.104928457869635, + "grad_norm": 1.6257336209148003, + "learning_rate": 3.593794904930448e-05, + "loss": 0.1697, + "step": 2582 + }, + { + "epoch": 4.106518282988871, + "grad_norm": 2.566487916977648, + "learning_rate": 3.5943265904224134e-05, + "loss": 0.2068, + "step": 2583 + }, + { + "epoch": 4.108108108108108, + "grad_norm": 2.720067528068507, + "learning_rate": 3.594858413220137e-05, + "loss": 0.2471, + "step": 2584 + }, + { + "epoch": 4.109697933227345, + "grad_norm": 1.7671619872633417, + "learning_rate": 3.595390373143619e-05, + "loss": 0.1392, + "step": 2585 + }, + { + "epoch": 4.111287758346582, + "grad_norm": 1.7685577260004848, + "learning_rate": 3.5959224700128085e-05, + "loss": 0.2124, + "step": 2586 + }, + { + "epoch": 4.112877583465819, + "grad_norm": 1.870853995145257, + "learning_rate": 3.59645470364761e-05, + "loss": 0.1739, + "step": 2587 + }, + { + "epoch": 4.114467408585056, + "grad_norm": 1.7021865766966053, + "learning_rate": 3.596987073867882e-05, + "loss": 0.2072, + "step": 2588 + }, + { + "epoch": 4.116057233704293, + "grad_norm": 5.616823650089961, + "learning_rate": 3.5975195804934373e-05, + "loss": 7.6576, + "step": 2589 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 2.3711637678633855, + "learning_rate": 3.59805222334404e-05, + "loss": 0.1426, + "step": 2590 + }, + { + "epoch": 4.119236883942766, + "grad_norm": 0.943145945860843, + "learning_rate": 3.5985850022394115e-05, + "loss": 0.1638, + "step": 2591 + }, + { + "epoch": 4.120826709062003, + "grad_norm": 1.5680379217940754, + "learning_rate": 3.599117916999224e-05, + "loss": 0.1453, + "step": 2592 + }, + { + "epoch": 4.12241653418124, + "grad_norm": 2.301409541776319, + "learning_rate": 3.5996509674431056e-05, + "loss": 0.1689, + "step": 2593 + }, + { + "epoch": 4.124006359300477, + "grad_norm": 1.2457821087274152, + "learning_rate": 3.600184153390638e-05, + "loss": 0.1799, + "step": 2594 + }, + { + "epoch": 4.125596184419714, + "grad_norm": 2.014737641798629, + "learning_rate": 3.600717474661358e-05, + "loss": 0.2377, + "step": 2595 + }, + { + "epoch": 4.127186009538951, + "grad_norm": 2.2681098277587974, + "learning_rate": 3.601250931074754e-05, + "loss": 0.1899, + "step": 2596 + }, + { + "epoch": 4.128775834658188, + "grad_norm": 2.8617757235505654, + "learning_rate": 3.601784522450272e-05, + "loss": 0.1634, + "step": 2597 + }, + { + "epoch": 4.130365659777424, + "grad_norm": 1.430311352518948, + "learning_rate": 3.602318248607309e-05, + "loss": 0.1641, + "step": 2598 + }, + { + "epoch": 4.131955484896661, + "grad_norm": 3.601325069866256, + "learning_rate": 3.60285210936522e-05, + "loss": 0.2021, + "step": 2599 + }, + { + "epoch": 4.133545310015898, + "grad_norm": 1.8600422595400594, + "learning_rate": 3.60338610454331e-05, + "loss": 0.2222, + "step": 2600 + }, + { + "epoch": 4.135135135135135, + "grad_norm": 2.8717644323128377, + "learning_rate": 3.603920233960844e-05, + "loss": 0.1746, + "step": 2601 + }, + { + "epoch": 4.136724960254372, + "grad_norm": 2.1934706411692635, + "learning_rate": 3.6044544974370355e-05, + "loss": 0.1195, + "step": 2602 + }, + { + "epoch": 4.138314785373609, + "grad_norm": 1.2528174713899627, + "learning_rate": 3.604988894791057e-05, + "loss": 0.1527, + "step": 2603 + }, + { + "epoch": 4.139904610492846, + "grad_norm": 1.4102457846683318, + "learning_rate": 3.6055234258420346e-05, + "loss": 0.1601, + "step": 2604 + }, + { + "epoch": 4.141494435612083, + "grad_norm": 2.055286138422291, + "learning_rate": 3.606058090409049e-05, + "loss": 0.1792, + "step": 2605 + }, + { + "epoch": 4.143084260731319, + "grad_norm": 2.0721522562202876, + "learning_rate": 3.606592888311135e-05, + "loss": 0.2006, + "step": 2606 + }, + { + "epoch": 4.144674085850556, + "grad_norm": 1.2467439429735536, + "learning_rate": 3.607127819367283e-05, + "loss": 0.1412, + "step": 2607 + }, + { + "epoch": 4.146263910969793, + "grad_norm": 51.62272940003203, + "learning_rate": 3.607662883396439e-05, + "loss": 14.3185, + "step": 2608 + }, + { + "epoch": 4.14785373608903, + "grad_norm": 3.791169706538436, + "learning_rate": 3.6081980802175014e-05, + "loss": 0.373, + "step": 2609 + }, + { + "epoch": 4.149443561208267, + "grad_norm": 2.5846253450907457, + "learning_rate": 3.608733409649328e-05, + "loss": 0.14, + "step": 2610 + }, + { + "epoch": 4.151033386327504, + "grad_norm": 50.32377859297938, + "learning_rate": 3.609268871510727e-05, + "loss": 16.4595, + "step": 2611 + }, + { + "epoch": 4.152623211446741, + "grad_norm": 3.322793804234763, + "learning_rate": 3.6098044656204636e-05, + "loss": 0.2678, + "step": 2612 + }, + { + "epoch": 4.154213036565977, + "grad_norm": 2.6230785533858896, + "learning_rate": 3.6103401917972614e-05, + "loss": 0.1414, + "step": 2613 + }, + { + "epoch": 4.155802861685214, + "grad_norm": 3.6459821814226485, + "learning_rate": 3.610876049859794e-05, + "loss": 0.183, + "step": 2614 + }, + { + "epoch": 4.157392686804451, + "grad_norm": 2.158306729984067, + "learning_rate": 3.611412039626694e-05, + "loss": 0.1466, + "step": 2615 + }, + { + "epoch": 4.158982511923688, + "grad_norm": 3.214937128982499, + "learning_rate": 3.6119481609165476e-05, + "loss": 0.1645, + "step": 2616 + }, + { + "epoch": 4.160572337042925, + "grad_norm": 2.3220342397726514, + "learning_rate": 3.612484413547897e-05, + "loss": 0.1995, + "step": 2617 + }, + { + "epoch": 4.162162162162162, + "grad_norm": 2.608819600018179, + "learning_rate": 3.6130207973392415e-05, + "loss": 0.1891, + "step": 2618 + }, + { + "epoch": 4.163751987281399, + "grad_norm": 1.75453889544742, + "learning_rate": 3.613557312109033e-05, + "loss": 0.1855, + "step": 2619 + }, + { + "epoch": 4.165341812400636, + "grad_norm": 2.255692700689292, + "learning_rate": 3.6140939576756814e-05, + "loss": 0.1509, + "step": 2620 + }, + { + "epoch": 4.166931637519872, + "grad_norm": 3.7064843449878966, + "learning_rate": 3.614630733857552e-05, + "loss": 0.1668, + "step": 2621 + }, + { + "epoch": 4.168521462639109, + "grad_norm": 2.4847019646961312, + "learning_rate": 3.6151676404729645e-05, + "loss": 0.1556, + "step": 2622 + }, + { + "epoch": 4.170111287758346, + "grad_norm": 3.1156487284386216, + "learning_rate": 3.6157046773401964e-05, + "loss": 0.1454, + "step": 2623 + }, + { + "epoch": 4.171701112877583, + "grad_norm": 6.444378461489699, + "learning_rate": 3.61624184427748e-05, + "loss": 0.2125, + "step": 2624 + }, + { + "epoch": 4.17329093799682, + "grad_norm": 2.90917773299034, + "learning_rate": 3.616779141103003e-05, + "loss": 0.1813, + "step": 2625 + }, + { + "epoch": 4.174880763116057, + "grad_norm": 2.6017592717961695, + "learning_rate": 3.61731656763491e-05, + "loss": 0.1544, + "step": 2626 + }, + { + "epoch": 4.176470588235294, + "grad_norm": 3.4949250718379647, + "learning_rate": 3.617854123691303e-05, + "loss": 0.2069, + "step": 2627 + }, + { + "epoch": 4.178060413354531, + "grad_norm": 4.00988653636346, + "learning_rate": 3.618391809090238e-05, + "loss": 0.2029, + "step": 2628 + }, + { + "epoch": 4.1796502384737675, + "grad_norm": 3.0203370585041087, + "learning_rate": 3.6189296236497255e-05, + "loss": 0.1417, + "step": 2629 + }, + { + "epoch": 4.1812400635930045, + "grad_norm": 3.6484348364358326, + "learning_rate": 3.6194675671877395e-05, + "loss": 0.2191, + "step": 2630 + }, + { + "epoch": 4.1828298887122415, + "grad_norm": 2.109931995336289, + "learning_rate": 3.620005639522201e-05, + "loss": 0.104, + "step": 2631 + }, + { + "epoch": 4.1844197138314785, + "grad_norm": 5.322513605535563, + "learning_rate": 3.620543840470995e-05, + "loss": 0.215, + "step": 2632 + }, + { + "epoch": 4.1860095389507155, + "grad_norm": 2.822585307981762, + "learning_rate": 3.621082169851959e-05, + "loss": 0.1551, + "step": 2633 + }, + { + "epoch": 4.1875993640699525, + "grad_norm": 3.1168746712290836, + "learning_rate": 3.621620627482888e-05, + "loss": 0.2244, + "step": 2634 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 4.012009398444918, + "learning_rate": 3.622159213181533e-05, + "loss": 0.1913, + "step": 2635 + }, + { + "epoch": 4.1907790143084265, + "grad_norm": 5.589943858404057, + "learning_rate": 3.6226979267656035e-05, + "loss": 0.2269, + "step": 2636 + }, + { + "epoch": 4.192368839427663, + "grad_norm": 1.7220056346790287, + "learning_rate": 3.6232367680527634e-05, + "loss": 0.1426, + "step": 2637 + }, + { + "epoch": 4.1939586645469, + "grad_norm": 3.9570101958872157, + "learning_rate": 3.623775736860635e-05, + "loss": 0.2488, + "step": 2638 + }, + { + "epoch": 4.195548489666137, + "grad_norm": 3.826450792136232, + "learning_rate": 3.624314833006796e-05, + "loss": 0.1643, + "step": 2639 + }, + { + "epoch": 4.197138314785374, + "grad_norm": 32.55621151259951, + "learning_rate": 3.624854056308783e-05, + "loss": 9.861, + "step": 2640 + }, + { + "epoch": 4.198728139904611, + "grad_norm": 2.9895635358558863, + "learning_rate": 3.625393406584088e-05, + "loss": 0.1996, + "step": 2641 + }, + { + "epoch": 4.200317965023848, + "grad_norm": 2.9508203173429206, + "learning_rate": 3.625932883650161e-05, + "loss": 0.1982, + "step": 2642 + }, + { + "epoch": 4.201907790143085, + "grad_norm": 4.839071719036717, + "learning_rate": 3.6264724873244074e-05, + "loss": 0.1628, + "step": 2643 + }, + { + "epoch": 4.203497615262322, + "grad_norm": 4.174836149787471, + "learning_rate": 3.627012217424191e-05, + "loss": 0.1691, + "step": 2644 + }, + { + "epoch": 4.205087440381558, + "grad_norm": 4.920053909346024, + "learning_rate": 3.627552073766834e-05, + "loss": 0.1803, + "step": 2645 + }, + { + "epoch": 4.206677265500795, + "grad_norm": 3.3732409467923117, + "learning_rate": 3.628092056169614e-05, + "loss": 0.2197, + "step": 2646 + }, + { + "epoch": 4.208267090620032, + "grad_norm": 5.29426581225197, + "learning_rate": 3.628632164449765e-05, + "loss": 0.2207, + "step": 2647 + }, + { + "epoch": 4.209856915739269, + "grad_norm": 5.354406853759848, + "learning_rate": 3.6291723984244824e-05, + "loss": 0.2292, + "step": 2648 + }, + { + "epoch": 4.211446740858506, + "grad_norm": 160.4010414373529, + "learning_rate": 3.629712757910915e-05, + "loss": 8.6029, + "step": 2649 + }, + { + "epoch": 4.213036565977743, + "grad_norm": 6.746407517547819, + "learning_rate": 3.630253242726171e-05, + "loss": 0.2074, + "step": 2650 + }, + { + "epoch": 4.21462639109698, + "grad_norm": 5.0885689546621995, + "learning_rate": 3.630793852687316e-05, + "loss": 0.1917, + "step": 2651 + }, + { + "epoch": 4.216216216216216, + "grad_norm": 2.357998478924187, + "learning_rate": 3.631334587611373e-05, + "loss": 0.1045, + "step": 2652 + }, + { + "epoch": 4.217806041335453, + "grad_norm": 4.71153141149594, + "learning_rate": 3.631875447315322e-05, + "loss": 0.2956, + "step": 2653 + }, + { + "epoch": 4.21939586645469, + "grad_norm": 8.098104413236705, + "learning_rate": 3.632416431616103e-05, + "loss": 0.1945, + "step": 2654 + }, + { + "epoch": 4.220985691573927, + "grad_norm": 2.637818313434111, + "learning_rate": 3.632957540330612e-05, + "loss": 0.1046, + "step": 2655 + }, + { + "epoch": 4.222575516693164, + "grad_norm": 5.925312865469627, + "learning_rate": 3.633498773275703e-05, + "loss": 0.1738, + "step": 2656 + }, + { + "epoch": 4.224165341812401, + "grad_norm": 9.967813560771157, + "learning_rate": 3.634040130268188e-05, + "loss": 0.2563, + "step": 2657 + }, + { + "epoch": 4.225755166931638, + "grad_norm": 3.848052597739168, + "learning_rate": 3.634581611124838e-05, + "loss": 0.133, + "step": 2658 + }, + { + "epoch": 4.227344992050875, + "grad_norm": 8.360872285952548, + "learning_rate": 3.6351232156623805e-05, + "loss": 0.246, + "step": 2659 + }, + { + "epoch": 4.228934817170111, + "grad_norm": 5.909109090532057, + "learning_rate": 3.6356649436975025e-05, + "loss": 0.1652, + "step": 2660 + }, + { + "epoch": 4.230524642289348, + "grad_norm": 4.32512618012791, + "learning_rate": 3.636206795046848e-05, + "loss": 0.1628, + "step": 2661 + }, + { + "epoch": 4.232114467408585, + "grad_norm": 6.356703735794279, + "learning_rate": 3.636748769527022e-05, + "loss": 0.2258, + "step": 2662 + }, + { + "epoch": 4.233704292527822, + "grad_norm": 10.909012764021467, + "learning_rate": 3.6372908669545833e-05, + "loss": 0.1897, + "step": 2663 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 5.471527227553544, + "learning_rate": 3.637833087146053e-05, + "loss": 0.3065, + "step": 2664 + }, + { + "epoch": 4.236883942766296, + "grad_norm": 4.102357036129911, + "learning_rate": 3.638375429917908e-05, + "loss": 0.1887, + "step": 2665 + }, + { + "epoch": 4.238473767885533, + "grad_norm": 8.825678254514678, + "learning_rate": 3.638917895086586e-05, + "loss": 0.2471, + "step": 2666 + }, + { + "epoch": 4.24006359300477, + "grad_norm": 5.85351207258296, + "learning_rate": 3.639460482468482e-05, + "loss": 0.2109, + "step": 2667 + }, + { + "epoch": 4.241653418124006, + "grad_norm": 4.0660631468204995, + "learning_rate": 3.640003191879948e-05, + "loss": 0.2062, + "step": 2668 + }, + { + "epoch": 4.243243243243243, + "grad_norm": 6.037543091691723, + "learning_rate": 3.6405460231373005e-05, + "loss": 0.2501, + "step": 2669 + }, + { + "epoch": 4.24483306836248, + "grad_norm": 2.8105689972528216, + "learning_rate": 3.641088976056807e-05, + "loss": 0.1706, + "step": 2670 + }, + { + "epoch": 4.246422893481717, + "grad_norm": 17.36328034885686, + "learning_rate": 3.6416320504547e-05, + "loss": 0.3135, + "step": 2671 + }, + { + "epoch": 4.248012718600954, + "grad_norm": 4.186353610505322, + "learning_rate": 3.6421752461471674e-05, + "loss": 0.2532, + "step": 2672 + }, + { + "epoch": 4.249602543720191, + "grad_norm": 9.842024940376808, + "learning_rate": 3.642718562950356e-05, + "loss": 0.1777, + "step": 2673 + }, + { + "epoch": 4.251192368839428, + "grad_norm": 6.128437705109032, + "learning_rate": 3.643262000680375e-05, + "loss": 0.2072, + "step": 2674 + }, + { + "epoch": 4.252782193958664, + "grad_norm": 6.517263165754503, + "learning_rate": 3.6438055591532895e-05, + "loss": 0.2246, + "step": 2675 + }, + { + "epoch": 4.254372019077901, + "grad_norm": 5.468729619601952, + "learning_rate": 3.644349238185124e-05, + "loss": 0.1952, + "step": 2676 + }, + { + "epoch": 4.255961844197138, + "grad_norm": 8.302234882987564, + "learning_rate": 3.644893037591863e-05, + "loss": 0.202, + "step": 2677 + }, + { + "epoch": 4.257551669316375, + "grad_norm": 6.283138724901664, + "learning_rate": 3.645436957189451e-05, + "loss": 0.2065, + "step": 2678 + }, + { + "epoch": 4.259141494435612, + "grad_norm": 21.442714400362348, + "learning_rate": 3.6459809967937906e-05, + "loss": 5.8897, + "step": 2679 + }, + { + "epoch": 4.260731319554849, + "grad_norm": 4.054543492929826, + "learning_rate": 3.646525156220743e-05, + "loss": 0.1905, + "step": 2680 + }, + { + "epoch": 4.262321144674086, + "grad_norm": 6.20388183572734, + "learning_rate": 3.6470694352861315e-05, + "loss": 0.1921, + "step": 2681 + }, + { + "epoch": 4.263910969793323, + "grad_norm": 7.191170983352282, + "learning_rate": 3.6476138338057367e-05, + "loss": 0.2056, + "step": 2682 + }, + { + "epoch": 4.26550079491256, + "grad_norm": 2.1414628660681325, + "learning_rate": 3.648158351595298e-05, + "loss": 0.1841, + "step": 2683 + }, + { + "epoch": 4.267090620031796, + "grad_norm": 7.005617010890684, + "learning_rate": 3.648702988470518e-05, + "loss": 0.2057, + "step": 2684 + }, + { + "epoch": 4.268680445151033, + "grad_norm": 3.737852647160204, + "learning_rate": 3.6492477442470564e-05, + "loss": 0.1602, + "step": 2685 + }, + { + "epoch": 4.27027027027027, + "grad_norm": 4.184546399706757, + "learning_rate": 3.649792618740533e-05, + "loss": 0.2856, + "step": 2686 + }, + { + "epoch": 4.271860095389507, + "grad_norm": 4.463398609951982, + "learning_rate": 3.6503376117665265e-05, + "loss": 0.1866, + "step": 2687 + }, + { + "epoch": 4.273449920508744, + "grad_norm": 6.075873238989334, + "learning_rate": 3.6508827231405775e-05, + "loss": 0.2206, + "step": 2688 + }, + { + "epoch": 4.275039745627981, + "grad_norm": 3.638851809792017, + "learning_rate": 3.651427952678185e-05, + "loss": 0.2493, + "step": 2689 + }, + { + "epoch": 4.276629570747218, + "grad_norm": 3.960162781741858, + "learning_rate": 3.651973300194809e-05, + "loss": 0.2272, + "step": 2690 + }, + { + "epoch": 4.278219395866454, + "grad_norm": 5.503408794151038, + "learning_rate": 3.652518765505869e-05, + "loss": 0.179, + "step": 2691 + }, + { + "epoch": 4.279809220985691, + "grad_norm": 2.8986482810655003, + "learning_rate": 3.653064348426745e-05, + "loss": 0.1642, + "step": 2692 + }, + { + "epoch": 4.281399046104928, + "grad_norm": 3.995806164588751, + "learning_rate": 3.6536100487727755e-05, + "loss": 0.158, + "step": 2693 + }, + { + "epoch": 4.282988871224165, + "grad_norm": 2.8716230600332904, + "learning_rate": 3.654155866359263e-05, + "loss": 0.1757, + "step": 2694 + }, + { + "epoch": 4.284578696343402, + "grad_norm": 5.298558594128385, + "learning_rate": 3.654701801001466e-05, + "loss": 0.2072, + "step": 2695 + }, + { + "epoch": 4.286168521462639, + "grad_norm": 2.278107412142198, + "learning_rate": 3.655247852514606e-05, + "loss": 0.141, + "step": 2696 + }, + { + "epoch": 4.287758346581876, + "grad_norm": 3.1097924898701117, + "learning_rate": 3.655794020713865e-05, + "loss": 0.1991, + "step": 2697 + }, + { + "epoch": 4.289348171701113, + "grad_norm": 3.2331113837860266, + "learning_rate": 3.656340305414384e-05, + "loss": 0.1322, + "step": 2698 + }, + { + "epoch": 4.290937996820349, + "grad_norm": 3.740215850735703, + "learning_rate": 3.656886706431267e-05, + "loss": 0.135, + "step": 2699 + }, + { + "epoch": 4.292527821939586, + "grad_norm": 3.204813306660602, + "learning_rate": 3.657433223579574e-05, + "loss": 0.164, + "step": 2700 + }, + { + "epoch": 4.294117647058823, + "grad_norm": 1.7242666735296988, + "learning_rate": 3.6579798566743314e-05, + "loss": 0.1341, + "step": 2701 + }, + { + "epoch": 4.29570747217806, + "grad_norm": 3.196599253357943, + "learning_rate": 3.658526605530523e-05, + "loss": 0.1468, + "step": 2702 + }, + { + "epoch": 4.297297297297297, + "grad_norm": 2.9927088179256174, + "learning_rate": 3.659073469963094e-05, + "loss": 0.1536, + "step": 2703 + }, + { + "epoch": 4.298887122416534, + "grad_norm": 2.2110937165329148, + "learning_rate": 3.65962044978695e-05, + "loss": 0.1645, + "step": 2704 + }, + { + "epoch": 4.300476947535771, + "grad_norm": 3.6079806151496188, + "learning_rate": 3.660167544816959e-05, + "loss": 0.1787, + "step": 2705 + }, + { + "epoch": 4.302066772655008, + "grad_norm": 3.585262841904655, + "learning_rate": 3.660714754867949e-05, + "loss": 0.2007, + "step": 2706 + }, + { + "epoch": 4.3036565977742445, + "grad_norm": 2.569430965336825, + "learning_rate": 3.6612620797547084e-05, + "loss": 0.1616, + "step": 2707 + }, + { + "epoch": 4.3052464228934815, + "grad_norm": 2.9172509601080097, + "learning_rate": 3.661809519291989e-05, + "loss": 0.1645, + "step": 2708 + }, + { + "epoch": 4.3068362480127185, + "grad_norm": 2.5688462954812046, + "learning_rate": 3.6623570732945006e-05, + "loss": 0.1278, + "step": 2709 + }, + { + "epoch": 4.3084260731319555, + "grad_norm": 2.8645330308010792, + "learning_rate": 3.662904741576918e-05, + "loss": 0.1306, + "step": 2710 + }, + { + "epoch": 4.3100158982511925, + "grad_norm": 2.944714214475977, + "learning_rate": 3.663452523953874e-05, + "loss": 0.1686, + "step": 2711 + }, + { + "epoch": 4.3116057233704295, + "grad_norm": 1.9873329297346898, + "learning_rate": 3.664000420239964e-05, + "loss": 0.1678, + "step": 2712 + }, + { + "epoch": 4.3131955484896665, + "grad_norm": 4.9327898010633655, + "learning_rate": 3.664548430249745e-05, + "loss": 0.1922, + "step": 2713 + }, + { + "epoch": 4.314785373608903, + "grad_norm": 2.8359832019966627, + "learning_rate": 3.665096553797736e-05, + "loss": 0.1741, + "step": 2714 + }, + { + "epoch": 4.31637519872814, + "grad_norm": 2.949733494563496, + "learning_rate": 3.665644790698417e-05, + "loss": 0.1659, + "step": 2715 + }, + { + "epoch": 4.317965023847377, + "grad_norm": 4.045401247445941, + "learning_rate": 3.6661931407662294e-05, + "loss": 0.1806, + "step": 2716 + }, + { + "epoch": 4.319554848966614, + "grad_norm": 3.3181079553035544, + "learning_rate": 3.6667416038155756e-05, + "loss": 0.1936, + "step": 2717 + }, + { + "epoch": 4.321144674085851, + "grad_norm": 3.5146240760721836, + "learning_rate": 3.6672901796608224e-05, + "loss": 0.1889, + "step": 2718 + }, + { + "epoch": 4.322734499205088, + "grad_norm": 2.543682724162422, + "learning_rate": 3.6678388681162976e-05, + "loss": 0.2109, + "step": 2719 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 6.397625058600096, + "learning_rate": 3.668387668996286e-05, + "loss": 0.3312, + "step": 2720 + }, + { + "epoch": 4.325914149443562, + "grad_norm": 5.485836836138616, + "learning_rate": 3.6689365821150425e-05, + "loss": 0.1988, + "step": 2721 + }, + { + "epoch": 4.327503974562799, + "grad_norm": 3.7908427510107887, + "learning_rate": 3.669485607286777e-05, + "loss": 0.1191, + "step": 2722 + }, + { + "epoch": 4.329093799682035, + "grad_norm": 3.5461132810895775, + "learning_rate": 3.670034744325666e-05, + "loss": 0.1553, + "step": 2723 + }, + { + "epoch": 4.330683624801272, + "grad_norm": 5.055192122477079, + "learning_rate": 3.6705839930458466e-05, + "loss": 0.2001, + "step": 2724 + }, + { + "epoch": 4.332273449920509, + "grad_norm": 2.2907995961639154, + "learning_rate": 3.671133353261417e-05, + "loss": 0.1922, + "step": 2725 + }, + { + "epoch": 4.333863275039746, + "grad_norm": 6.0071808266270335, + "learning_rate": 3.6716828247864396e-05, + "loss": 0.1524, + "step": 2726 + }, + { + "epoch": 4.335453100158983, + "grad_norm": 2.919174510256984, + "learning_rate": 3.672232407434937e-05, + "loss": 0.1751, + "step": 2727 + }, + { + "epoch": 4.33704292527822, + "grad_norm": 3.9701786776207757, + "learning_rate": 3.6727821010208965e-05, + "loss": 0.144, + "step": 2728 + }, + { + "epoch": 4.338632750397457, + "grad_norm": 2.267128019843562, + "learning_rate": 3.673331905358266e-05, + "loss": 0.1352, + "step": 2729 + }, + { + "epoch": 4.340222575516693, + "grad_norm": 2.434928793100397, + "learning_rate": 3.673881820260957e-05, + "loss": 0.1599, + "step": 2730 + }, + { + "epoch": 4.34181240063593, + "grad_norm": 1.8483118056371717, + "learning_rate": 3.674431845542843e-05, + "loss": 0.1975, + "step": 2731 + }, + { + "epoch": 4.343402225755167, + "grad_norm": 3.828776479036021, + "learning_rate": 3.674981981017761e-05, + "loss": 0.155, + "step": 2732 + }, + { + "epoch": 4.344992050874404, + "grad_norm": 3.973162968564649, + "learning_rate": 3.67553222649951e-05, + "loss": 0.21, + "step": 2733 + }, + { + "epoch": 4.346581875993641, + "grad_norm": 2.6259682888789557, + "learning_rate": 3.67608258180185e-05, + "loss": 0.1385, + "step": 2734 + }, + { + "epoch": 4.348171701112878, + "grad_norm": 5.199045865872974, + "learning_rate": 3.6766330467385085e-05, + "loss": 0.1842, + "step": 2735 + }, + { + "epoch": 4.349761526232115, + "grad_norm": 3.639265218194823, + "learning_rate": 3.677183621123171e-05, + "loss": 0.2359, + "step": 2736 + }, + { + "epoch": 4.351351351351352, + "grad_norm": 6.6463089266357755, + "learning_rate": 3.677734304769489e-05, + "loss": 0.1993, + "step": 2737 + }, + { + "epoch": 4.352941176470588, + "grad_norm": 5.388148038750875, + "learning_rate": 3.678285097491075e-05, + "loss": 0.2136, + "step": 2738 + }, + { + "epoch": 4.354531001589825, + "grad_norm": 3.1318104063775785, + "learning_rate": 3.678835999101507e-05, + "loss": 0.1725, + "step": 2739 + }, + { + "epoch": 4.356120826709062, + "grad_norm": 4.1035092355690574, + "learning_rate": 3.679387009414324e-05, + "loss": 0.1257, + "step": 2740 + }, + { + "epoch": 4.357710651828299, + "grad_norm": 4.471408504940488, + "learning_rate": 3.679938128243029e-05, + "loss": 0.1449, + "step": 2741 + }, + { + "epoch": 4.359300476947536, + "grad_norm": 3.726520642595853, + "learning_rate": 3.6804893554010876e-05, + "loss": 0.2134, + "step": 2742 + }, + { + "epoch": 4.360890302066773, + "grad_norm": 2.5915664696735523, + "learning_rate": 3.6810406907019304e-05, + "loss": 0.1458, + "step": 2743 + }, + { + "epoch": 4.36248012718601, + "grad_norm": 2.990889940844389, + "learning_rate": 3.68159213395895e-05, + "loss": 0.2314, + "step": 2744 + }, + { + "epoch": 4.364069952305247, + "grad_norm": 3.150654551382951, + "learning_rate": 3.682143684985503e-05, + "loss": 0.1999, + "step": 2745 + }, + { + "epoch": 4.365659777424483, + "grad_norm": 1.708517664827149, + "learning_rate": 3.682695343594908e-05, + "loss": 0.1889, + "step": 2746 + }, + { + "epoch": 4.36724960254372, + "grad_norm": 2.176965273218404, + "learning_rate": 3.683247109600451e-05, + "loss": 0.1647, + "step": 2747 + }, + { + "epoch": 4.368839427662957, + "grad_norm": 4.126780458342471, + "learning_rate": 3.683798982815377e-05, + "loss": 0.1729, + "step": 2748 + }, + { + "epoch": 4.370429252782194, + "grad_norm": 4.087307192787987, + "learning_rate": 3.684350963052898e-05, + "loss": 0.2913, + "step": 2749 + }, + { + "epoch": 4.372019077901431, + "grad_norm": 3.439228142764807, + "learning_rate": 3.684903050126188e-05, + "loss": 0.1435, + "step": 2750 + }, + { + "epoch": 4.373608903020668, + "grad_norm": 3.430060799474536, + "learning_rate": 3.6854552438483864e-05, + "loss": 0.1444, + "step": 2751 + }, + { + "epoch": 4.375198728139905, + "grad_norm": 46.58806938344657, + "learning_rate": 3.686007544032595e-05, + "loss": 9.7998, + "step": 2752 + }, + { + "epoch": 4.376788553259141, + "grad_norm": 3.701340429498749, + "learning_rate": 3.6865599504918805e-05, + "loss": 0.2045, + "step": 2753 + }, + { + "epoch": 4.378378378378378, + "grad_norm": 3.0624153999447405, + "learning_rate": 3.687112463039274e-05, + "loss": 0.1951, + "step": 2754 + }, + { + "epoch": 4.379968203497615, + "grad_norm": 27.651019430636584, + "learning_rate": 3.6876650814877675e-05, + "loss": 8.646, + "step": 2755 + }, + { + "epoch": 4.381558028616852, + "grad_norm": 2.1058910279558205, + "learning_rate": 3.688217805650323e-05, + "loss": 0.1525, + "step": 2756 + }, + { + "epoch": 4.383147853736089, + "grad_norm": 2.7941184698924144, + "learning_rate": 3.68877063533986e-05, + "loss": 0.187, + "step": 2757 + }, + { + "epoch": 4.384737678855326, + "grad_norm": 3.4307143044270028, + "learning_rate": 3.689323570369268e-05, + "loss": 0.1714, + "step": 2758 + }, + { + "epoch": 4.386327503974563, + "grad_norm": 3.31524555588055, + "learning_rate": 3.6898766105513986e-05, + "loss": 0.2197, + "step": 2759 + }, + { + "epoch": 4.3879173290938, + "grad_norm": 3.4165375880700664, + "learning_rate": 3.690429755699067e-05, + "loss": 0.1773, + "step": 2760 + }, + { + "epoch": 4.389507154213036, + "grad_norm": 4.402656186439025, + "learning_rate": 3.690983005625053e-05, + "loss": 0.2251, + "step": 2761 + }, + { + "epoch": 4.391096979332273, + "grad_norm": 3.791981740612013, + "learning_rate": 3.691536360142102e-05, + "loss": 0.2226, + "step": 2762 + }, + { + "epoch": 4.39268680445151, + "grad_norm": 3.6231411394281934, + "learning_rate": 3.6920898190629246e-05, + "loss": 0.1894, + "step": 2763 + }, + { + "epoch": 4.394276629570747, + "grad_norm": 4.993100673181136, + "learning_rate": 3.6926433822001934e-05, + "loss": 0.1881, + "step": 2764 + }, + { + "epoch": 4.395866454689984, + "grad_norm": 5.380414237960948, + "learning_rate": 3.6931970493665475e-05, + "loss": 0.1548, + "step": 2765 + }, + { + "epoch": 4.397456279809221, + "grad_norm": 3.990439257680693, + "learning_rate": 3.693750820374592e-05, + "loss": 0.1522, + "step": 2766 + }, + { + "epoch": 4.399046104928458, + "grad_norm": 4.704991687147685, + "learning_rate": 3.694304695036894e-05, + "loss": 0.2506, + "step": 2767 + }, + { + "epoch": 4.400635930047695, + "grad_norm": 5.55982771451882, + "learning_rate": 3.6948586731659886e-05, + "loss": 0.2184, + "step": 2768 + }, + { + "epoch": 4.402225755166931, + "grad_norm": 6.272620307366908, + "learning_rate": 3.695412754574372e-05, + "loss": 0.2335, + "step": 2769 + }, + { + "epoch": 4.403815580286168, + "grad_norm": 4.701942122458604, + "learning_rate": 3.69596693907451e-05, + "loss": 0.1914, + "step": 2770 + }, + { + "epoch": 4.405405405405405, + "grad_norm": 1.384871393641088, + "learning_rate": 3.6965212264788296e-05, + "loss": 0.2565, + "step": 2771 + }, + { + "epoch": 4.406995230524642, + "grad_norm": 8.70404537833915, + "learning_rate": 3.697075616599725e-05, + "loss": 0.1757, + "step": 2772 + }, + { + "epoch": 4.408585055643879, + "grad_norm": 3.6852798624386507, + "learning_rate": 3.6976301092495554e-05, + "loss": 0.1431, + "step": 2773 + }, + { + "epoch": 4.410174880763116, + "grad_norm": 6.505368143514953, + "learning_rate": 3.6981847042406455e-05, + "loss": 0.1617, + "step": 2774 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 10.031257241859679, + "learning_rate": 3.698739401385284e-05, + "loss": 0.2817, + "step": 2775 + }, + { + "epoch": 4.413354531001589, + "grad_norm": 2.593628496426279, + "learning_rate": 3.699294200495727e-05, + "loss": 0.1241, + "step": 2776 + }, + { + "epoch": 4.414944356120826, + "grad_norm": 5.087044479908797, + "learning_rate": 3.699849101384195e-05, + "loss": 0.2164, + "step": 2777 + }, + { + "epoch": 4.416534181240063, + "grad_norm": 4.567404460760233, + "learning_rate": 3.7004041038628726e-05, + "loss": 0.1874, + "step": 2778 + }, + { + "epoch": 4.4181240063593, + "grad_norm": 6.364426986989506, + "learning_rate": 3.700959207743914e-05, + "loss": 0.1939, + "step": 2779 + }, + { + "epoch": 4.419713831478537, + "grad_norm": 3.3366653148307, + "learning_rate": 3.701514412839434e-05, + "loss": 0.1527, + "step": 2780 + }, + { + "epoch": 4.421303656597774, + "grad_norm": 6.040785096110123, + "learning_rate": 3.7020697189615184e-05, + "loss": 0.1967, + "step": 2781 + }, + { + "epoch": 4.422893481717011, + "grad_norm": 4.354372018678005, + "learning_rate": 3.702625125922214e-05, + "loss": 0.2341, + "step": 2782 + }, + { + "epoch": 4.424483306836248, + "grad_norm": 5.301465836162514, + "learning_rate": 3.703180633533537e-05, + "loss": 0.2158, + "step": 2783 + }, + { + "epoch": 4.426073131955485, + "grad_norm": 3.2896212507038634, + "learning_rate": 3.703736241607468e-05, + "loss": 0.1952, + "step": 2784 + }, + { + "epoch": 4.4276629570747215, + "grad_norm": 4.629143110218315, + "learning_rate": 3.704291949955953e-05, + "loss": 0.2097, + "step": 2785 + }, + { + "epoch": 4.4292527821939585, + "grad_norm": 4.197135425512117, + "learning_rate": 3.704847758390907e-05, + "loss": 0.1437, + "step": 2786 + }, + { + "epoch": 4.4308426073131955, + "grad_norm": 2.828531380039823, + "learning_rate": 3.705403666724205e-05, + "loss": 0.1629, + "step": 2787 + }, + { + "epoch": 4.4324324324324325, + "grad_norm": 6.023618988066048, + "learning_rate": 3.705959674767696e-05, + "loss": 0.2207, + "step": 2788 + }, + { + "epoch": 4.4340222575516695, + "grad_norm": 3.6184250424935787, + "learning_rate": 3.7065157823331896e-05, + "loss": 0.2019, + "step": 2789 + }, + { + "epoch": 4.4356120826709065, + "grad_norm": 3.6868921122778198, + "learning_rate": 3.707071989232464e-05, + "loss": 0.162, + "step": 2790 + }, + { + "epoch": 4.4372019077901435, + "grad_norm": 4.39189914767115, + "learning_rate": 3.707628295277263e-05, + "loss": 0.197, + "step": 2791 + }, + { + "epoch": 4.43879173290938, + "grad_norm": 4.538233626622601, + "learning_rate": 3.708184700279298e-05, + "loss": 0.1944, + "step": 2792 + }, + { + "epoch": 4.440381558028617, + "grad_norm": 3.956791389701744, + "learning_rate": 3.708741204050245e-05, + "loss": 0.1836, + "step": 2793 + }, + { + "epoch": 4.441971383147854, + "grad_norm": 5.844487852894146, + "learning_rate": 3.7092978064017475e-05, + "loss": 0.3074, + "step": 2794 + }, + { + "epoch": 4.443561208267091, + "grad_norm": 5.362990828820045, + "learning_rate": 3.709854507145417e-05, + "loss": 0.1716, + "step": 2795 + }, + { + "epoch": 4.4451510333863276, + "grad_norm": 4.022579602980249, + "learning_rate": 3.710411306092829e-05, + "loss": 0.1305, + "step": 2796 + }, + { + "epoch": 4.4467408585055646, + "grad_norm": 5.683804537535434, + "learning_rate": 3.710968203055528e-05, + "loss": 0.1918, + "step": 2797 + }, + { + "epoch": 4.4483306836248016, + "grad_norm": 4.883417975304302, + "learning_rate": 3.711525197845026e-05, + "loss": 0.1449, + "step": 2798 + }, + { + "epoch": 4.4499205087440385, + "grad_norm": 3.309917234397699, + "learning_rate": 3.712082290272797e-05, + "loss": 0.1763, + "step": 2799 + }, + { + "epoch": 4.451510333863275, + "grad_norm": 5.185115182954079, + "learning_rate": 3.712639480150288e-05, + "loss": 0.1708, + "step": 2800 + }, + { + "epoch": 4.453100158982512, + "grad_norm": 4.12508687447716, + "learning_rate": 3.71319676728891e-05, + "loss": 0.2181, + "step": 2801 + }, + { + "epoch": 4.454689984101749, + "grad_norm": 34.9179672194163, + "learning_rate": 3.713754151500041e-05, + "loss": 2.5219, + "step": 2802 + }, + { + "epoch": 4.456279809220986, + "grad_norm": 6.049362525678668, + "learning_rate": 3.714311632595027e-05, + "loss": 0.1907, + "step": 2803 + }, + { + "epoch": 4.457869634340223, + "grad_norm": 2.9145074223688723, + "learning_rate": 3.71486921038518e-05, + "loss": 0.1805, + "step": 2804 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 3.8931006091671225, + "learning_rate": 3.715426884681781e-05, + "loss": 0.2147, + "step": 2805 + }, + { + "epoch": 4.461049284578697, + "grad_norm": 6.611337180132665, + "learning_rate": 3.7159846552960776e-05, + "loss": 0.3192, + "step": 2806 + }, + { + "epoch": 4.462639109697934, + "grad_norm": 3.629436247169493, + "learning_rate": 3.716542522039284e-05, + "loss": 0.2102, + "step": 2807 + }, + { + "epoch": 4.46422893481717, + "grad_norm": 2.258745493852795, + "learning_rate": 3.7171004847225825e-05, + "loss": 0.1742, + "step": 2808 + }, + { + "epoch": 4.465818759936407, + "grad_norm": 6.062549778453651, + "learning_rate": 3.717658543157124e-05, + "loss": 0.1641, + "step": 2809 + }, + { + "epoch": 4.467408585055644, + "grad_norm": 4.219589608455295, + "learning_rate": 3.718216697154024e-05, + "loss": 0.2141, + "step": 2810 + }, + { + "epoch": 4.468998410174881, + "grad_norm": 7.534390965453846, + "learning_rate": 3.718774946524369e-05, + "loss": 0.2214, + "step": 2811 + }, + { + "epoch": 4.470588235294118, + "grad_norm": 2.829483610733524, + "learning_rate": 3.719333291079212e-05, + "loss": 0.1485, + "step": 2812 + }, + { + "epoch": 4.472178060413355, + "grad_norm": 3.698981875189589, + "learning_rate": 3.719891730629573e-05, + "loss": 0.1725, + "step": 2813 + }, + { + "epoch": 4.473767885532592, + "grad_norm": 4.505116913402184, + "learning_rate": 3.7204502649864404e-05, + "loss": 0.1973, + "step": 2814 + }, + { + "epoch": 4.475357710651828, + "grad_norm": 3.265320633529519, + "learning_rate": 3.721008893960771e-05, + "loss": 0.2059, + "step": 2815 + }, + { + "epoch": 4.476947535771065, + "grad_norm": 4.5241444493374585, + "learning_rate": 3.721567617363488e-05, + "loss": 0.2121, + "step": 2816 + }, + { + "epoch": 4.478537360890302, + "grad_norm": 3.6111145012584074, + "learning_rate": 3.7221264350054855e-05, + "loss": 0.1342, + "step": 2817 + }, + { + "epoch": 4.480127186009539, + "grad_norm": 3.4718237823325335, + "learning_rate": 3.722685346697622e-05, + "loss": 0.1569, + "step": 2818 + }, + { + "epoch": 4.481717011128776, + "grad_norm": 27.295358827841486, + "learning_rate": 3.723244352250729e-05, + "loss": 35.825, + "step": 2819 + }, + { + "epoch": 4.483306836248013, + "grad_norm": 9.07619442854584, + "learning_rate": 3.723803451475599e-05, + "loss": 0.2175, + "step": 2820 + }, + { + "epoch": 4.48489666136725, + "grad_norm": 5.768878262228427, + "learning_rate": 3.724362644183001e-05, + "loss": 0.2082, + "step": 2821 + }, + { + "epoch": 4.486486486486487, + "grad_norm": 4.031848957655296, + "learning_rate": 3.7249219301836675e-05, + "loss": 0.1783, + "step": 2822 + }, + { + "epoch": 4.488076311605723, + "grad_norm": 7.9223708612477015, + "learning_rate": 3.7254813092882994e-05, + "loss": 0.215, + "step": 2823 + }, + { + "epoch": 4.48966613672496, + "grad_norm": 3.6757952282275834, + "learning_rate": 3.726040781307567e-05, + "loss": 0.1762, + "step": 2824 + }, + { + "epoch": 4.491255961844197, + "grad_norm": 3.0646252721992266, + "learning_rate": 3.726600346052112e-05, + "loss": 0.1736, + "step": 2825 + }, + { + "epoch": 4.492845786963434, + "grad_norm": 3.997769383392965, + "learning_rate": 3.727160003332539e-05, + "loss": 0.1778, + "step": 2826 + }, + { + "epoch": 4.494435612082671, + "grad_norm": 6.709997982591083, + "learning_rate": 3.727719752959426e-05, + "loss": 0.2102, + "step": 2827 + }, + { + "epoch": 4.496025437201908, + "grad_norm": 6.75511785140146, + "learning_rate": 3.7282795947433166e-05, + "loss": 0.3789, + "step": 2828 + }, + { + "epoch": 4.497615262321145, + "grad_norm": 11.963881764755069, + "learning_rate": 3.7288395284947264e-05, + "loss": 0.22, + "step": 2829 + }, + { + "epoch": 4.499205087440382, + "grad_norm": 6.021469160749505, + "learning_rate": 3.7293995540241366e-05, + "loss": 0.1727, + "step": 2830 + }, + { + "epoch": 4.500794912559618, + "grad_norm": 5.126158245069067, + "learning_rate": 3.729959671141999e-05, + "loss": 0.1794, + "step": 2831 + }, + { + "epoch": 4.502384737678855, + "grad_norm": 9.785680090767471, + "learning_rate": 3.7305198796587356e-05, + "loss": 0.2051, + "step": 2832 + }, + { + "epoch": 4.503974562798092, + "grad_norm": 4.6361494902371465, + "learning_rate": 3.731080179384735e-05, + "loss": 0.1835, + "step": 2833 + }, + { + "epoch": 4.505564387917329, + "grad_norm": 3.2293456119664095, + "learning_rate": 3.731640570130355e-05, + "loss": 0.2385, + "step": 2834 + }, + { + "epoch": 4.507154213036566, + "grad_norm": 13.642501089659985, + "learning_rate": 3.7322010517059255e-05, + "loss": 0.3587, + "step": 2835 + }, + { + "epoch": 4.508744038155803, + "grad_norm": 30.60348472399762, + "learning_rate": 3.7327616239217434e-05, + "loss": 11.3504, + "step": 2836 + }, + { + "epoch": 4.51033386327504, + "grad_norm": 3.618786746035887, + "learning_rate": 3.733322286588075e-05, + "loss": 0.3155, + "step": 2837 + }, + { + "epoch": 4.511923688394276, + "grad_norm": 5.416537160564723, + "learning_rate": 3.7338830395151554e-05, + "loss": 0.1898, + "step": 2838 + }, + { + "epoch": 4.513513513513513, + "grad_norm": 2.5996520449174816, + "learning_rate": 3.734443882513192e-05, + "loss": 0.1011, + "step": 2839 + }, + { + "epoch": 4.51510333863275, + "grad_norm": 4.71495042240284, + "learning_rate": 3.735004815392357e-05, + "loss": 0.2611, + "step": 2840 + }, + { + "epoch": 4.516693163751987, + "grad_norm": 66.40559706763618, + "learning_rate": 3.735565837962798e-05, + "loss": 20.057, + "step": 2841 + }, + { + "epoch": 4.518282988871224, + "grad_norm": 3.152306397931738, + "learning_rate": 3.736126950034628e-05, + "loss": 0.2576, + "step": 2842 + }, + { + "epoch": 4.519872813990461, + "grad_norm": 4.989368054441349, + "learning_rate": 3.736688151417929e-05, + "loss": 0.2456, + "step": 2843 + }, + { + "epoch": 4.521462639109698, + "grad_norm": 8.500775974951233, + "learning_rate": 3.737249441922757e-05, + "loss": 0.1583, + "step": 2844 + }, + { + "epoch": 4.523052464228935, + "grad_norm": 4.401315475819665, + "learning_rate": 3.7378108213591354e-05, + "loss": 0.1797, + "step": 2845 + }, + { + "epoch": 4.524642289348172, + "grad_norm": 5.871625193044864, + "learning_rate": 3.738372289537057e-05, + "loss": 0.4668, + "step": 2846 + }, + { + "epoch": 4.526232114467408, + "grad_norm": 4.675678678415982, + "learning_rate": 3.738933846266484e-05, + "loss": 0.236, + "step": 2847 + }, + { + "epoch": 4.527821939586645, + "grad_norm": 3.364556648408934, + "learning_rate": 3.739495491357352e-05, + "loss": 0.2292, + "step": 2848 + }, + { + "epoch": 4.529411764705882, + "grad_norm": 3.5825149594659447, + "learning_rate": 3.740057224619563e-05, + "loss": 0.2048, + "step": 2849 + }, + { + "epoch": 4.531001589825119, + "grad_norm": 4.709106247739551, + "learning_rate": 3.7406190458629906e-05, + "loss": 0.1675, + "step": 2850 + }, + { + "epoch": 4.532591414944356, + "grad_norm": 4.826751422511153, + "learning_rate": 3.741180954897479e-05, + "loss": 0.1672, + "step": 2851 + }, + { + "epoch": 4.534181240063593, + "grad_norm": 2.8660126792764844, + "learning_rate": 3.741742951532843e-05, + "loss": 0.1557, + "step": 2852 + }, + { + "epoch": 4.53577106518283, + "grad_norm": 3.967471914387211, + "learning_rate": 3.742305035578866e-05, + "loss": 0.1639, + "step": 2853 + }, + { + "epoch": 4.537360890302066, + "grad_norm": 5.303662444613524, + "learning_rate": 3.7428672068453035e-05, + "loss": 0.1763, + "step": 2854 + }, + { + "epoch": 4.538950715421303, + "grad_norm": 4.027149475718436, + "learning_rate": 3.743429465141881e-05, + "loss": 0.1796, + "step": 2855 + }, + { + "epoch": 4.54054054054054, + "grad_norm": 2.3258477806024564, + "learning_rate": 3.7439918102782945e-05, + "loss": 0.1723, + "step": 2856 + }, + { + "epoch": 4.542130365659777, + "grad_norm": 4.73420352541784, + "learning_rate": 3.74455424206421e-05, + "loss": 0.1749, + "step": 2857 + }, + { + "epoch": 4.543720190779014, + "grad_norm": 5.815176253798402, + "learning_rate": 3.7451167603092644e-05, + "loss": 0.217, + "step": 2858 + }, + { + "epoch": 4.545310015898251, + "grad_norm": 4.855606317556176, + "learning_rate": 3.745679364823066e-05, + "loss": 0.1705, + "step": 2859 + }, + { + "epoch": 4.546899841017488, + "grad_norm": 4.394428255071976, + "learning_rate": 3.746242055415195e-05, + "loss": 0.137, + "step": 2860 + }, + { + "epoch": 4.548489666136725, + "grad_norm": 3.051067466298107, + "learning_rate": 3.746804831895198e-05, + "loss": 0.1688, + "step": 2861 + }, + { + "epoch": 4.550079491255962, + "grad_norm": 6.478224658482319, + "learning_rate": 3.747367694072599e-05, + "loss": 0.2346, + "step": 2862 + }, + { + "epoch": 4.5516693163751984, + "grad_norm": 2.847955289662053, + "learning_rate": 3.747930641756886e-05, + "loss": 0.1798, + "step": 2863 + }, + { + "epoch": 4.5532591414944354, + "grad_norm": 5.705028785605883, + "learning_rate": 3.748493674757525e-05, + "loss": 0.1844, + "step": 2864 + }, + { + "epoch": 4.5548489666136724, + "grad_norm": 6.481835535104653, + "learning_rate": 3.749056792883948e-05, + "loss": 0.1902, + "step": 2865 + }, + { + "epoch": 4.556438791732909, + "grad_norm": 2.3467054867298853, + "learning_rate": 3.749619995945559e-05, + "loss": 0.1369, + "step": 2866 + }, + { + "epoch": 4.558028616852146, + "grad_norm": 5.061642652221836, + "learning_rate": 3.750183283751736e-05, + "loss": 0.2625, + "step": 2867 + }, + { + "epoch": 4.559618441971383, + "grad_norm": 9.072954499200883, + "learning_rate": 3.750746656111825e-05, + "loss": 1.0663, + "step": 2868 + }, + { + "epoch": 4.56120826709062, + "grad_norm": 2.986979905087573, + "learning_rate": 3.751310112835145e-05, + "loss": 0.1639, + "step": 2869 + }, + { + "epoch": 4.5627980922098565, + "grad_norm": 5.439717361733004, + "learning_rate": 3.751873653730988e-05, + "loss": 0.1607, + "step": 2870 + }, + { + "epoch": 4.5643879173290935, + "grad_norm": 5.297552435329612, + "learning_rate": 3.752437278608615e-05, + "loss": 0.1647, + "step": 2871 + }, + { + "epoch": 4.5659777424483305, + "grad_norm": 3.0939035910055406, + "learning_rate": 3.753000987277257e-05, + "loss": 0.1621, + "step": 2872 + }, + { + "epoch": 4.5675675675675675, + "grad_norm": 4.1145267409351, + "learning_rate": 3.7535647795461226e-05, + "loss": 0.1696, + "step": 2873 + }, + { + "epoch": 4.5691573926868045, + "grad_norm": 5.511622115063244, + "learning_rate": 3.7541286552243866e-05, + "loss": 0.1514, + "step": 2874 + }, + { + "epoch": 4.5707472178060415, + "grad_norm": 4.201066705365509, + "learning_rate": 3.7546926141211975e-05, + "loss": 0.1881, + "step": 2875 + }, + { + "epoch": 4.5723370429252785, + "grad_norm": 2.7474518348050236, + "learning_rate": 3.755256656045676e-05, + "loss": 0.2011, + "step": 2876 + }, + { + "epoch": 4.573926868044515, + "grad_norm": 4.672119909022862, + "learning_rate": 3.755820780806915e-05, + "loss": 0.1287, + "step": 2877 + }, + { + "epoch": 4.575516693163752, + "grad_norm": 6.427234917577963, + "learning_rate": 3.756384988213978e-05, + "loss": 0.2484, + "step": 2878 + }, + { + "epoch": 4.577106518282989, + "grad_norm": 2.6518638705510775, + "learning_rate": 3.756949278075901e-05, + "loss": 0.1753, + "step": 2879 + }, + { + "epoch": 4.578696343402226, + "grad_norm": 4.102056281935237, + "learning_rate": 3.757513650201692e-05, + "loss": 0.1431, + "step": 2880 + }, + { + "epoch": 4.580286168521463, + "grad_norm": 4.50911895348498, + "learning_rate": 3.758078104400333e-05, + "loss": 0.2021, + "step": 2881 + }, + { + "epoch": 4.5818759936407, + "grad_norm": 8.520109212665893, + "learning_rate": 3.7586426404807746e-05, + "loss": 0.3287, + "step": 2882 + }, + { + "epoch": 4.583465818759937, + "grad_norm": 2.621910342230516, + "learning_rate": 3.759207258251944e-05, + "loss": 0.2072, + "step": 2883 + }, + { + "epoch": 4.585055643879174, + "grad_norm": 5.268388231332253, + "learning_rate": 3.759771957522736e-05, + "loss": 0.1596, + "step": 2884 + }, + { + "epoch": 4.586645468998411, + "grad_norm": 2.3716181188864454, + "learning_rate": 3.760336738102023e-05, + "loss": 0.1807, + "step": 2885 + }, + { + "epoch": 4.588235294117647, + "grad_norm": 3.1786401685425427, + "learning_rate": 3.7609015997986456e-05, + "loss": 0.2016, + "step": 2886 + }, + { + "epoch": 4.589825119236884, + "grad_norm": 1.7059595072140057, + "learning_rate": 3.7614665424214193e-05, + "loss": 0.1337, + "step": 2887 + }, + { + "epoch": 4.591414944356121, + "grad_norm": 59.26652420688663, + "learning_rate": 3.76203156577913e-05, + "loss": 16.4848, + "step": 2888 + }, + { + "epoch": 4.593004769475358, + "grad_norm": 3.3265049300937215, + "learning_rate": 3.76259666968054e-05, + "loss": 0.163, + "step": 2889 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 2.11070844646554, + "learning_rate": 3.7631618539343814e-05, + "loss": 0.8586, + "step": 2890 + }, + { + "epoch": 4.596184419713832, + "grad_norm": 2.6287192942695747, + "learning_rate": 3.763727118349359e-05, + "loss": 0.1547, + "step": 2891 + }, + { + "epoch": 4.597774244833069, + "grad_norm": 3.924075880786414, + "learning_rate": 3.764292462734152e-05, + "loss": 0.156, + "step": 2892 + }, + { + "epoch": 4.599364069952305, + "grad_norm": 2.733295188724249, + "learning_rate": 3.76485788689741e-05, + "loss": 0.1621, + "step": 2893 + }, + { + "epoch": 4.600953895071542, + "grad_norm": 3.081777402410622, + "learning_rate": 3.76542339064776e-05, + "loss": 0.2026, + "step": 2894 + }, + { + "epoch": 4.602543720190779, + "grad_norm": 4.99250817482992, + "learning_rate": 3.765988973793798e-05, + "loss": 0.1557, + "step": 2895 + }, + { + "epoch": 4.604133545310016, + "grad_norm": 2.2807979132046046, + "learning_rate": 3.7665546361440945e-05, + "loss": 0.1269, + "step": 2896 + }, + { + "epoch": 4.605723370429253, + "grad_norm": 8.405147682434572, + "learning_rate": 3.767120377507194e-05, + "loss": 0.3099, + "step": 2897 + }, + { + "epoch": 4.60731319554849, + "grad_norm": 3.0530003363190525, + "learning_rate": 3.767686197691613e-05, + "loss": 0.1992, + "step": 2898 + }, + { + "epoch": 4.608903020667727, + "grad_norm": 2.172573133556041, + "learning_rate": 3.7682520965058435e-05, + "loss": 0.1318, + "step": 2899 + }, + { + "epoch": 4.610492845786963, + "grad_norm": 2.6943062677208225, + "learning_rate": 3.768818073758346e-05, + "loss": 0.1428, + "step": 2900 + }, + { + "epoch": 4.6120826709062, + "grad_norm": 2.9187779138871144, + "learning_rate": 3.76938412925756e-05, + "loss": 0.1658, + "step": 2901 + }, + { + "epoch": 4.613672496025437, + "grad_norm": 2.826619002470318, + "learning_rate": 3.7699502628118955e-05, + "loss": 0.1699, + "step": 2902 + }, + { + "epoch": 4.615262321144674, + "grad_norm": 3.3533843818535285, + "learning_rate": 3.770516474229738e-05, + "loss": 0.1767, + "step": 2903 + }, + { + "epoch": 4.616852146263911, + "grad_norm": 2.4722416502378812, + "learning_rate": 3.771082763319443e-05, + "loss": 0.1409, + "step": 2904 + }, + { + "epoch": 4.618441971383148, + "grad_norm": 2.0670050910687046, + "learning_rate": 3.7716491298893444e-05, + "loss": 0.1596, + "step": 2905 + }, + { + "epoch": 4.620031796502385, + "grad_norm": 3.016923117600807, + "learning_rate": 3.772215573747746e-05, + "loss": 0.1449, + "step": 2906 + }, + { + "epoch": 4.621621621621622, + "grad_norm": 1.6640428099149303, + "learning_rate": 3.772782094702929e-05, + "loss": 0.163, + "step": 2907 + }, + { + "epoch": 4.623211446740859, + "grad_norm": 2.8608169340897245, + "learning_rate": 3.7733486925631454e-05, + "loss": 0.1495, + "step": 2908 + }, + { + "epoch": 4.624801271860095, + "grad_norm": 3.0543808279729068, + "learning_rate": 3.773915367136621e-05, + "loss": 0.1467, + "step": 2909 + }, + { + "epoch": 4.626391096979332, + "grad_norm": 2.622726718147098, + "learning_rate": 3.77448211823156e-05, + "loss": 0.1707, + "step": 2910 + }, + { + "epoch": 4.627980922098569, + "grad_norm": 5.576265127092346, + "learning_rate": 3.775048945656135e-05, + "loss": 0.173, + "step": 2911 + }, + { + "epoch": 4.629570747217806, + "grad_norm": 2.7600958364610366, + "learning_rate": 3.775615849218497e-05, + "loss": 0.1624, + "step": 2912 + }, + { + "epoch": 4.631160572337043, + "grad_norm": 5.445072715945005, + "learning_rate": 3.7761828287267685e-05, + "loss": 0.1925, + "step": 2913 + }, + { + "epoch": 4.63275039745628, + "grad_norm": 41.24983257642017, + "learning_rate": 3.776749883989049e-05, + "loss": 11.0079, + "step": 2914 + }, + { + "epoch": 4.634340222575517, + "grad_norm": 3.100296713856521, + "learning_rate": 3.777317014813409e-05, + "loss": 0.1441, + "step": 2915 + }, + { + "epoch": 4.635930047694753, + "grad_norm": 3.574802053971851, + "learning_rate": 3.777884221007897e-05, + "loss": 0.1924, + "step": 2916 + }, + { + "epoch": 4.63751987281399, + "grad_norm": 4.391112410158943, + "learning_rate": 3.7784515023805324e-05, + "loss": 0.1516, + "step": 2917 + }, + { + "epoch": 4.639109697933227, + "grad_norm": 3.779933020332043, + "learning_rate": 3.7790188587393134e-05, + "loss": 0.23, + "step": 2918 + }, + { + "epoch": 4.640699523052464, + "grad_norm": 2.641751129125442, + "learning_rate": 3.7795862898922075e-05, + "loss": 0.1355, + "step": 2919 + }, + { + "epoch": 4.642289348171701, + "grad_norm": 6.048211643722015, + "learning_rate": 3.7801537956471625e-05, + "loss": 0.1961, + "step": 2920 + }, + { + "epoch": 4.643879173290938, + "grad_norm": 2.9238567038790135, + "learning_rate": 3.780721375812097e-05, + "loss": 0.131, + "step": 2921 + }, + { + "epoch": 4.645468998410175, + "grad_norm": 2.8797445111279165, + "learning_rate": 3.781289030194905e-05, + "loss": 0.1714, + "step": 2922 + }, + { + "epoch": 4.647058823529412, + "grad_norm": 4.238923219208015, + "learning_rate": 3.781856758603458e-05, + "loss": 0.2889, + "step": 2923 + }, + { + "epoch": 4.648648648648649, + "grad_norm": 27.17517441629114, + "learning_rate": 3.782424560845598e-05, + "loss": 4.5325, + "step": 2924 + }, + { + "epoch": 4.650238473767885, + "grad_norm": 3.6137526219204528, + "learning_rate": 3.782992436729147e-05, + "loss": 0.1658, + "step": 2925 + }, + { + "epoch": 4.651828298887122, + "grad_norm": 4.800934225703837, + "learning_rate": 3.783560386061897e-05, + "loss": 0.1376, + "step": 2926 + }, + { + "epoch": 4.653418124006359, + "grad_norm": 4.0535697033233, + "learning_rate": 3.78412840865162e-05, + "loss": 0.2865, + "step": 2927 + }, + { + "epoch": 4.655007949125596, + "grad_norm": 4.682326791935431, + "learning_rate": 3.7846965043060597e-05, + "loss": 0.1728, + "step": 2928 + }, + { + "epoch": 4.656597774244833, + "grad_norm": 4.371263340441853, + "learning_rate": 3.7852646728329374e-05, + "loss": 0.2363, + "step": 2929 + }, + { + "epoch": 4.65818759936407, + "grad_norm": 4.9388028511102355, + "learning_rate": 3.785832914039947e-05, + "loss": 0.1924, + "step": 2930 + }, + { + "epoch": 4.659777424483307, + "grad_norm": 3.0128601695484742, + "learning_rate": 3.78640122773476e-05, + "loss": 0.1643, + "step": 2931 + }, + { + "epoch": 4.661367249602543, + "grad_norm": 5.771423900752951, + "learning_rate": 3.786969613725024e-05, + "loss": 1.023, + "step": 2932 + }, + { + "epoch": 4.66295707472178, + "grad_norm": 5.48536869888573, + "learning_rate": 3.7875380718183595e-05, + "loss": 0.2292, + "step": 2933 + }, + { + "epoch": 4.664546899841017, + "grad_norm": 6.098775172864054, + "learning_rate": 3.788106601822364e-05, + "loss": 0.2111, + "step": 2934 + }, + { + "epoch": 4.666136724960254, + "grad_norm": 6.0951413559209255, + "learning_rate": 3.788675203544611e-05, + "loss": 0.1543, + "step": 2935 + }, + { + "epoch": 4.667726550079491, + "grad_norm": 3.744145421572132, + "learning_rate": 3.789243876792651e-05, + "loss": 0.1609, + "step": 2936 + }, + { + "epoch": 4.669316375198728, + "grad_norm": 8.05013352203148, + "learning_rate": 3.7898126213740064e-05, + "loss": 0.1755, + "step": 2937 + }, + { + "epoch": 4.670906200317965, + "grad_norm": 8.15763257841782, + "learning_rate": 3.7903814370961785e-05, + "loss": 0.1515, + "step": 2938 + }, + { + "epoch": 4.672496025437201, + "grad_norm": 4.545912947789513, + "learning_rate": 3.7909503237666435e-05, + "loss": 0.1434, + "step": 2939 + }, + { + "epoch": 4.674085850556438, + "grad_norm": 4.598716644531128, + "learning_rate": 3.791519281192855e-05, + "loss": 0.2647, + "step": 2940 + }, + { + "epoch": 4.675675675675675, + "grad_norm": 9.081038264677316, + "learning_rate": 3.792088309182241e-05, + "loss": 0.2429, + "step": 2941 + }, + { + "epoch": 4.677265500794912, + "grad_norm": 7.360321809767228, + "learning_rate": 3.7926574075422056e-05, + "loss": 0.1485, + "step": 2942 + }, + { + "epoch": 4.678855325914149, + "grad_norm": 5.47606383062219, + "learning_rate": 3.7932265760801294e-05, + "loss": 0.1815, + "step": 2943 + }, + { + "epoch": 4.680445151033386, + "grad_norm": 6.55417122302052, + "learning_rate": 3.7937958146033705e-05, + "loss": 0.1385, + "step": 2944 + }, + { + "epoch": 4.682034976152623, + "grad_norm": 5.404671642146511, + "learning_rate": 3.7943651229192614e-05, + "loss": 0.215, + "step": 2945 + }, + { + "epoch": 4.68362480127186, + "grad_norm": 5.834554209116783, + "learning_rate": 3.7949345008351124e-05, + "loss": 0.1667, + "step": 2946 + }, + { + "epoch": 4.685214626391097, + "grad_norm": 4.115579261924594, + "learning_rate": 3.79550394815821e-05, + "loss": 0.1263, + "step": 2947 + }, + { + "epoch": 4.6868044515103335, + "grad_norm": 3.5753700051751562, + "learning_rate": 3.796073464695816e-05, + "loss": 0.1486, + "step": 2948 + }, + { + "epoch": 4.6883942766295705, + "grad_norm": 4.554149204213057, + "learning_rate": 3.79664305025517e-05, + "loss": 0.1439, + "step": 2949 + }, + { + "epoch": 4.6899841017488075, + "grad_norm": 3.773053705629598, + "learning_rate": 3.7972127046434884e-05, + "loss": 0.178, + "step": 2950 + }, + { + "epoch": 4.6915739268680445, + "grad_norm": 3.6737401041770643, + "learning_rate": 3.797782427667962e-05, + "loss": 0.1618, + "step": 2951 + }, + { + "epoch": 4.6931637519872815, + "grad_norm": 5.608265374853665, + "learning_rate": 3.798352219135763e-05, + "loss": 0.1478, + "step": 2952 + }, + { + "epoch": 4.6947535771065185, + "grad_norm": 7.989345613173626, + "learning_rate": 3.798922078854035e-05, + "loss": 0.4584, + "step": 2953 + }, + { + "epoch": 4.6963434022257555, + "grad_norm": 3.351765565071844, + "learning_rate": 3.7994920066299036e-05, + "loss": 0.1753, + "step": 2954 + }, + { + "epoch": 4.697933227344992, + "grad_norm": 5.728752981867214, + "learning_rate": 3.800062002270467e-05, + "loss": 0.2282, + "step": 2955 + }, + { + "epoch": 4.699523052464229, + "grad_norm": 5.2001883236729345, + "learning_rate": 3.800632065582803e-05, + "loss": 0.1726, + "step": 2956 + }, + { + "epoch": 4.701112877583466, + "grad_norm": 3.663551163561718, + "learning_rate": 3.801202196373966e-05, + "loss": 0.1584, + "step": 2957 + }, + { + "epoch": 4.702702702702703, + "grad_norm": 2.479764929918765, + "learning_rate": 3.801772394450986e-05, + "loss": 0.1554, + "step": 2958 + }, + { + "epoch": 4.70429252782194, + "grad_norm": 6.328522080071939, + "learning_rate": 3.802342659620874e-05, + "loss": 0.1424, + "step": 2959 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 4.903041959057388, + "learning_rate": 3.802912991690614e-05, + "loss": 0.1771, + "step": 2960 + }, + { + "epoch": 4.707472178060414, + "grad_norm": 4.0683432747591315, + "learning_rate": 3.80348339046717e-05, + "loss": 0.1667, + "step": 2961 + }, + { + "epoch": 4.709062003179651, + "grad_norm": 2.5599177866461242, + "learning_rate": 3.8040538557574826e-05, + "loss": 0.1643, + "step": 2962 + }, + { + "epoch": 4.710651828298887, + "grad_norm": 7.731778265898322, + "learning_rate": 3.8046243873684696e-05, + "loss": 0.1248, + "step": 2963 + }, + { + "epoch": 4.712241653418124, + "grad_norm": 3.6230250918108515, + "learning_rate": 3.8051949851070274e-05, + "loss": 0.1382, + "step": 2964 + }, + { + "epoch": 4.713831478537361, + "grad_norm": 4.210084136828786, + "learning_rate": 3.8057656487800284e-05, + "loss": 0.1531, + "step": 2965 + }, + { + "epoch": 4.715421303656598, + "grad_norm": 3.712019543233206, + "learning_rate": 3.806336378194324e-05, + "loss": 0.1677, + "step": 2966 + }, + { + "epoch": 4.717011128775835, + "grad_norm": 3.1880198978075134, + "learning_rate": 3.8069071731567434e-05, + "loss": 0.1601, + "step": 2967 + }, + { + "epoch": 4.718600953895072, + "grad_norm": 4.291405206248979, + "learning_rate": 3.807478033474093e-05, + "loss": 0.196, + "step": 2968 + }, + { + "epoch": 4.720190779014309, + "grad_norm": 2.244007352051777, + "learning_rate": 3.808048958953157e-05, + "loss": 0.1587, + "step": 2969 + }, + { + "epoch": 4.721780604133546, + "grad_norm": 4.877429121242834, + "learning_rate": 3.808619949400697e-05, + "loss": 0.2168, + "step": 2970 + }, + { + "epoch": 4.723370429252782, + "grad_norm": 5.472412880007288, + "learning_rate": 3.8091910046234556e-05, + "loss": 0.1441, + "step": 2971 + }, + { + "epoch": 4.724960254372019, + "grad_norm": 4.6464547110154895, + "learning_rate": 3.809762124428149e-05, + "loss": 0.1817, + "step": 2972 + }, + { + "epoch": 4.726550079491256, + "grad_norm": 5.574834948434259, + "learning_rate": 3.810333308621475e-05, + "loss": 0.2534, + "step": 2973 + }, + { + "epoch": 4.728139904610493, + "grad_norm": 4.698425855483335, + "learning_rate": 3.810904557010109e-05, + "loss": 0.2308, + "step": 2974 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 4.018454706624181, + "learning_rate": 3.811475869400703e-05, + "loss": 0.2559, + "step": 2975 + }, + { + "epoch": 4.731319554848967, + "grad_norm": 4.725735347081218, + "learning_rate": 3.8120472455998885e-05, + "loss": 0.1606, + "step": 2976 + }, + { + "epoch": 4.732909379968204, + "grad_norm": 2.6501809646362466, + "learning_rate": 3.8126186854142755e-05, + "loss": 0.1541, + "step": 2977 + }, + { + "epoch": 4.73449920508744, + "grad_norm": 2.9955998923272373, + "learning_rate": 3.813190188650452e-05, + "loss": 0.1715, + "step": 2978 + }, + { + "epoch": 4.736089030206677, + "grad_norm": 4.183801742787861, + "learning_rate": 3.813761755114987e-05, + "loss": 0.1852, + "step": 2979 + }, + { + "epoch": 4.737678855325914, + "grad_norm": 2.65072119049328, + "learning_rate": 3.814333384614423e-05, + "loss": 0.1775, + "step": 2980 + }, + { + "epoch": 4.739268680445151, + "grad_norm": 1.735938869945464, + "learning_rate": 3.814905076955286e-05, + "loss": 0.2176, + "step": 2981 + }, + { + "epoch": 4.740858505564388, + "grad_norm": 3.3746454204775973, + "learning_rate": 3.815476831944077e-05, + "loss": 0.1666, + "step": 2982 + }, + { + "epoch": 4.742448330683625, + "grad_norm": 3.2335479627086863, + "learning_rate": 3.81604864938728e-05, + "loss": 0.1894, + "step": 2983 + }, + { + "epoch": 4.744038155802862, + "grad_norm": 2.1535029919800794, + "learning_rate": 3.816620529091354e-05, + "loss": 0.1272, + "step": 2984 + }, + { + "epoch": 4.745627980922099, + "grad_norm": 1.6824078861062293, + "learning_rate": 3.817192470862739e-05, + "loss": 0.1408, + "step": 2985 + }, + { + "epoch": 4.747217806041336, + "grad_norm": 4.2899638019314095, + "learning_rate": 3.8177644745078524e-05, + "loss": 0.177, + "step": 2986 + }, + { + "epoch": 4.748807631160572, + "grad_norm": 3.92971756672654, + "learning_rate": 3.8183365398330933e-05, + "loss": 0.204, + "step": 2987 + }, + { + "epoch": 4.750397456279809, + "grad_norm": 2.9381704887726063, + "learning_rate": 3.8189086666448374e-05, + "loss": 0.1961, + "step": 2988 + }, + { + "epoch": 4.751987281399046, + "grad_norm": 4.826389452928204, + "learning_rate": 3.81948085474944e-05, + "loss": 0.4684, + "step": 2989 + }, + { + "epoch": 4.753577106518283, + "grad_norm": 3.1035230630691846, + "learning_rate": 3.820053103953237e-05, + "loss": 0.1969, + "step": 2990 + }, + { + "epoch": 4.75516693163752, + "grad_norm": 3.276407800940697, + "learning_rate": 3.820625414062543e-05, + "loss": 0.242, + "step": 2991 + }, + { + "epoch": 4.756756756756757, + "grad_norm": 3.7809944988819013, + "learning_rate": 3.821197784883651e-05, + "loss": 0.2356, + "step": 2992 + }, + { + "epoch": 4.758346581875994, + "grad_norm": 4.5483131732096185, + "learning_rate": 3.8217702162228335e-05, + "loss": 0.2662, + "step": 2993 + }, + { + "epoch": 4.75993640699523, + "grad_norm": 2.868839161665655, + "learning_rate": 3.822342707886345e-05, + "loss": 0.1543, + "step": 2994 + }, + { + "epoch": 4.761526232114467, + "grad_norm": 2.7140135035763313, + "learning_rate": 3.8229152596804167e-05, + "loss": 0.2482, + "step": 2995 + }, + { + "epoch": 4.763116057233704, + "grad_norm": 3.612947774766078, + "learning_rate": 3.823487871411261e-05, + "loss": 0.2006, + "step": 2996 + }, + { + "epoch": 4.764705882352941, + "grad_norm": 2.517607104243198, + "learning_rate": 3.8240605428850696e-05, + "loss": 0.1862, + "step": 2997 + }, + { + "epoch": 4.766295707472178, + "grad_norm": 1.9601154761340402, + "learning_rate": 3.824633273908013e-05, + "loss": 0.13, + "step": 2998 + }, + { + "epoch": 4.767885532591415, + "grad_norm": 1.8342826909814802, + "learning_rate": 3.8252060642862435e-05, + "loss": 0.1335, + "step": 2999 + }, + { + "epoch": 4.769475357710652, + "grad_norm": 2.7459191557330933, + "learning_rate": 3.825778913825892e-05, + "loss": 0.157, + "step": 3000 + }, + { + "epoch": 4.771065182829888, + "grad_norm": 2.495855523347949, + "learning_rate": 3.82635182233307e-05, + "loss": 0.1464, + "step": 3001 + }, + { + "epoch": 4.772655007949125, + "grad_norm": 2.686792704109436, + "learning_rate": 3.826924789613868e-05, + "loss": 0.1768, + "step": 3002 + }, + { + "epoch": 4.774244833068362, + "grad_norm": 2.5362524970960316, + "learning_rate": 3.827497815474358e-05, + "loss": 0.1864, + "step": 3003 + }, + { + "epoch": 4.775834658187599, + "grad_norm": 4.548583178132155, + "learning_rate": 3.828070899720591e-05, + "loss": 0.1931, + "step": 3004 + }, + { + "epoch": 4.777424483306836, + "grad_norm": 4.182438926166377, + "learning_rate": 3.828644042158598e-05, + "loss": 0.1638, + "step": 3005 + }, + { + "epoch": 4.779014308426073, + "grad_norm": 2.9211668882192607, + "learning_rate": 3.829217242594393e-05, + "loss": 0.1864, + "step": 3006 + }, + { + "epoch": 4.78060413354531, + "grad_norm": 6.190585740945075, + "learning_rate": 3.8297905008339675e-05, + "loss": 0.216, + "step": 3007 + }, + { + "epoch": 4.782193958664547, + "grad_norm": 3.1959291420240206, + "learning_rate": 3.830363816683294e-05, + "loss": 0.1372, + "step": 3008 + }, + { + "epoch": 4.783783783783784, + "grad_norm": 3.5564398692603665, + "learning_rate": 3.8309371899483264e-05, + "loss": 0.2979, + "step": 3009 + }, + { + "epoch": 4.78537360890302, + "grad_norm": 3.744152937116926, + "learning_rate": 3.8315106204349976e-05, + "loss": 0.175, + "step": 3010 + }, + { + "epoch": 4.786963434022257, + "grad_norm": 2.40855839657775, + "learning_rate": 3.832084107949223e-05, + "loss": 0.1902, + "step": 3011 + }, + { + "epoch": 4.788553259141494, + "grad_norm": 4.4067276422411945, + "learning_rate": 3.8326576522968985e-05, + "loss": 0.307, + "step": 3012 + }, + { + "epoch": 4.790143084260731, + "grad_norm": 23.065858315303657, + "learning_rate": 3.8332312532838976e-05, + "loss": 5.4769, + "step": 3013 + }, + { + "epoch": 4.791732909379968, + "grad_norm": 3.239619711022394, + "learning_rate": 3.83380491071608e-05, + "loss": 0.1714, + "step": 3014 + }, + { + "epoch": 4.793322734499205, + "grad_norm": 5.120412285290346, + "learning_rate": 3.834378624399282e-05, + "loss": 0.2041, + "step": 3015 + }, + { + "epoch": 4.794912559618442, + "grad_norm": 4.462799013875103, + "learning_rate": 3.834952394139322e-05, + "loss": 0.1908, + "step": 3016 + }, + { + "epoch": 4.796502384737678, + "grad_norm": 3.750797849260666, + "learning_rate": 3.835526219742001e-05, + "loss": 0.1813, + "step": 3017 + }, + { + "epoch": 4.798092209856915, + "grad_norm": 4.260022621199357, + "learning_rate": 3.836100101013099e-05, + "loss": 0.1859, + "step": 3018 + }, + { + "epoch": 4.799682034976152, + "grad_norm": 4.6083764971251195, + "learning_rate": 3.836674037758378e-05, + "loss": 0.1775, + "step": 3019 + }, + { + "epoch": 4.801271860095389, + "grad_norm": 1.884814548062428, + "learning_rate": 3.837248029783581e-05, + "loss": 0.1228, + "step": 3020 + }, + { + "epoch": 4.802861685214626, + "grad_norm": 5.362004424208794, + "learning_rate": 3.837822076894432e-05, + "loss": 0.2078, + "step": 3021 + }, + { + "epoch": 4.804451510333863, + "grad_norm": 3.1578508127971303, + "learning_rate": 3.8383961788966396e-05, + "loss": 0.2041, + "step": 3022 + }, + { + "epoch": 4.8060413354531, + "grad_norm": 3.217119174595127, + "learning_rate": 3.838970335595887e-05, + "loss": 0.1548, + "step": 3023 + }, + { + "epoch": 4.807631160572337, + "grad_norm": 3.907332624767132, + "learning_rate": 3.839544546797845e-05, + "loss": 0.1814, + "step": 3024 + }, + { + "epoch": 4.809220985691574, + "grad_norm": 2.7174728962230845, + "learning_rate": 3.8401188123081654e-05, + "loss": 0.1604, + "step": 3025 + }, + { + "epoch": 4.8108108108108105, + "grad_norm": 2.8664883378746167, + "learning_rate": 3.840693131932477e-05, + "loss": 0.1298, + "step": 3026 + }, + { + "epoch": 4.8124006359300475, + "grad_norm": 2.877993003538523, + "learning_rate": 3.8412675054763964e-05, + "loss": 0.1999, + "step": 3027 + }, + { + "epoch": 4.8139904610492845, + "grad_norm": 3.4419441334672145, + "learning_rate": 3.841841932745517e-05, + "loss": 0.1571, + "step": 3028 + }, + { + "epoch": 4.8155802861685215, + "grad_norm": 4.6139097584681945, + "learning_rate": 3.842416413545416e-05, + "loss": 0.2102, + "step": 3029 + }, + { + "epoch": 4.8171701112877585, + "grad_norm": 3.3559044315417834, + "learning_rate": 3.842990947681653e-05, + "loss": 0.1908, + "step": 3030 + }, + { + "epoch": 4.8187599364069955, + "grad_norm": 5.3714477350554475, + "learning_rate": 3.8435655349597696e-05, + "loss": 0.1878, + "step": 3031 + }, + { + "epoch": 4.8203497615262325, + "grad_norm": 3.7923853565264705, + "learning_rate": 3.8441401751852875e-05, + "loss": 0.1941, + "step": 3032 + }, + { + "epoch": 4.821939586645469, + "grad_norm": 2.7935698674882072, + "learning_rate": 3.8447148681637124e-05, + "loss": 0.1627, + "step": 3033 + }, + { + "epoch": 4.823529411764706, + "grad_norm": 3.376541887833986, + "learning_rate": 3.845289613700532e-05, + "loss": 0.1856, + "step": 3034 + }, + { + "epoch": 4.825119236883943, + "grad_norm": 4.390988742496754, + "learning_rate": 3.845864411601216e-05, + "loss": 0.1289, + "step": 3035 + }, + { + "epoch": 4.82670906200318, + "grad_norm": 3.8045898092249075, + "learning_rate": 3.846439261671214e-05, + "loss": 0.1819, + "step": 3036 + }, + { + "epoch": 4.828298887122417, + "grad_norm": 2.8590396080800957, + "learning_rate": 3.8470141637159625e-05, + "loss": 0.1422, + "step": 3037 + }, + { + "epoch": 4.829888712241654, + "grad_norm": 25.327219583187013, + "learning_rate": 3.8475891175408764e-05, + "loss": 7.7389, + "step": 3038 + }, + { + "epoch": 4.831478537360891, + "grad_norm": 5.687063206963253, + "learning_rate": 3.848164122951355e-05, + "loss": 0.1876, + "step": 3039 + }, + { + "epoch": 4.833068362480127, + "grad_norm": 4.339890585481548, + "learning_rate": 3.8487391797527804e-05, + "loss": 0.1577, + "step": 3040 + }, + { + "epoch": 4.834658187599364, + "grad_norm": 1.402866518035886, + "learning_rate": 3.8493142877505175e-05, + "loss": 0.1785, + "step": 3041 + }, + { + "epoch": 4.836248012718601, + "grad_norm": 57.02033533889198, + "learning_rate": 3.849889446749911e-05, + "loss": 13.9439, + "step": 3042 + }, + { + "epoch": 4.837837837837838, + "grad_norm": 3.0936122114608797, + "learning_rate": 3.8504646565562906e-05, + "loss": 0.1369, + "step": 3043 + }, + { + "epoch": 4.839427662957075, + "grad_norm": 2.945826960200759, + "learning_rate": 3.8510399169749706e-05, + "loss": 0.1823, + "step": 3044 + }, + { + "epoch": 4.841017488076312, + "grad_norm": 2.511771939012823, + "learning_rate": 3.851615227811244e-05, + "loss": 0.1468, + "step": 3045 + }, + { + "epoch": 4.842607313195549, + "grad_norm": 3.748724025241797, + "learning_rate": 3.8521905888703893e-05, + "loss": 0.1774, + "step": 3046 + }, + { + "epoch": 4.844197138314786, + "grad_norm": 5.397717320548106, + "learning_rate": 3.852765999957669e-05, + "loss": 0.1985, + "step": 3047 + }, + { + "epoch": 4.845786963434023, + "grad_norm": 4.060201377235402, + "learning_rate": 3.8533414608783265e-05, + "loss": 0.2052, + "step": 3048 + }, + { + "epoch": 4.847376788553259, + "grad_norm": 4.528780043025934, + "learning_rate": 3.8539169714375886e-05, + "loss": 0.1826, + "step": 3049 + }, + { + "epoch": 4.848966613672496, + "grad_norm": 3.4507830680495912, + "learning_rate": 3.854492531440666e-05, + "loss": 0.1449, + "step": 3050 + }, + { + "epoch": 4.850556438791733, + "grad_norm": 3.4985331505427797, + "learning_rate": 3.8550681406927535e-05, + "loss": 0.1801, + "step": 3051 + }, + { + "epoch": 4.85214626391097, + "grad_norm": 6.716313893832901, + "learning_rate": 3.8556437989990266e-05, + "loss": 0.1426, + "step": 3052 + }, + { + "epoch": 4.853736089030207, + "grad_norm": 3.933207677646741, + "learning_rate": 3.856219506164647e-05, + "loss": 0.2232, + "step": 3053 + }, + { + "epoch": 4.855325914149444, + "grad_norm": 8.372877894603345, + "learning_rate": 3.856795261994759e-05, + "loss": 0.4122, + "step": 3054 + }, + { + "epoch": 4.856915739268681, + "grad_norm": 8.017840555513521, + "learning_rate": 3.8573710662944885e-05, + "loss": 0.2029, + "step": 3055 + }, + { + "epoch": 4.858505564387917, + "grad_norm": 5.879004983279431, + "learning_rate": 3.857946918868948e-05, + "loss": 0.1584, + "step": 3056 + }, + { + "epoch": 4.860095389507154, + "grad_norm": 5.366191530963387, + "learning_rate": 3.8585228195232313e-05, + "loss": 0.1512, + "step": 3057 + }, + { + "epoch": 4.861685214626391, + "grad_norm": 5.7604413637058025, + "learning_rate": 3.859098768062417e-05, + "loss": 0.1569, + "step": 3058 + }, + { + "epoch": 4.863275039745628, + "grad_norm": 6.526736811339729, + "learning_rate": 3.8596747642915684e-05, + "loss": 0.1842, + "step": 3059 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 5.68490123402592, + "learning_rate": 3.860250808015731e-05, + "loss": 0.2087, + "step": 3060 + }, + { + "epoch": 4.866454689984102, + "grad_norm": 7.932916365313232, + "learning_rate": 3.8608268990399345e-05, + "loss": 0.1992, + "step": 3061 + }, + { + "epoch": 4.868044515103339, + "grad_norm": 5.723877038081454, + "learning_rate": 3.861403037169193e-05, + "loss": 0.1621, + "step": 3062 + }, + { + "epoch": 4.869634340222575, + "grad_norm": 7.5001573825791485, + "learning_rate": 3.8619792222085056e-05, + "loss": 0.1822, + "step": 3063 + }, + { + "epoch": 4.871224165341812, + "grad_norm": 11.666769204610338, + "learning_rate": 3.862555453962854e-05, + "loss": 0.222, + "step": 3064 + }, + { + "epoch": 4.872813990461049, + "grad_norm": 4.057896982215662, + "learning_rate": 3.8631317322372036e-05, + "loss": 0.0982, + "step": 3065 + }, + { + "epoch": 4.874403815580286, + "grad_norm": 5.237446732327668, + "learning_rate": 3.863708056836505e-05, + "loss": 0.1509, + "step": 3066 + }, + { + "epoch": 4.875993640699523, + "grad_norm": 8.848062985536522, + "learning_rate": 3.8642844275656955e-05, + "loss": 0.2099, + "step": 3067 + }, + { + "epoch": 4.87758346581876, + "grad_norm": 5.59100556265263, + "learning_rate": 3.8648608442296925e-05, + "loss": 0.1747, + "step": 3068 + }, + { + "epoch": 4.879173290937997, + "grad_norm": 4.335339671618627, + "learning_rate": 3.8654373066334e-05, + "loss": 0.2023, + "step": 3069 + }, + { + "epoch": 4.880763116057234, + "grad_norm": 6.1660375755880885, + "learning_rate": 3.866013814581708e-05, + "loss": 0.1384, + "step": 3070 + }, + { + "epoch": 4.882352941176471, + "grad_norm": 4.315836463738177, + "learning_rate": 3.866590367879488e-05, + "loss": 0.2015, + "step": 3071 + }, + { + "epoch": 4.883942766295707, + "grad_norm": 51.837744991612645, + "learning_rate": 3.8671669663315966e-05, + "loss": 9.2805, + "step": 3072 + }, + { + "epoch": 4.885532591414944, + "grad_norm": 6.860661464763248, + "learning_rate": 3.867743609742878e-05, + "loss": 0.2658, + "step": 3073 + }, + { + "epoch": 4.887122416534181, + "grad_norm": 3.640286980568797, + "learning_rate": 3.868320297918158e-05, + "loss": 0.186, + "step": 3074 + }, + { + "epoch": 4.888712241653418, + "grad_norm": 8.674872158566979, + "learning_rate": 3.868897030662249e-05, + "loss": 0.2201, + "step": 3075 + }, + { + "epoch": 4.890302066772655, + "grad_norm": 8.636316592995279, + "learning_rate": 3.869473807779948e-05, + "loss": 0.241, + "step": 3076 + }, + { + "epoch": 4.891891891891892, + "grad_norm": 7.652251627907137, + "learning_rate": 3.870050629076037e-05, + "loss": 0.2084, + "step": 3077 + }, + { + "epoch": 4.893481717011129, + "grad_norm": 7.607224868679097, + "learning_rate": 3.8706274943552834e-05, + "loss": 0.2531, + "step": 3078 + }, + { + "epoch": 4.895071542130365, + "grad_norm": 4.213324358795593, + "learning_rate": 3.871204403422437e-05, + "loss": 0.1292, + "step": 3079 + }, + { + "epoch": 4.896661367249602, + "grad_norm": 6.245712388579463, + "learning_rate": 3.871781356082237e-05, + "loss": 0.2433, + "step": 3080 + }, + { + "epoch": 4.898251192368839, + "grad_norm": 5.9606445508352985, + "learning_rate": 3.872358352139405e-05, + "loss": 0.2576, + "step": 3081 + }, + { + "epoch": 4.899841017488076, + "grad_norm": 3.911129096129277, + "learning_rate": 3.87293539139865e-05, + "loss": 0.1654, + "step": 3082 + }, + { + "epoch": 4.901430842607313, + "grad_norm": 6.771520275163338, + "learning_rate": 3.873512473664663e-05, + "loss": 0.2137, + "step": 3083 + }, + { + "epoch": 4.90302066772655, + "grad_norm": 5.304142581286025, + "learning_rate": 3.874089598742123e-05, + "loss": 0.1802, + "step": 3084 + }, + { + "epoch": 4.904610492845787, + "grad_norm": 4.040043617014427, + "learning_rate": 3.874666766435696e-05, + "loss": 0.2137, + "step": 3085 + }, + { + "epoch": 4.906200317965024, + "grad_norm": 7.562522845513537, + "learning_rate": 3.87524397655003e-05, + "loss": 0.2367, + "step": 3086 + }, + { + "epoch": 4.907790143084261, + "grad_norm": 4.213430829014083, + "learning_rate": 3.87582122888976e-05, + "loss": 0.1901, + "step": 3087 + }, + { + "epoch": 4.909379968203497, + "grad_norm": 6.056482399804958, + "learning_rate": 3.8763985232595074e-05, + "loss": 0.2467, + "step": 3088 + }, + { + "epoch": 4.910969793322734, + "grad_norm": 6.4159264473951625, + "learning_rate": 3.8769758594638794e-05, + "loss": 0.1628, + "step": 3089 + }, + { + "epoch": 4.912559618441971, + "grad_norm": 5.777009509157185, + "learning_rate": 3.877553237307468e-05, + "loss": 0.2864, + "step": 3090 + }, + { + "epoch": 4.914149443561208, + "grad_norm": 7.990333782273365, + "learning_rate": 3.8781306565948524e-05, + "loss": 0.1866, + "step": 3091 + }, + { + "epoch": 4.915739268680445, + "grad_norm": 6.5651236602109755, + "learning_rate": 3.878708117130597e-05, + "loss": 0.2128, + "step": 3092 + }, + { + "epoch": 4.917329093799682, + "grad_norm": 9.95734657709305, + "learning_rate": 3.879285618719252e-05, + "loss": 0.1815, + "step": 3093 + }, + { + "epoch": 4.918918918918919, + "grad_norm": 5.1939003965343735, + "learning_rate": 3.879863161165353e-05, + "loss": 0.1626, + "step": 3094 + }, + { + "epoch": 4.920508744038155, + "grad_norm": 5.723299283432753, + "learning_rate": 3.880440744273425e-05, + "loss": 0.1781, + "step": 3095 + }, + { + "epoch": 4.922098569157392, + "grad_norm": 7.218476478290796, + "learning_rate": 3.881018367847975e-05, + "loss": 0.213, + "step": 3096 + }, + { + "epoch": 4.923688394276629, + "grad_norm": 5.426793080187448, + "learning_rate": 3.881596031693499e-05, + "loss": 0.1935, + "step": 3097 + }, + { + "epoch": 4.925278219395866, + "grad_norm": 3.739091091665215, + "learning_rate": 3.88217373561448e-05, + "loss": 0.2606, + "step": 3098 + }, + { + "epoch": 4.926868044515103, + "grad_norm": 10.127601911827314, + "learning_rate": 3.882751479415384e-05, + "loss": 0.1638, + "step": 3099 + }, + { + "epoch": 4.92845786963434, + "grad_norm": 3.848298022275133, + "learning_rate": 3.883329262900667e-05, + "loss": 0.1869, + "step": 3100 + }, + { + "epoch": 4.930047694753577, + "grad_norm": 5.4332112614738985, + "learning_rate": 3.88390708587477e-05, + "loss": 0.1583, + "step": 3101 + }, + { + "epoch": 4.9316375198728135, + "grad_norm": 8.498489618072727, + "learning_rate": 3.88448494814212e-05, + "loss": 0.1447, + "step": 3102 + }, + { + "epoch": 4.9332273449920505, + "grad_norm": 4.13153271477355, + "learning_rate": 3.885062849507133e-05, + "loss": 0.1734, + "step": 3103 + }, + { + "epoch": 4.9348171701112875, + "grad_norm": 4.24989692186795, + "learning_rate": 3.88564078977421e-05, + "loss": 0.1457, + "step": 3104 + }, + { + "epoch": 4.9364069952305245, + "grad_norm": 8.100504599419704, + "learning_rate": 3.8862187687477385e-05, + "loss": 0.1868, + "step": 3105 + }, + { + "epoch": 4.9379968203497615, + "grad_norm": 5.8665941092678215, + "learning_rate": 3.8867967862320934e-05, + "loss": 0.142, + "step": 3106 + }, + { + "epoch": 4.9395866454689985, + "grad_norm": 5.4589548097785965, + "learning_rate": 3.8873748420316374e-05, + "loss": 0.2053, + "step": 3107 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 3.3942593491296678, + "learning_rate": 3.887952935950719e-05, + "loss": 0.1408, + "step": 3108 + }, + { + "epoch": 4.9427662957074725, + "grad_norm": 5.12486787459897, + "learning_rate": 3.888531067793675e-05, + "loss": 0.2025, + "step": 3109 + }, + { + "epoch": 4.9443561208267095, + "grad_norm": 5.2525956519749135, + "learning_rate": 3.889109237364828e-05, + "loss": 0.1913, + "step": 3110 + }, + { + "epoch": 4.945945945945946, + "grad_norm": 2.864038316996026, + "learning_rate": 3.889687444468488e-05, + "loss": 0.1404, + "step": 3111 + }, + { + "epoch": 4.947535771065183, + "grad_norm": 6.797000131628201, + "learning_rate": 3.890265688908955e-05, + "loss": 0.2165, + "step": 3112 + }, + { + "epoch": 4.94912559618442, + "grad_norm": 3.466099705023118, + "learning_rate": 3.8908439704905117e-05, + "loss": 0.2067, + "step": 3113 + }, + { + "epoch": 4.950715421303657, + "grad_norm": 5.277411737435799, + "learning_rate": 3.891422289017433e-05, + "loss": 0.1917, + "step": 3114 + }, + { + "epoch": 4.952305246422894, + "grad_norm": 3.8337225622576536, + "learning_rate": 3.8920006442939776e-05, + "loss": 0.189, + "step": 3115 + }, + { + "epoch": 4.953895071542131, + "grad_norm": 2.5796117173994944, + "learning_rate": 3.892579036124393e-05, + "loss": 0.1863, + "step": 3116 + }, + { + "epoch": 4.955484896661368, + "grad_norm": 2.940643750393657, + "learning_rate": 3.893157464312915e-05, + "loss": 0.2351, + "step": 3117 + }, + { + "epoch": 4.957074721780604, + "grad_norm": 3.8168939822379033, + "learning_rate": 3.893735928663767e-05, + "loss": 0.1579, + "step": 3118 + }, + { + "epoch": 4.958664546899841, + "grad_norm": 3.7655867256903237, + "learning_rate": 3.894314428981159e-05, + "loss": 0.1185, + "step": 3119 + }, + { + "epoch": 4.960254372019078, + "grad_norm": 3.879014527226025, + "learning_rate": 3.89489296506929e-05, + "loss": 0.1563, + "step": 3120 + }, + { + "epoch": 4.961844197138315, + "grad_norm": 3.8334049876077083, + "learning_rate": 3.8954715367323464e-05, + "loss": 0.1989, + "step": 3121 + }, + { + "epoch": 4.963434022257552, + "grad_norm": 95.99177881105098, + "learning_rate": 3.896050143774503e-05, + "loss": 7.1704, + "step": 3122 + }, + { + "epoch": 4.965023847376789, + "grad_norm": 4.616689773430598, + "learning_rate": 3.896628785999922e-05, + "loss": 0.1759, + "step": 3123 + }, + { + "epoch": 4.966613672496026, + "grad_norm": 3.1736841179069137, + "learning_rate": 3.897207463212753e-05, + "loss": 0.192, + "step": 3124 + }, + { + "epoch": 4.968203497615263, + "grad_norm": 5.365203475002759, + "learning_rate": 3.897786175217137e-05, + "loss": 0.1588, + "step": 3125 + }, + { + "epoch": 4.9697933227345, + "grad_norm": 5.527596614855162, + "learning_rate": 3.898364921817199e-05, + "loss": 0.2012, + "step": 3126 + }, + { + "epoch": 4.971383147853736, + "grad_norm": 3.327038316209095, + "learning_rate": 3.898943702817054e-05, + "loss": 0.2234, + "step": 3127 + }, + { + "epoch": 4.972972972972973, + "grad_norm": 6.278150492275312, + "learning_rate": 3.899522518020807e-05, + "loss": 0.2227, + "step": 3128 + }, + { + "epoch": 4.97456279809221, + "grad_norm": 4.4099050207338735, + "learning_rate": 3.900101367232549e-05, + "loss": 0.2306, + "step": 3129 + }, + { + "epoch": 4.976152623211447, + "grad_norm": 32.832611074731645, + "learning_rate": 3.900680250256361e-05, + "loss": 3.6093, + "step": 3130 + }, + { + "epoch": 4.977742448330684, + "grad_norm": 4.3220987737851155, + "learning_rate": 3.9012591668963124e-05, + "loss": 0.1678, + "step": 3131 + }, + { + "epoch": 4.979332273449921, + "grad_norm": 3.7522681148538624, + "learning_rate": 3.90183811695646e-05, + "loss": 0.1821, + "step": 3132 + }, + { + "epoch": 4.980922098569158, + "grad_norm": 3.364546562516134, + "learning_rate": 3.9024171002408507e-05, + "loss": 0.2195, + "step": 3133 + }, + { + "epoch": 4.982511923688394, + "grad_norm": 3.0099732441028775, + "learning_rate": 3.902996116553519e-05, + "loss": 0.1391, + "step": 3134 + }, + { + "epoch": 4.984101748807631, + "grad_norm": 3.6622506682196216, + "learning_rate": 3.9035751656984906e-05, + "loss": 0.1804, + "step": 3135 + }, + { + "epoch": 4.985691573926868, + "grad_norm": 4.461055451995872, + "learning_rate": 3.904154247479776e-05, + "loss": 0.2082, + "step": 3136 + }, + { + "epoch": 4.987281399046105, + "grad_norm": 5.065649779144699, + "learning_rate": 3.904733361701378e-05, + "loss": 0.1792, + "step": 3137 + }, + { + "epoch": 4.988871224165342, + "grad_norm": 4.688758338761301, + "learning_rate": 3.9053125081672884e-05, + "loss": 0.2159, + "step": 3138 + }, + { + "epoch": 4.990461049284579, + "grad_norm": 36.94888357646705, + "learning_rate": 3.905891686681486e-05, + "loss": 0.7229, + "step": 3139 + }, + { + "epoch": 4.992050874403816, + "grad_norm": 3.4979988817246075, + "learning_rate": 3.9064708970479394e-05, + "loss": 0.1593, + "step": 3140 + }, + { + "epoch": 4.993640699523052, + "grad_norm": 3.836601798619348, + "learning_rate": 3.907050139070608e-05, + "loss": 0.1366, + "step": 3141 + }, + { + "epoch": 4.995230524642289, + "grad_norm": 7.737929992375876, + "learning_rate": 3.907629412553438e-05, + "loss": 0.2417, + "step": 3142 + }, + { + "epoch": 4.996820349761526, + "grad_norm": 4.245040692056753, + "learning_rate": 3.908208717300368e-05, + "loss": 0.1997, + "step": 3143 + }, + { + "epoch": 4.998410174880763, + "grad_norm": 4.493782036073353, + "learning_rate": 3.908788053115324e-05, + "loss": 0.2497, + "step": 3144 + }, + { + "epoch": 5.0, + "grad_norm": 3.813783244571172, + "learning_rate": 3.9093674198022205e-05, + "loss": 0.2005, + "step": 3145 + }, + { + "epoch": 5.001589825119237, + "grad_norm": 3.1417976810739967, + "learning_rate": 3.909946817164963e-05, + "loss": 0.1914, + "step": 3146 + }, + { + "epoch": 5.003179650238474, + "grad_norm": 3.9449831084003137, + "learning_rate": 3.9105262450074476e-05, + "loss": 0.174, + "step": 3147 + }, + { + "epoch": 5.004769475357711, + "grad_norm": 6.406942478942772, + "learning_rate": 3.9111057031335585e-05, + "loss": 0.2943, + "step": 3148 + }, + { + "epoch": 5.006359300476947, + "grad_norm": 6.309121861947579, + "learning_rate": 3.91168519134717e-05, + "loss": 0.1483, + "step": 3149 + }, + { + "epoch": 5.007949125596184, + "grad_norm": 6.913258123726462, + "learning_rate": 3.912264709452147e-05, + "loss": 0.1531, + "step": 3150 + }, + { + "epoch": 5.009538950715421, + "grad_norm": 5.1484398513077565, + "learning_rate": 3.912844257252342e-05, + "loss": 0.1614, + "step": 3151 + }, + { + "epoch": 5.011128775834658, + "grad_norm": 5.152002639415075, + "learning_rate": 3.913423834551601e-05, + "loss": 0.2535, + "step": 3152 + }, + { + "epoch": 5.012718600953895, + "grad_norm": 6.753549498925175, + "learning_rate": 3.914003441153756e-05, + "loss": 0.2111, + "step": 3153 + }, + { + "epoch": 5.014308426073132, + "grad_norm": 6.631188047880723, + "learning_rate": 3.914583076862632e-05, + "loss": 0.2029, + "step": 3154 + }, + { + "epoch": 5.015898251192369, + "grad_norm": 4.47973446001777, + "learning_rate": 3.915162741482045e-05, + "loss": 0.1836, + "step": 3155 + }, + { + "epoch": 5.017488076311606, + "grad_norm": 4.821564486769781, + "learning_rate": 3.915742434815797e-05, + "loss": 0.1872, + "step": 3156 + }, + { + "epoch": 5.019077901430842, + "grad_norm": 4.236725563156107, + "learning_rate": 3.916322156667684e-05, + "loss": 0.2052, + "step": 3157 + }, + { + "epoch": 5.020667726550079, + "grad_norm": 6.193891378671041, + "learning_rate": 3.9169019068414915e-05, + "loss": 0.1834, + "step": 3158 + }, + { + "epoch": 5.022257551669316, + "grad_norm": 4.206869567992898, + "learning_rate": 3.9174816851409946e-05, + "loss": 0.151, + "step": 3159 + }, + { + "epoch": 5.023847376788553, + "grad_norm": 4.201639270549849, + "learning_rate": 3.918061491369959e-05, + "loss": 0.1732, + "step": 3160 + }, + { + "epoch": 5.02543720190779, + "grad_norm": 9.223174267708767, + "learning_rate": 3.9186413253321415e-05, + "loss": 0.1554, + "step": 3161 + }, + { + "epoch": 5.027027027027027, + "grad_norm": 3.951693125178095, + "learning_rate": 3.91922118683129e-05, + "loss": 0.1459, + "step": 3162 + }, + { + "epoch": 5.028616852146264, + "grad_norm": 5.464513987640649, + "learning_rate": 3.919801075671141e-05, + "loss": 0.155, + "step": 3163 + }, + { + "epoch": 5.030206677265501, + "grad_norm": 4.296348378630646, + "learning_rate": 3.9203809916554244e-05, + "loss": 0.1718, + "step": 3164 + }, + { + "epoch": 5.031796502384737, + "grad_norm": 6.216783349367615, + "learning_rate": 3.920960934587859e-05, + "loss": 0.1661, + "step": 3165 + }, + { + "epoch": 5.033386327503974, + "grad_norm": 5.746216944526852, + "learning_rate": 3.921540904272155e-05, + "loss": 0.1482, + "step": 3166 + }, + { + "epoch": 5.034976152623211, + "grad_norm": 6.646989962893031, + "learning_rate": 3.922120900512014e-05, + "loss": 0.2222, + "step": 3167 + }, + { + "epoch": 5.036565977742448, + "grad_norm": 4.274581344410608, + "learning_rate": 3.9227009231111287e-05, + "loss": 0.1962, + "step": 3168 + }, + { + "epoch": 5.038155802861685, + "grad_norm": 5.526619859866819, + "learning_rate": 3.9232809718731816e-05, + "loss": 0.1538, + "step": 3169 + }, + { + "epoch": 5.039745627980922, + "grad_norm": 3.466969258873424, + "learning_rate": 3.9238610466018474e-05, + "loss": 0.188, + "step": 3170 + }, + { + "epoch": 5.041335453100159, + "grad_norm": 4.678882367267456, + "learning_rate": 3.924441147100792e-05, + "loss": 0.1577, + "step": 3171 + }, + { + "epoch": 5.042925278219396, + "grad_norm": 6.234370958066066, + "learning_rate": 3.9250212731736725e-05, + "loss": 0.1545, + "step": 3172 + }, + { + "epoch": 5.044515103338632, + "grad_norm": 6.22621475238784, + "learning_rate": 3.9256014246241365e-05, + "loss": 0.2901, + "step": 3173 + }, + { + "epoch": 5.046104928457869, + "grad_norm": 2.930700640998104, + "learning_rate": 3.9261816012558254e-05, + "loss": 0.2099, + "step": 3174 + }, + { + "epoch": 5.047694753577106, + "grad_norm": 5.456786983899228, + "learning_rate": 3.9267618028723686e-05, + "loss": 0.1296, + "step": 3175 + }, + { + "epoch": 5.049284578696343, + "grad_norm": 3.3555529633921, + "learning_rate": 3.927342029277389e-05, + "loss": 0.2012, + "step": 3176 + }, + { + "epoch": 5.05087440381558, + "grad_norm": 3.851414442594951, + "learning_rate": 3.9279222802745025e-05, + "loss": 0.1742, + "step": 3177 + }, + { + "epoch": 5.052464228934817, + "grad_norm": 8.304943650481555, + "learning_rate": 3.928502555667314e-05, + "loss": 0.184, + "step": 3178 + }, + { + "epoch": 5.054054054054054, + "grad_norm": 2.458337897465749, + "learning_rate": 3.9290828552594215e-05, + "loss": 0.215, + "step": 3179 + }, + { + "epoch": 5.0556438791732905, + "grad_norm": 8.75973173521465, + "learning_rate": 3.929663178854415e-05, + "loss": 0.2166, + "step": 3180 + }, + { + "epoch": 5.0572337042925275, + "grad_norm": 8.031836035969858, + "learning_rate": 3.9302435262558754e-05, + "loss": 0.1975, + "step": 3181 + }, + { + "epoch": 5.0588235294117645, + "grad_norm": 6.718383278025328, + "learning_rate": 3.930823897267376e-05, + "loss": 0.171, + "step": 3182 + }, + { + "epoch": 5.0604133545310015, + "grad_norm": 7.544222715371571, + "learning_rate": 3.931404291692482e-05, + "loss": 0.1957, + "step": 3183 + }, + { + "epoch": 5.0620031796502385, + "grad_norm": 4.207459218002939, + "learning_rate": 3.931984709334752e-05, + "loss": 0.1771, + "step": 3184 + }, + { + "epoch": 5.0635930047694755, + "grad_norm": 5.2824820028364545, + "learning_rate": 3.9325651499977346e-05, + "loss": 0.1335, + "step": 3185 + }, + { + "epoch": 5.0651828298887125, + "grad_norm": 4.981247974779617, + "learning_rate": 3.933145613484973e-05, + "loss": 0.1212, + "step": 3186 + }, + { + "epoch": 5.0667726550079495, + "grad_norm": 6.804385146422567, + "learning_rate": 3.9337260996e-05, + "loss": 0.2551, + "step": 3187 + }, + { + "epoch": 5.068362480127186, + "grad_norm": 5.870305467893209, + "learning_rate": 3.934306608146343e-05, + "loss": 0.182, + "step": 3188 + }, + { + "epoch": 5.069952305246423, + "grad_norm": 3.783745988016876, + "learning_rate": 3.934887138927519e-05, + "loss": 0.1281, + "step": 3189 + }, + { + "epoch": 5.07154213036566, + "grad_norm": 5.4158916666554395, + "learning_rate": 3.935467691747042e-05, + "loss": 0.1636, + "step": 3190 + }, + { + "epoch": 5.073131955484897, + "grad_norm": 5.4575347927256415, + "learning_rate": 3.936048266408415e-05, + "loss": 0.2357, + "step": 3191 + }, + { + "epoch": 5.074721780604134, + "grad_norm": 1.9378795006277796, + "learning_rate": 3.936628862715133e-05, + "loss": 0.1402, + "step": 3192 + }, + { + "epoch": 5.076311605723371, + "grad_norm": 4.030018064462303, + "learning_rate": 3.9372094804706866e-05, + "loss": 0.1065, + "step": 3193 + }, + { + "epoch": 5.077901430842608, + "grad_norm": 16.864532344243376, + "learning_rate": 3.937790119478558e-05, + "loss": 2.0799, + "step": 3194 + }, + { + "epoch": 5.079491255961845, + "grad_norm": 2.1652832401350377, + "learning_rate": 3.9383707795422206e-05, + "loss": 0.1495, + "step": 3195 + }, + { + "epoch": 5.081081081081081, + "grad_norm": 6.010736434788803, + "learning_rate": 3.938951460465143e-05, + "loss": 0.1946, + "step": 3196 + }, + { + "epoch": 5.082670906200318, + "grad_norm": 4.6598299262982845, + "learning_rate": 3.939532162050786e-05, + "loss": 0.1434, + "step": 3197 + }, + { + "epoch": 5.084260731319555, + "grad_norm": 4.116653593391845, + "learning_rate": 3.940112884102602e-05, + "loss": 0.1846, + "step": 3198 + }, + { + "epoch": 5.085850556438792, + "grad_norm": 5.347967583013869, + "learning_rate": 3.940693626424038e-05, + "loss": 0.149, + "step": 3199 + }, + { + "epoch": 5.087440381558029, + "grad_norm": 4.141401834558379, + "learning_rate": 3.9412743888185346e-05, + "loss": 0.1861, + "step": 3200 + }, + { + "epoch": 5.089030206677266, + "grad_norm": 3.134515148826531, + "learning_rate": 3.9418551710895245e-05, + "loss": 0.1369, + "step": 3201 + }, + { + "epoch": 5.090620031796503, + "grad_norm": 3.342417052993152, + "learning_rate": 3.9424359730404326e-05, + "loss": 0.1721, + "step": 3202 + }, + { + "epoch": 5.09220985691574, + "grad_norm": 4.2037465630375666, + "learning_rate": 3.943016794474681e-05, + "loss": 0.1365, + "step": 3203 + }, + { + "epoch": 5.093799682034976, + "grad_norm": 3.4948477781928817, + "learning_rate": 3.943597635195679e-05, + "loss": 0.1631, + "step": 3204 + }, + { + "epoch": 5.095389507154213, + "grad_norm": 5.418980214654038, + "learning_rate": 3.944178495006837e-05, + "loss": 0.1665, + "step": 3205 + }, + { + "epoch": 5.09697933227345, + "grad_norm": 6.748148422544536, + "learning_rate": 3.944759373711552e-05, + "loss": 0.1442, + "step": 3206 + }, + { + "epoch": 5.098569157392687, + "grad_norm": 2.7301299987849483, + "learning_rate": 3.9453402711132186e-05, + "loss": 0.1858, + "step": 3207 + }, + { + "epoch": 5.100158982511924, + "grad_norm": 5.289910021179296, + "learning_rate": 3.945921187015225e-05, + "loss": 0.1651, + "step": 3208 + }, + { + "epoch": 5.101748807631161, + "grad_norm": 3.9987481519172365, + "learning_rate": 3.9465021212209516e-05, + "loss": 0.1734, + "step": 3209 + }, + { + "epoch": 5.103338632750398, + "grad_norm": 8.694527939580983, + "learning_rate": 3.947083073533772e-05, + "loss": 0.1702, + "step": 3210 + }, + { + "epoch": 5.104928457869635, + "grad_norm": 4.200271018200816, + "learning_rate": 3.9476640437570556e-05, + "loss": 0.1762, + "step": 3211 + }, + { + "epoch": 5.106518282988871, + "grad_norm": 5.000848684418953, + "learning_rate": 3.948245031694167e-05, + "loss": 0.18, + "step": 3212 + }, + { + "epoch": 5.108108108108108, + "grad_norm": 3.4518084572134162, + "learning_rate": 3.94882603714846e-05, + "loss": 0.1451, + "step": 3213 + }, + { + "epoch": 5.109697933227345, + "grad_norm": 6.324116123340721, + "learning_rate": 3.9494070599232865e-05, + "loss": 0.2063, + "step": 3214 + }, + { + "epoch": 5.111287758346582, + "grad_norm": 9.26886097780332, + "learning_rate": 3.9499880998219915e-05, + "loss": 0.6387, + "step": 3215 + }, + { + "epoch": 5.112877583465819, + "grad_norm": 3.334544865492814, + "learning_rate": 3.950569156647914e-05, + "loss": 0.2308, + "step": 3216 + }, + { + "epoch": 5.114467408585056, + "grad_norm": 6.556957324346155, + "learning_rate": 3.9511502302043866e-05, + "loss": 0.1696, + "step": 3217 + }, + { + "epoch": 5.116057233704293, + "grad_norm": 3.543963033174834, + "learning_rate": 3.951731320294738e-05, + "loss": 0.2287, + "step": 3218 + }, + { + "epoch": 5.117647058823529, + "grad_norm": 145.31802865354538, + "learning_rate": 3.9523124267222896e-05, + "loss": 19.6723, + "step": 3219 + }, + { + "epoch": 5.119236883942766, + "grad_norm": 10.808288613485125, + "learning_rate": 3.952893549290357e-05, + "loss": 0.2665, + "step": 3220 + }, + { + "epoch": 5.120826709062003, + "grad_norm": 4.531459823519616, + "learning_rate": 3.9534746878022534e-05, + "loss": 0.1729, + "step": 3221 + }, + { + "epoch": 5.12241653418124, + "grad_norm": 5.816490601092628, + "learning_rate": 3.9540558420612835e-05, + "loss": 0.2112, + "step": 3222 + }, + { + "epoch": 5.124006359300477, + "grad_norm": 6.345665738840117, + "learning_rate": 3.954637011870746e-05, + "loss": 0.2446, + "step": 3223 + }, + { + "epoch": 5.125596184419714, + "grad_norm": 7.331748459175939, + "learning_rate": 3.955218197033939e-05, + "loss": 0.2145, + "step": 3224 + }, + { + "epoch": 5.127186009538951, + "grad_norm": 5.652738366933304, + "learning_rate": 3.9557993973541496e-05, + "loss": 0.2006, + "step": 3225 + }, + { + "epoch": 5.128775834658188, + "grad_norm": 11.039973620648666, + "learning_rate": 3.9563806126346645e-05, + "loss": 0.2168, + "step": 3226 + }, + { + "epoch": 5.130365659777424, + "grad_norm": 7.40845146039602, + "learning_rate": 3.956961842678762e-05, + "loss": 0.3478, + "step": 3227 + }, + { + "epoch": 5.131955484896661, + "grad_norm": 10.789564130931259, + "learning_rate": 3.9575430872897176e-05, + "loss": 0.1828, + "step": 3228 + }, + { + "epoch": 5.133545310015898, + "grad_norm": 9.06250493324314, + "learning_rate": 3.958124346270801e-05, + "loss": 0.1507, + "step": 3229 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 5.2972303558972635, + "learning_rate": 3.958705619425276e-05, + "loss": 0.1879, + "step": 3230 + }, + { + "epoch": 5.136724960254372, + "grad_norm": 7.993466568212637, + "learning_rate": 3.9592869065564043e-05, + "loss": 0.2312, + "step": 3231 + }, + { + "epoch": 5.138314785373609, + "grad_norm": 8.578904337817681, + "learning_rate": 3.9598682074674406e-05, + "loss": 0.2132, + "step": 3232 + }, + { + "epoch": 5.139904610492846, + "grad_norm": 5.414221216728222, + "learning_rate": 3.960449521961635e-05, + "loss": 0.1625, + "step": 3233 + }, + { + "epoch": 5.141494435612083, + "grad_norm": 8.939647641844699, + "learning_rate": 3.9610308498422346e-05, + "loss": 0.19, + "step": 3234 + }, + { + "epoch": 5.143084260731319, + "grad_norm": 4092.262599592264, + "learning_rate": 3.9616121909124805e-05, + "loss": 0.8037, + "step": 3235 + }, + { + "epoch": 5.144674085850556, + "grad_norm": 7.23206547269352, + "learning_rate": 3.962193544975609e-05, + "loss": 0.2043, + "step": 3236 + }, + { + "epoch": 5.146263910969793, + "grad_norm": 8.93656683745644, + "learning_rate": 3.962774911834854e-05, + "loss": 0.1126, + "step": 3237 + }, + { + "epoch": 5.14785373608903, + "grad_norm": 4.323413224009858, + "learning_rate": 3.963356291293444e-05, + "loss": 0.1965, + "step": 3238 + }, + { + "epoch": 5.149443561208267, + "grad_norm": 5.428463932826973, + "learning_rate": 3.963937683154602e-05, + "loss": 0.1433, + "step": 3239 + }, + { + "epoch": 5.151033386327504, + "grad_norm": 4.449657009795377, + "learning_rate": 3.9645190872215485e-05, + "loss": 0.1661, + "step": 3240 + }, + { + "epoch": 5.152623211446741, + "grad_norm": 5.787103594564749, + "learning_rate": 3.9651005032975e-05, + "loss": 0.1817, + "step": 3241 + }, + { + "epoch": 5.154213036565977, + "grad_norm": 6.983731210632461, + "learning_rate": 3.9656819311856655e-05, + "loss": 0.2322, + "step": 3242 + }, + { + "epoch": 5.155802861685214, + "grad_norm": 6.836654150500689, + "learning_rate": 3.9662633706892565e-05, + "loss": 0.2177, + "step": 3243 + }, + { + "epoch": 5.157392686804451, + "grad_norm": 6.553532738540634, + "learning_rate": 3.9668448216114736e-05, + "loss": 0.2438, + "step": 3244 + }, + { + "epoch": 5.158982511923688, + "grad_norm": 4.597066789152632, + "learning_rate": 3.967426283755519e-05, + "loss": 0.1506, + "step": 3245 + }, + { + "epoch": 5.160572337042925, + "grad_norm": 3.9257205352117843, + "learning_rate": 3.968007756924587e-05, + "loss": 0.174, + "step": 3246 + }, + { + "epoch": 5.162162162162162, + "grad_norm": 5.039360461836179, + "learning_rate": 3.968589240921872e-05, + "loss": 0.1864, + "step": 3247 + }, + { + "epoch": 5.163751987281399, + "grad_norm": 3.2656331476213194, + "learning_rate": 3.969170735550561e-05, + "loss": 0.1405, + "step": 3248 + }, + { + "epoch": 5.165341812400636, + "grad_norm": 5.186387579253642, + "learning_rate": 3.969752240613839e-05, + "loss": 0.3296, + "step": 3249 + }, + { + "epoch": 5.166931637519872, + "grad_norm": 7.761758220008351, + "learning_rate": 3.970333755914889e-05, + "loss": 0.2316, + "step": 3250 + }, + { + "epoch": 5.168521462639109, + "grad_norm": 2.947448626730211, + "learning_rate": 3.970915281256889e-05, + "loss": 0.1702, + "step": 3251 + }, + { + "epoch": 5.170111287758346, + "grad_norm": 4.6527327075401566, + "learning_rate": 3.971496816443012e-05, + "loss": 0.1717, + "step": 3252 + }, + { + "epoch": 5.171701112877583, + "grad_norm": 7.074653775487216, + "learning_rate": 3.9720783612764316e-05, + "loss": 0.1162, + "step": 3253 + }, + { + "epoch": 5.17329093799682, + "grad_norm": 4.435167150520846, + "learning_rate": 3.972659915560314e-05, + "loss": 0.2195, + "step": 3254 + }, + { + "epoch": 5.174880763116057, + "grad_norm": 4.245252395138678, + "learning_rate": 3.9732414790978256e-05, + "loss": 0.1491, + "step": 3255 + }, + { + "epoch": 5.176470588235294, + "grad_norm": 6.579511092477509, + "learning_rate": 3.9738230516921264e-05, + "loss": 0.8204, + "step": 3256 + }, + { + "epoch": 5.178060413354531, + "grad_norm": 2.638582405072483, + "learning_rate": 3.974404633146378e-05, + "loss": 0.1757, + "step": 3257 + }, + { + "epoch": 5.1796502384737675, + "grad_norm": 3.423092395941031, + "learning_rate": 3.974986223263734e-05, + "loss": 0.1567, + "step": 3258 + }, + { + "epoch": 5.1812400635930045, + "grad_norm": 5.188173294617301, + "learning_rate": 3.975567821847347e-05, + "loss": 0.1773, + "step": 3259 + }, + { + "epoch": 5.1828298887122415, + "grad_norm": 1.897788890273969, + "learning_rate": 3.9761494287003676e-05, + "loss": 0.2289, + "step": 3260 + }, + { + "epoch": 5.1844197138314785, + "grad_norm": 4.475780906306349, + "learning_rate": 3.976731043625944e-05, + "loss": 0.1919, + "step": 3261 + }, + { + "epoch": 5.1860095389507155, + "grad_norm": 3.9359197141331075, + "learning_rate": 3.977312666427219e-05, + "loss": 0.144, + "step": 3262 + }, + { + "epoch": 5.1875993640699525, + "grad_norm": 2.269791581924391, + "learning_rate": 3.977894296907335e-05, + "loss": 0.1865, + "step": 3263 + }, + { + "epoch": 5.1891891891891895, + "grad_norm": 2.4280072915130173, + "learning_rate": 3.9784759348694306e-05, + "loss": 0.1425, + "step": 3264 + }, + { + "epoch": 5.1907790143084265, + "grad_norm": 2.8096177936195104, + "learning_rate": 3.979057580116643e-05, + "loss": 0.1904, + "step": 3265 + }, + { + "epoch": 5.192368839427663, + "grad_norm": 4.409887184584052, + "learning_rate": 3.9796392324521065e-05, + "loss": 0.1311, + "step": 3266 + }, + { + "epoch": 5.1939586645469, + "grad_norm": 3.4016637983221583, + "learning_rate": 3.9802208916789524e-05, + "loss": 0.1331, + "step": 3267 + }, + { + "epoch": 5.195548489666137, + "grad_norm": 4.769797843588128, + "learning_rate": 3.98080255760031e-05, + "loss": 0.1388, + "step": 3268 + }, + { + "epoch": 5.197138314785374, + "grad_norm": 40.81795972676857, + "learning_rate": 3.9813842300193074e-05, + "loss": 4.4096, + "step": 3269 + }, + { + "epoch": 5.198728139904611, + "grad_norm": 2.825967566402579, + "learning_rate": 3.981965908739068e-05, + "loss": 0.1179, + "step": 3270 + }, + { + "epoch": 5.200317965023848, + "grad_norm": 4.762896386637612, + "learning_rate": 3.9825475935627164e-05, + "loss": 0.1797, + "step": 3271 + }, + { + "epoch": 5.201907790143085, + "grad_norm": 3.7688040914293, + "learning_rate": 3.983129284293372e-05, + "loss": 0.1727, + "step": 3272 + }, + { + "epoch": 5.203497615262322, + "grad_norm": 18.756918820172697, + "learning_rate": 3.983710980734154e-05, + "loss": 2.2986, + "step": 3273 + }, + { + "epoch": 5.205087440381558, + "grad_norm": 5.068450244438738, + "learning_rate": 3.98429268268818e-05, + "loss": 0.1633, + "step": 3274 + }, + { + "epoch": 5.206677265500795, + "grad_norm": 6.316160716035796, + "learning_rate": 3.9848743899585624e-05, + "loss": 0.1777, + "step": 3275 + }, + { + "epoch": 5.208267090620032, + "grad_norm": 4.978922563980388, + "learning_rate": 3.985456102348417e-05, + "loss": 0.1758, + "step": 3276 + }, + { + "epoch": 5.209856915739269, + "grad_norm": 5.128355211832838, + "learning_rate": 3.9860378196608546e-05, + "loss": 0.155, + "step": 3277 + }, + { + "epoch": 5.211446740858506, + "grad_norm": 4.605099223317655, + "learning_rate": 3.986619541698985e-05, + "loss": 0.1333, + "step": 3278 + }, + { + "epoch": 5.213036565977743, + "grad_norm": 5.014356736648708, + "learning_rate": 3.9872012682659156e-05, + "loss": 0.2243, + "step": 3279 + }, + { + "epoch": 5.21462639109698, + "grad_norm": 6.247221292130196, + "learning_rate": 3.987782999164753e-05, + "loss": 0.1712, + "step": 3280 + }, + { + "epoch": 5.216216216216216, + "grad_norm": 4.029832460639627, + "learning_rate": 3.988364734198603e-05, + "loss": 0.1758, + "step": 3281 + }, + { + "epoch": 5.217806041335453, + "grad_norm": 2.8470184076843075, + "learning_rate": 3.98894647317057e-05, + "loss": 0.1547, + "step": 3282 + }, + { + "epoch": 5.21939586645469, + "grad_norm": 3.743275809736415, + "learning_rate": 3.9895282158837544e-05, + "loss": 0.1708, + "step": 3283 + }, + { + "epoch": 5.220985691573927, + "grad_norm": 3.367643093305827, + "learning_rate": 3.990109962141259e-05, + "loss": 0.8137, + "step": 3284 + }, + { + "epoch": 5.222575516693164, + "grad_norm": 3.162086008074213, + "learning_rate": 3.990691711746183e-05, + "loss": 0.1538, + "step": 3285 + }, + { + "epoch": 5.224165341812401, + "grad_norm": 4.858849251407634, + "learning_rate": 3.991273464501626e-05, + "loss": 0.153, + "step": 3286 + }, + { + "epoch": 5.225755166931638, + "grad_norm": 31.148176906356223, + "learning_rate": 3.9918552202106855e-05, + "loss": 3.5175, + "step": 3287 + }, + { + "epoch": 5.227344992050875, + "grad_norm": 2.5285767735175972, + "learning_rate": 3.9924369786764576e-05, + "loss": 0.1612, + "step": 3288 + }, + { + "epoch": 5.228934817170111, + "grad_norm": 2.353182039923773, + "learning_rate": 3.9930187397020386e-05, + "loss": 0.1328, + "step": 3289 + }, + { + "epoch": 5.230524642289348, + "grad_norm": 4.475020504788664, + "learning_rate": 3.9936005030905236e-05, + "loss": 0.1434, + "step": 3290 + }, + { + "epoch": 5.232114467408585, + "grad_norm": 4.449036431526402, + "learning_rate": 3.994182268645006e-05, + "loss": 0.1849, + "step": 3291 + }, + { + "epoch": 5.233704292527822, + "grad_norm": 2.6514551813284974, + "learning_rate": 3.9947640361685806e-05, + "loss": 0.1433, + "step": 3292 + }, + { + "epoch": 5.235294117647059, + "grad_norm": 2.8249125552238588, + "learning_rate": 3.995345805464339e-05, + "loss": 0.168, + "step": 3293 + }, + { + "epoch": 5.236883942766296, + "grad_norm": 1.9998998085158213, + "learning_rate": 3.9959275763353736e-05, + "loss": 0.1304, + "step": 3294 + }, + { + "epoch": 5.238473767885533, + "grad_norm": 2.776144824906679, + "learning_rate": 3.996509348584777e-05, + "loss": 0.1639, + "step": 3295 + }, + { + "epoch": 5.24006359300477, + "grad_norm": 4.202170410640712, + "learning_rate": 3.9970911220156376e-05, + "loss": 0.1085, + "step": 3296 + }, + { + "epoch": 5.241653418124006, + "grad_norm": 5.437767960722192, + "learning_rate": 3.9976728964310496e-05, + "loss": 0.4754, + "step": 3297 + }, + { + "epoch": 5.243243243243243, + "grad_norm": 2.364960647546418, + "learning_rate": 3.998254671634102e-05, + "loss": 0.1758, + "step": 3298 + }, + { + "epoch": 5.24483306836248, + "grad_norm": 2.484354277729835, + "learning_rate": 3.9988364474278846e-05, + "loss": 0.1545, + "step": 3299 + }, + { + "epoch": 5.246422893481717, + "grad_norm": 3.2765260505908214, + "learning_rate": 3.9994182236154874e-05, + "loss": 0.1243, + "step": 3300 + }, + { + "epoch": 5.248012718600954, + "grad_norm": 4.676120416950602, + "learning_rate": 4e-05, + "loss": 0.1345, + "step": 3301 + }, + { + "epoch": 5.249602543720191, + "grad_norm": 4.706684573215264, + "learning_rate": 4.000581776384513e-05, + "loss": 0.1956, + "step": 3302 + }, + { + "epoch": 5.251192368839428, + "grad_norm": 2.9243588502642552, + "learning_rate": 4.001163552572116e-05, + "loss": 0.2116, + "step": 3303 + }, + { + "epoch": 5.252782193958664, + "grad_norm": 4.474191424182363, + "learning_rate": 4.001745328365899e-05, + "loss": 0.1562, + "step": 3304 + }, + { + "epoch": 5.254372019077901, + "grad_norm": 3.2772366021728554, + "learning_rate": 4.0023271035689504e-05, + "loss": 0.1661, + "step": 3305 + }, + { + "epoch": 5.255961844197138, + "grad_norm": 23.306050064455068, + "learning_rate": 4.002908877984362e-05, + "loss": 2.7261, + "step": 3306 + }, + { + "epoch": 5.257551669316375, + "grad_norm": 4.585041783181546, + "learning_rate": 4.003490651415224e-05, + "loss": 0.1744, + "step": 3307 + }, + { + "epoch": 5.259141494435612, + "grad_norm": 3.2449976298934313, + "learning_rate": 4.004072423664627e-05, + "loss": 0.2096, + "step": 3308 + }, + { + "epoch": 5.260731319554849, + "grad_norm": 2.824591069935683, + "learning_rate": 4.004654194535661e-05, + "loss": 0.1679, + "step": 3309 + }, + { + "epoch": 5.262321144674086, + "grad_norm": 5.7845290415263015, + "learning_rate": 4.0052359638314194e-05, + "loss": 0.1972, + "step": 3310 + }, + { + "epoch": 5.263910969793323, + "grad_norm": 6.705097214681461, + "learning_rate": 4.005817731354994e-05, + "loss": 0.1877, + "step": 3311 + }, + { + "epoch": 5.26550079491256, + "grad_norm": 2.934404440871054, + "learning_rate": 4.0063994969094764e-05, + "loss": 0.2175, + "step": 3312 + }, + { + "epoch": 5.267090620031796, + "grad_norm": 3.4177765599559145, + "learning_rate": 4.0069812602979614e-05, + "loss": 0.1387, + "step": 3313 + }, + { + "epoch": 5.268680445151033, + "grad_norm": 4.74975715103323, + "learning_rate": 4.007563021323543e-05, + "loss": 0.1783, + "step": 3314 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 3.56572285334144, + "learning_rate": 4.008144779789315e-05, + "loss": 0.1493, + "step": 3315 + }, + { + "epoch": 5.271860095389507, + "grad_norm": 3.8024960375836505, + "learning_rate": 4.0087265354983745e-05, + "loss": 0.2099, + "step": 3316 + }, + { + "epoch": 5.273449920508744, + "grad_norm": 3.656701980603239, + "learning_rate": 4.009308288253817e-05, + "loss": 0.1295, + "step": 3317 + }, + { + "epoch": 5.275039745627981, + "grad_norm": 5.733608800726178, + "learning_rate": 4.009890037858742e-05, + "loss": 0.1278, + "step": 3318 + }, + { + "epoch": 5.276629570747218, + "grad_norm": 2.434737257393362, + "learning_rate": 4.010471784116246e-05, + "loss": 0.1535, + "step": 3319 + }, + { + "epoch": 5.278219395866454, + "grad_norm": 4.698910794092207, + "learning_rate": 4.011053526829431e-05, + "loss": 0.1996, + "step": 3320 + }, + { + "epoch": 5.279809220985691, + "grad_norm": 3.145563712189685, + "learning_rate": 4.011635265801397e-05, + "loss": 0.1789, + "step": 3321 + }, + { + "epoch": 5.281399046104928, + "grad_norm": 4.3804363845573215, + "learning_rate": 4.0122170008352475e-05, + "loss": 0.1816, + "step": 3322 + }, + { + "epoch": 5.282988871224165, + "grad_norm": 3.1806527403775884, + "learning_rate": 4.012798731734086e-05, + "loss": 0.2061, + "step": 3323 + }, + { + "epoch": 5.284578696343402, + "grad_norm": 3.556633213133545, + "learning_rate": 4.013380458301016e-05, + "loss": 0.2213, + "step": 3324 + }, + { + "epoch": 5.286168521462639, + "grad_norm": 1.9485092067399932, + "learning_rate": 4.0139621803391454e-05, + "loss": 0.1373, + "step": 3325 + }, + { + "epoch": 5.287758346581876, + "grad_norm": 2.0451088156073176, + "learning_rate": 4.0145438976515825e-05, + "loss": 0.1632, + "step": 3326 + }, + { + "epoch": 5.289348171701113, + "grad_norm": 17.237479277775094, + "learning_rate": 4.0151256100414376e-05, + "loss": 0.4156, + "step": 3327 + }, + { + "epoch": 5.290937996820349, + "grad_norm": 43.2954541541336, + "learning_rate": 4.015707317311821e-05, + "loss": 0.3911, + "step": 3328 + }, + { + "epoch": 5.292527821939586, + "grad_norm": 3.6154481441004047, + "learning_rate": 4.0162890192658464e-05, + "loss": 0.2335, + "step": 3329 + }, + { + "epoch": 5.294117647058823, + "grad_norm": 3.012067627211368, + "learning_rate": 4.0168707157066274e-05, + "loss": 0.2298, + "step": 3330 + }, + { + "epoch": 5.29570747217806, + "grad_norm": 3.489115072049047, + "learning_rate": 4.0174524064372836e-05, + "loss": 0.1175, + "step": 3331 + }, + { + "epoch": 5.297297297297297, + "grad_norm": 3.410082102229313, + "learning_rate": 4.018034091260931e-05, + "loss": 0.1145, + "step": 3332 + }, + { + "epoch": 5.298887122416534, + "grad_norm": 9.052742602366273, + "learning_rate": 4.0186157699806926e-05, + "loss": 0.3153, + "step": 3333 + }, + { + "epoch": 5.300476947535771, + "grad_norm": 17.17451838416837, + "learning_rate": 4.01919744239969e-05, + "loss": 0.397, + "step": 3334 + }, + { + "epoch": 5.302066772655008, + "grad_norm": 5.752759364622487, + "learning_rate": 4.0197791083210476e-05, + "loss": 0.1658, + "step": 3335 + }, + { + "epoch": 5.3036565977742445, + "grad_norm": 7.453149093219002, + "learning_rate": 4.020360767547894e-05, + "loss": 0.1605, + "step": 3336 + }, + { + "epoch": 5.3052464228934815, + "grad_norm": 8.115578451092915, + "learning_rate": 4.020942419883357e-05, + "loss": 0.1908, + "step": 3337 + }, + { + "epoch": 5.3068362480127185, + "grad_norm": 3.0685502823653583, + "learning_rate": 4.02152406513057e-05, + "loss": 0.1433, + "step": 3338 + }, + { + "epoch": 5.3084260731319555, + "grad_norm": 5.393659004773812, + "learning_rate": 4.022105703092665e-05, + "loss": 0.2678, + "step": 3339 + }, + { + "epoch": 5.3100158982511925, + "grad_norm": 9.39331757308684, + "learning_rate": 4.0226873335727816e-05, + "loss": 0.2255, + "step": 3340 + }, + { + "epoch": 5.3116057233704295, + "grad_norm": 3.9868721866794203, + "learning_rate": 4.023268956374057e-05, + "loss": 0.2265, + "step": 3341 + }, + { + "epoch": 5.3131955484896665, + "grad_norm": 8.424448771118728, + "learning_rate": 4.0238505712996324e-05, + "loss": 0.1699, + "step": 3342 + }, + { + "epoch": 5.314785373608903, + "grad_norm": 4.619119798854262, + "learning_rate": 4.024432178152654e-05, + "loss": 0.1759, + "step": 3343 + }, + { + "epoch": 5.31637519872814, + "grad_norm": 3.3315937944122394, + "learning_rate": 4.025013776736267e-05, + "loss": 0.1836, + "step": 3344 + }, + { + "epoch": 5.317965023847377, + "grad_norm": 2.4252355387105062, + "learning_rate": 4.0255953668536224e-05, + "loss": 0.1514, + "step": 3345 + }, + { + "epoch": 5.319554848966614, + "grad_norm": 8.488216370881416, + "learning_rate": 4.026176948307873e-05, + "loss": 0.2144, + "step": 3346 + }, + { + "epoch": 5.321144674085851, + "grad_norm": 5.2124161327931295, + "learning_rate": 4.026758520902175e-05, + "loss": 0.152, + "step": 3347 + }, + { + "epoch": 5.322734499205088, + "grad_norm": 2.7240027332725525, + "learning_rate": 4.0273400844396865e-05, + "loss": 0.1249, + "step": 3348 + }, + { + "epoch": 5.324324324324325, + "grad_norm": 4.360065068984866, + "learning_rate": 4.027921638723569e-05, + "loss": 0.2305, + "step": 3349 + }, + { + "epoch": 5.325914149443562, + "grad_norm": 5.014145145816082, + "learning_rate": 4.0285031835569884e-05, + "loss": 0.1435, + "step": 3350 + }, + { + "epoch": 5.327503974562799, + "grad_norm": 3.873091993981591, + "learning_rate": 4.029084718743112e-05, + "loss": 0.1527, + "step": 3351 + }, + { + "epoch": 5.329093799682035, + "grad_norm": 4.992634031784794, + "learning_rate": 4.029666244085111e-05, + "loss": 0.15, + "step": 3352 + }, + { + "epoch": 5.330683624801272, + "grad_norm": 4.22407647573742, + "learning_rate": 4.030247759386161e-05, + "loss": 0.2143, + "step": 3353 + }, + { + "epoch": 5.332273449920509, + "grad_norm": 2.5110114272929094, + "learning_rate": 4.030829264449439e-05, + "loss": 0.1263, + "step": 3354 + }, + { + "epoch": 5.333863275039746, + "grad_norm": 5.4509518847331835, + "learning_rate": 4.031410759078128e-05, + "loss": 0.2156, + "step": 3355 + }, + { + "epoch": 5.335453100158983, + "grad_norm": 4.9706294777102045, + "learning_rate": 4.031992243075413e-05, + "loss": 0.1412, + "step": 3356 + }, + { + "epoch": 5.33704292527822, + "grad_norm": 3.375314329235515, + "learning_rate": 4.0325737162444806e-05, + "loss": 0.1563, + "step": 3357 + }, + { + "epoch": 5.338632750397457, + "grad_norm": 31.85813605087713, + "learning_rate": 4.033155178388526e-05, + "loss": 4.2646, + "step": 3358 + }, + { + "epoch": 5.340222575516693, + "grad_norm": 6.814988206293408, + "learning_rate": 4.033736629310744e-05, + "loss": 0.1767, + "step": 3359 + }, + { + "epoch": 5.34181240063593, + "grad_norm": 4.548251702813716, + "learning_rate": 4.0343180688143345e-05, + "loss": 0.2183, + "step": 3360 + }, + { + "epoch": 5.343402225755167, + "grad_norm": 5.716802161596852, + "learning_rate": 4.034899496702501e-05, + "loss": 0.2087, + "step": 3361 + }, + { + "epoch": 5.344992050874404, + "grad_norm": 4.661047188475285, + "learning_rate": 4.0354809127784515e-05, + "loss": 0.1702, + "step": 3362 + }, + { + "epoch": 5.346581875993641, + "grad_norm": 2.771958683021182, + "learning_rate": 4.0360623168453986e-05, + "loss": 0.1777, + "step": 3363 + }, + { + "epoch": 5.348171701112878, + "grad_norm": 6.198725519731575, + "learning_rate": 4.036643708706557e-05, + "loss": 0.2361, + "step": 3364 + }, + { + "epoch": 5.349761526232115, + "grad_norm": 5.942926854997057, + "learning_rate": 4.037225088165146e-05, + "loss": 0.1879, + "step": 3365 + }, + { + "epoch": 5.351351351351352, + "grad_norm": 5.709530159618127, + "learning_rate": 4.037806455024391e-05, + "loss": 0.3747, + "step": 3366 + }, + { + "epoch": 5.352941176470588, + "grad_norm": 5.921600595461843, + "learning_rate": 4.03838780908752e-05, + "loss": 0.1576, + "step": 3367 + }, + { + "epoch": 5.354531001589825, + "grad_norm": 2.8304427046862135, + "learning_rate": 4.038969150157766e-05, + "loss": 0.1523, + "step": 3368 + }, + { + "epoch": 5.356120826709062, + "grad_norm": 4.590225166731911, + "learning_rate": 4.039550478038365e-05, + "loss": 0.1767, + "step": 3369 + }, + { + "epoch": 5.357710651828299, + "grad_norm": 7.07948111921339, + "learning_rate": 4.04013179253256e-05, + "loss": 0.1522, + "step": 3370 + }, + { + "epoch": 5.359300476947536, + "grad_norm": 3.9079634668104393, + "learning_rate": 4.040713093443596e-05, + "loss": 0.1633, + "step": 3371 + }, + { + "epoch": 5.360890302066773, + "grad_norm": 4.2952145187029585, + "learning_rate": 4.0412943805747245e-05, + "loss": 0.1844, + "step": 3372 + }, + { + "epoch": 5.36248012718601, + "grad_norm": 22.065681080778937, + "learning_rate": 4.0418756537292e-05, + "loss": 2.134, + "step": 3373 + }, + { + "epoch": 5.364069952305247, + "grad_norm": 3.1645902101629457, + "learning_rate": 4.042456912710283e-05, + "loss": 0.143, + "step": 3374 + }, + { + "epoch": 5.365659777424483, + "grad_norm": 68.07460740384182, + "learning_rate": 4.043038157321238e-05, + "loss": 11.0958, + "step": 3375 + }, + { + "epoch": 5.36724960254372, + "grad_norm": 84.63360063316766, + "learning_rate": 4.043619387365336e-05, + "loss": 17.6538, + "step": 3376 + }, + { + "epoch": 5.368839427662957, + "grad_norm": 7.687785150814514, + "learning_rate": 4.044200602645851e-05, + "loss": 0.1898, + "step": 3377 + }, + { + "epoch": 5.370429252782194, + "grad_norm": 6.8688803067296735, + "learning_rate": 4.044781802966062e-05, + "loss": 0.2827, + "step": 3378 + }, + { + "epoch": 5.372019077901431, + "grad_norm": 4.861078091775898, + "learning_rate": 4.045362988129254e-05, + "loss": 0.1649, + "step": 3379 + }, + { + "epoch": 5.373608903020668, + "grad_norm": 6.193300038722646, + "learning_rate": 4.045944157938718e-05, + "loss": 0.1798, + "step": 3380 + }, + { + "epoch": 5.375198728139905, + "grad_norm": 7.897007442024807, + "learning_rate": 4.046525312197747e-05, + "loss": 0.1475, + "step": 3381 + }, + { + "epoch": 5.376788553259141, + "grad_norm": 5.578984562435793, + "learning_rate": 4.047106450709643e-05, + "loss": 0.1526, + "step": 3382 + }, + { + "epoch": 5.378378378378378, + "grad_norm": 6.946858136019721, + "learning_rate": 4.047687573277711e-05, + "loss": 0.169, + "step": 3383 + }, + { + "epoch": 5.379968203497615, + "grad_norm": 6.99040305123249, + "learning_rate": 4.048268679705262e-05, + "loss": 0.188, + "step": 3384 + }, + { + "epoch": 5.381558028616852, + "grad_norm": 6.178934717371057, + "learning_rate": 4.048849769795613e-05, + "loss": 0.1676, + "step": 3385 + }, + { + "epoch": 5.383147853736089, + "grad_norm": 4.124958855854203, + "learning_rate": 4.049430843352086e-05, + "loss": 0.1746, + "step": 3386 + }, + { + "epoch": 5.384737678855326, + "grad_norm": 629.043649003002, + "learning_rate": 4.0500119001780084e-05, + "loss": 10.3385, + "step": 3387 + }, + { + "epoch": 5.386327503974563, + "grad_norm": 5.800127781007638, + "learning_rate": 4.0505929400767134e-05, + "loss": 0.2342, + "step": 3388 + }, + { + "epoch": 5.3879173290938, + "grad_norm": 8.80352542521553, + "learning_rate": 4.05117396285154e-05, + "loss": 0.2009, + "step": 3389 + }, + { + "epoch": 5.389507154213036, + "grad_norm": 9.504592614212847, + "learning_rate": 4.051754968305833e-05, + "loss": 0.2688, + "step": 3390 + }, + { + "epoch": 5.391096979332273, + "grad_norm": 4.686426137358139, + "learning_rate": 4.052335956242944e-05, + "loss": 0.2281, + "step": 3391 + }, + { + "epoch": 5.39268680445151, + "grad_norm": 4.764332275994334, + "learning_rate": 4.052916926466229e-05, + "loss": 0.2784, + "step": 3392 + }, + { + "epoch": 5.394276629570747, + "grad_norm": 5.641877214753397, + "learning_rate": 4.05349787877905e-05, + "loss": 0.2065, + "step": 3393 + }, + { + "epoch": 5.395866454689984, + "grad_norm": 4.75842647787935, + "learning_rate": 4.0540788129847756e-05, + "loss": 0.1942, + "step": 3394 + }, + { + "epoch": 5.397456279809221, + "grad_norm": 5.419073537455076, + "learning_rate": 4.0546597288867814e-05, + "loss": 0.1938, + "step": 3395 + }, + { + "epoch": 5.399046104928458, + "grad_norm": 5.153260302590068, + "learning_rate": 4.0552406262884486e-05, + "loss": 0.1726, + "step": 3396 + }, + { + "epoch": 5.400635930047695, + "grad_norm": 5.793451002280416, + "learning_rate": 4.055821504993164e-05, + "loss": 0.2244, + "step": 3397 + }, + { + "epoch": 5.402225755166931, + "grad_norm": 4.654020802446549, + "learning_rate": 4.056402364804321e-05, + "loss": 0.1899, + "step": 3398 + }, + { + "epoch": 5.403815580286168, + "grad_norm": 3.369713826262213, + "learning_rate": 4.05698320552532e-05, + "loss": 0.1532, + "step": 3399 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 5.063153721530426, + "learning_rate": 4.057564026959568e-05, + "loss": 0.1907, + "step": 3400 + }, + { + "epoch": 5.406995230524642, + "grad_norm": 4.728035001353066, + "learning_rate": 4.058144828910476e-05, + "loss": 0.1683, + "step": 3401 + }, + { + "epoch": 5.408585055643879, + "grad_norm": 3.2048221268248507, + "learning_rate": 4.058725611181465e-05, + "loss": 0.1747, + "step": 3402 + }, + { + "epoch": 5.410174880763116, + "grad_norm": 6.010211143757257, + "learning_rate": 4.059306373575962e-05, + "loss": 0.7913, + "step": 3403 + }, + { + "epoch": 5.411764705882353, + "grad_norm": 2.51097635330686, + "learning_rate": 4.059887115897398e-05, + "loss": 0.1546, + "step": 3404 + }, + { + "epoch": 5.413354531001589, + "grad_norm": 2.949495837613159, + "learning_rate": 4.060467837949215e-05, + "loss": 0.2043, + "step": 3405 + }, + { + "epoch": 5.414944356120826, + "grad_norm": 3.636139195685939, + "learning_rate": 4.0610485395348575e-05, + "loss": 0.1881, + "step": 3406 + }, + { + "epoch": 5.416534181240063, + "grad_norm": 3.8135306184627114, + "learning_rate": 4.0616292204577794e-05, + "loss": 0.2721, + "step": 3407 + }, + { + "epoch": 5.4181240063593, + "grad_norm": 2.5776997873427323, + "learning_rate": 4.062209880521443e-05, + "loss": 0.1456, + "step": 3408 + }, + { + "epoch": 5.419713831478537, + "grad_norm": 4.862697346584616, + "learning_rate": 4.062790519529314e-05, + "loss": 0.1784, + "step": 3409 + }, + { + "epoch": 5.421303656597774, + "grad_norm": 23.804509056014602, + "learning_rate": 4.063371137284868e-05, + "loss": 2.0955, + "step": 3410 + }, + { + "epoch": 5.422893481717011, + "grad_norm": 3.1810064246810312, + "learning_rate": 4.063951733591586e-05, + "loss": 0.1672, + "step": 3411 + }, + { + "epoch": 5.424483306836248, + "grad_norm": 2.825769729708267, + "learning_rate": 4.0645323082529576e-05, + "loss": 0.2175, + "step": 3412 + }, + { + "epoch": 5.426073131955485, + "grad_norm": 3.749217123506885, + "learning_rate": 4.0651128610724813e-05, + "loss": 0.1186, + "step": 3413 + }, + { + "epoch": 5.4276629570747215, + "grad_norm": 4.737349738752162, + "learning_rate": 4.065693391853658e-05, + "loss": 0.1738, + "step": 3414 + }, + { + "epoch": 5.4292527821939585, + "grad_norm": 2.3756235189647015, + "learning_rate": 4.0662739004e-05, + "loss": 0.1469, + "step": 3415 + }, + { + "epoch": 5.4308426073131955, + "grad_norm": 1.9303143586524525, + "learning_rate": 4.0668543865150274e-05, + "loss": 0.1455, + "step": 3416 + }, + { + "epoch": 5.4324324324324325, + "grad_norm": 3.8194861119842733, + "learning_rate": 4.0674348500022654e-05, + "loss": 0.1926, + "step": 3417 + }, + { + "epoch": 5.4340222575516695, + "grad_norm": 4.737618187670062, + "learning_rate": 4.0680152906652485e-05, + "loss": 0.2661, + "step": 3418 + }, + { + "epoch": 5.4356120826709065, + "grad_norm": 3.092654360607652, + "learning_rate": 4.068595708307518e-05, + "loss": 0.1544, + "step": 3419 + }, + { + "epoch": 5.4372019077901435, + "grad_norm": 3.936886971436258, + "learning_rate": 4.069176102732625e-05, + "loss": 0.1809, + "step": 3420 + }, + { + "epoch": 5.43879173290938, + "grad_norm": 5.107064397296858, + "learning_rate": 4.069756473744125e-05, + "loss": 0.185, + "step": 3421 + }, + { + "epoch": 5.440381558028617, + "grad_norm": 4.854920272273713, + "learning_rate": 4.070336821145586e-05, + "loss": 0.1546, + "step": 3422 + }, + { + "epoch": 5.441971383147854, + "grad_norm": 4.791076619573743, + "learning_rate": 4.0709171447405785e-05, + "loss": 0.203, + "step": 3423 + }, + { + "epoch": 5.443561208267091, + "grad_norm": 3.0663372186779116, + "learning_rate": 4.071497444332686e-05, + "loss": 0.1726, + "step": 3424 + }, + { + "epoch": 5.4451510333863276, + "grad_norm": 5.314565504562536, + "learning_rate": 4.0720777197254975e-05, + "loss": 0.1975, + "step": 3425 + }, + { + "epoch": 5.4467408585055646, + "grad_norm": 2.090052674935401, + "learning_rate": 4.072657970722611e-05, + "loss": 0.2041, + "step": 3426 + }, + { + "epoch": 5.4483306836248016, + "grad_norm": 5.59213317121822, + "learning_rate": 4.073238197127632e-05, + "loss": 0.1644, + "step": 3427 + }, + { + "epoch": 5.4499205087440385, + "grad_norm": 2.7828319422145733, + "learning_rate": 4.073818398744175e-05, + "loss": 0.1732, + "step": 3428 + }, + { + "epoch": 5.451510333863275, + "grad_norm": 3.91583310076718, + "learning_rate": 4.074398575375863e-05, + "loss": 0.1978, + "step": 3429 + }, + { + "epoch": 5.453100158982512, + "grad_norm": 2.3144339957234217, + "learning_rate": 4.0749787268263275e-05, + "loss": 0.1445, + "step": 3430 + }, + { + "epoch": 5.454689984101749, + "grad_norm": 4.143710219264504, + "learning_rate": 4.075558852899208e-05, + "loss": 0.2212, + "step": 3431 + }, + { + "epoch": 5.456279809220986, + "grad_norm": 3.5193712714714507, + "learning_rate": 4.076138953398153e-05, + "loss": 0.1956, + "step": 3432 + }, + { + "epoch": 5.457869634340223, + "grad_norm": 4.586965476915138, + "learning_rate": 4.076719028126819e-05, + "loss": 0.1794, + "step": 3433 + }, + { + "epoch": 5.45945945945946, + "grad_norm": 3.9206468694213323, + "learning_rate": 4.077299076888872e-05, + "loss": 0.1624, + "step": 3434 + }, + { + "epoch": 5.461049284578697, + "grad_norm": 2.6218352222848385, + "learning_rate": 4.077879099487986e-05, + "loss": 0.1948, + "step": 3435 + }, + { + "epoch": 5.462639109697934, + "grad_norm": 2.3794331758871445, + "learning_rate": 4.0784590957278455e-05, + "loss": 0.1657, + "step": 3436 + }, + { + "epoch": 5.46422893481717, + "grad_norm": 4.473880039184331, + "learning_rate": 4.079039065412141e-05, + "loss": 0.1603, + "step": 3437 + }, + { + "epoch": 5.465818759936407, + "grad_norm": 6.210250484301298, + "learning_rate": 4.079619008344576e-05, + "loss": 0.2471, + "step": 3438 + }, + { + "epoch": 5.467408585055644, + "grad_norm": 1.874490578051546, + "learning_rate": 4.080198924328859e-05, + "loss": 0.1566, + "step": 3439 + }, + { + "epoch": 5.468998410174881, + "grad_norm": 3.504553237225042, + "learning_rate": 4.08077881316871e-05, + "loss": 0.2066, + "step": 3440 + }, + { + "epoch": 5.470588235294118, + "grad_norm": 3.1977245279355153, + "learning_rate": 4.0813586746678584e-05, + "loss": 0.1604, + "step": 3441 + }, + { + "epoch": 5.472178060413355, + "grad_norm": 2.2377346091992556, + "learning_rate": 4.081938508630041e-05, + "loss": 0.1283, + "step": 3442 + }, + { + "epoch": 5.473767885532592, + "grad_norm": 2.7994478914366847, + "learning_rate": 4.0825183148590054e-05, + "loss": 0.1715, + "step": 3443 + }, + { + "epoch": 5.475357710651828, + "grad_norm": 77.7064826261626, + "learning_rate": 4.083098093158508e-05, + "loss": 2.062, + "step": 3444 + }, + { + "epoch": 5.476947535771065, + "grad_norm": 4.515084357690424, + "learning_rate": 4.083677843332315e-05, + "loss": 0.2102, + "step": 3445 + }, + { + "epoch": 5.478537360890302, + "grad_norm": 4.040298926171633, + "learning_rate": 4.0842575651842024e-05, + "loss": 0.1736, + "step": 3446 + }, + { + "epoch": 5.480127186009539, + "grad_norm": 5.981077272348579, + "learning_rate": 4.084837258517955e-05, + "loss": 0.1643, + "step": 3447 + }, + { + "epoch": 5.481717011128776, + "grad_norm": 5.658248989481495, + "learning_rate": 4.085416923137368e-05, + "loss": 0.1601, + "step": 3448 + }, + { + "epoch": 5.483306836248013, + "grad_norm": 3.328226001593728, + "learning_rate": 4.085996558846244e-05, + "loss": 0.1921, + "step": 3449 + }, + { + "epoch": 5.48489666136725, + "grad_norm": 4.9879068672674265, + "learning_rate": 4.0865761654484e-05, + "loss": 0.1886, + "step": 3450 + }, + { + "epoch": 5.486486486486487, + "grad_norm": 4.378168725476973, + "learning_rate": 4.087155742747659e-05, + "loss": 0.1884, + "step": 3451 + }, + { + "epoch": 5.488076311605723, + "grad_norm": 5.513338832678585, + "learning_rate": 4.087735290547854e-05, + "loss": 0.1193, + "step": 3452 + }, + { + "epoch": 5.48966613672496, + "grad_norm": 3.4217011376955715, + "learning_rate": 4.0883148086528305e-05, + "loss": 0.1533, + "step": 3453 + }, + { + "epoch": 5.491255961844197, + "grad_norm": 3.738968097429164, + "learning_rate": 4.088894296866442e-05, + "loss": 0.1932, + "step": 3454 + }, + { + "epoch": 5.492845786963434, + "grad_norm": 3.4510870249090204, + "learning_rate": 4.0894737549925524e-05, + "loss": 0.1211, + "step": 3455 + }, + { + "epoch": 5.494435612082671, + "grad_norm": 3.714451763025221, + "learning_rate": 4.090053182835037e-05, + "loss": 0.1301, + "step": 3456 + }, + { + "epoch": 5.496025437201908, + "grad_norm": 4.610617076349642, + "learning_rate": 4.09063258019778e-05, + "loss": 0.1808, + "step": 3457 + }, + { + "epoch": 5.497615262321145, + "grad_norm": 26.090143820000243, + "learning_rate": 4.0912119468846766e-05, + "loss": 2.3678, + "step": 3458 + }, + { + "epoch": 5.499205087440382, + "grad_norm": 4.372755050191866, + "learning_rate": 4.091791282699632e-05, + "loss": 0.1868, + "step": 3459 + }, + { + "epoch": 5.500794912559618, + "grad_norm": 4.030548647089388, + "learning_rate": 4.092370587446562e-05, + "loss": 0.1755, + "step": 3460 + }, + { + "epoch": 5.502384737678855, + "grad_norm": 4.3315802951379245, + "learning_rate": 4.092949860929392e-05, + "loss": 0.1769, + "step": 3461 + }, + { + "epoch": 5.503974562798092, + "grad_norm": 4.689817135888319, + "learning_rate": 4.0935291029520606e-05, + "loss": 0.2921, + "step": 3462 + }, + { + "epoch": 5.505564387917329, + "grad_norm": 4.266144627485116, + "learning_rate": 4.094108313318514e-05, + "loss": 0.2374, + "step": 3463 + }, + { + "epoch": 5.507154213036566, + "grad_norm": 3.820662567916575, + "learning_rate": 4.0946874918327116e-05, + "loss": 0.1415, + "step": 3464 + }, + { + "epoch": 5.508744038155803, + "grad_norm": 2.643315299360915, + "learning_rate": 4.095266638298622e-05, + "loss": 0.1536, + "step": 3465 + }, + { + "epoch": 5.51033386327504, + "grad_norm": 4.521595746406973, + "learning_rate": 4.0958457525202244e-05, + "loss": 0.1904, + "step": 3466 + }, + { + "epoch": 5.511923688394276, + "grad_norm": 4.505696567813395, + "learning_rate": 4.09642483430151e-05, + "loss": 0.1652, + "step": 3467 + }, + { + "epoch": 5.513513513513513, + "grad_norm": 4.014245951067984, + "learning_rate": 4.097003883446481e-05, + "loss": 0.1935, + "step": 3468 + }, + { + "epoch": 5.51510333863275, + "grad_norm": 5.781521091656406, + "learning_rate": 4.09758289975915e-05, + "loss": 0.1777, + "step": 3469 + }, + { + "epoch": 5.516693163751987, + "grad_norm": 3.6578266230706102, + "learning_rate": 4.098161883043541e-05, + "loss": 0.1162, + "step": 3470 + }, + { + "epoch": 5.518282988871224, + "grad_norm": 2.3132288442970528, + "learning_rate": 4.098740833103688e-05, + "loss": 0.1108, + "step": 3471 + }, + { + "epoch": 5.519872813990461, + "grad_norm": 5.407748163798726, + "learning_rate": 4.0993197497436386e-05, + "loss": 0.1801, + "step": 3472 + }, + { + "epoch": 5.521462639109698, + "grad_norm": 3.329793890699531, + "learning_rate": 4.099898632767451e-05, + "loss": 0.1318, + "step": 3473 + }, + { + "epoch": 5.523052464228935, + "grad_norm": 5.910665660379687, + "learning_rate": 4.1004774819791934e-05, + "loss": 0.1569, + "step": 3474 + }, + { + "epoch": 5.524642289348172, + "grad_norm": 5.050827384682833, + "learning_rate": 4.101056297182947e-05, + "loss": 0.3092, + "step": 3475 + }, + { + "epoch": 5.526232114467408, + "grad_norm": 4.0948264063279165, + "learning_rate": 4.1016350781828025e-05, + "loss": 0.1782, + "step": 3476 + }, + { + "epoch": 5.527821939586645, + "grad_norm": 5.630487810895971, + "learning_rate": 4.102213824782864e-05, + "loss": 0.216, + "step": 3477 + }, + { + "epoch": 5.529411764705882, + "grad_norm": 2.800741145942806, + "learning_rate": 4.102792536787247e-05, + "loss": 0.1505, + "step": 3478 + }, + { + "epoch": 5.531001589825119, + "grad_norm": 3.194379471610461, + "learning_rate": 4.103371214000079e-05, + "loss": 0.1597, + "step": 3479 + }, + { + "epoch": 5.532591414944356, + "grad_norm": 4.740916322156127, + "learning_rate": 4.103949856225497e-05, + "loss": 0.1697, + "step": 3480 + }, + { + "epoch": 5.534181240063593, + "grad_norm": 5.1889471271261, + "learning_rate": 4.1045284632676536e-05, + "loss": 0.1675, + "step": 3481 + }, + { + "epoch": 5.53577106518283, + "grad_norm": 2.5931350980961354, + "learning_rate": 4.1051070349307106e-05, + "loss": 0.1266, + "step": 3482 + }, + { + "epoch": 5.537360890302066, + "grad_norm": 3.2627854694719414, + "learning_rate": 4.105685571018841e-05, + "loss": 0.1738, + "step": 3483 + }, + { + "epoch": 5.538950715421303, + "grad_norm": 3.6638465657779156, + "learning_rate": 4.106264071336233e-05, + "loss": 0.1588, + "step": 3484 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 1.7559859394128108, + "learning_rate": 4.1068425356870854e-05, + "loss": 0.1651, + "step": 3485 + }, + { + "epoch": 5.542130365659777, + "grad_norm": 6.152629230582684, + "learning_rate": 4.1074209638756075e-05, + "loss": 0.1495, + "step": 3486 + }, + { + "epoch": 5.543720190779014, + "grad_norm": 3.565802993346733, + "learning_rate": 4.107999355706023e-05, + "loss": 0.196, + "step": 3487 + }, + { + "epoch": 5.545310015898251, + "grad_norm": 4.108758945349951, + "learning_rate": 4.108577710982568e-05, + "loss": 0.1624, + "step": 3488 + }, + { + "epoch": 5.546899841017488, + "grad_norm": 4.029915692082075, + "learning_rate": 4.109156029509488e-05, + "loss": 0.1662, + "step": 3489 + }, + { + "epoch": 5.548489666136725, + "grad_norm": 5.382498226749465, + "learning_rate": 4.1097343110910455e-05, + "loss": 0.167, + "step": 3490 + }, + { + "epoch": 5.550079491255962, + "grad_norm": 5.575061430864564, + "learning_rate": 4.110312555531512e-05, + "loss": 0.1843, + "step": 3491 + }, + { + "epoch": 5.5516693163751984, + "grad_norm": 3.269901997475535, + "learning_rate": 4.110890762635173e-05, + "loss": 0.1276, + "step": 3492 + }, + { + "epoch": 5.5532591414944354, + "grad_norm": 4.133496963063792, + "learning_rate": 4.1114689322063256e-05, + "loss": 0.1842, + "step": 3493 + }, + { + "epoch": 5.5548489666136724, + "grad_norm": 8.018012174165118, + "learning_rate": 4.112047064049281e-05, + "loss": 0.2136, + "step": 3494 + }, + { + "epoch": 5.556438791732909, + "grad_norm": 2.866702799949682, + "learning_rate": 4.112625157968363e-05, + "loss": 0.1271, + "step": 3495 + }, + { + "epoch": 5.558028616852146, + "grad_norm": 10.186479284214233, + "learning_rate": 4.1132032137679066e-05, + "loss": 0.1777, + "step": 3496 + }, + { + "epoch": 5.559618441971383, + "grad_norm": 16.182862053960115, + "learning_rate": 4.113781231252262e-05, + "loss": 1.9994, + "step": 3497 + }, + { + "epoch": 5.56120826709062, + "grad_norm": 12.086501949131218, + "learning_rate": 4.1143592102257905e-05, + "loss": 0.1936, + "step": 3498 + }, + { + "epoch": 5.5627980922098565, + "grad_norm": 6.252209674515089, + "learning_rate": 4.114937150492866e-05, + "loss": 0.1851, + "step": 3499 + }, + { + "epoch": 5.5643879173290935, + "grad_norm": 2.869925137699074, + "learning_rate": 4.115515051857879e-05, + "loss": 0.1697, + "step": 3500 + }, + { + "epoch": 5.5659777424483305, + "grad_norm": 6.882674873492677, + "learning_rate": 4.1160929141252305e-05, + "loss": 0.1742, + "step": 3501 + }, + { + "epoch": 5.5675675675675675, + "grad_norm": 10.152593929931726, + "learning_rate": 4.1166707370993335e-05, + "loss": 0.2197, + "step": 3502 + }, + { + "epoch": 5.5691573926868045, + "grad_norm": 4.81208611317315, + "learning_rate": 4.117248520584616e-05, + "loss": 0.1172, + "step": 3503 + }, + { + "epoch": 5.5707472178060415, + "grad_norm": 9.503223835166155, + "learning_rate": 4.117826264385521e-05, + "loss": 0.1773, + "step": 3504 + }, + { + "epoch": 5.5723370429252785, + "grad_norm": 4.262640492987117, + "learning_rate": 4.118403968306502e-05, + "loss": 0.152, + "step": 3505 + }, + { + "epoch": 5.573926868044515, + "grad_norm": 5.755438896354904, + "learning_rate": 4.1189816321520256e-05, + "loss": 0.1557, + "step": 3506 + }, + { + "epoch": 5.575516693163752, + "grad_norm": 6.117060521763252, + "learning_rate": 4.119559255726576e-05, + "loss": 0.1416, + "step": 3507 + }, + { + "epoch": 5.577106518282989, + "grad_norm": 6.845977426783046, + "learning_rate": 4.1201368388346474e-05, + "loss": 0.1778, + "step": 3508 + }, + { + "epoch": 5.578696343402226, + "grad_norm": 2.8193206146673324, + "learning_rate": 4.120714381280749e-05, + "loss": 0.1704, + "step": 3509 + }, + { + "epoch": 5.580286168521463, + "grad_norm": 5.274255981726564, + "learning_rate": 4.1212918828694036e-05, + "loss": 0.1602, + "step": 3510 + }, + { + "epoch": 5.5818759936407, + "grad_norm": 6.495705653383769, + "learning_rate": 4.1218693434051476e-05, + "loss": 0.1593, + "step": 3511 + }, + { + "epoch": 5.583465818759937, + "grad_norm": 3.4638060279956697, + "learning_rate": 4.122446762692532e-05, + "loss": 0.1601, + "step": 3512 + }, + { + "epoch": 5.585055643879174, + "grad_norm": 4.189082616840884, + "learning_rate": 4.1230241405361206e-05, + "loss": 0.1283, + "step": 3513 + }, + { + "epoch": 5.586645468998411, + "grad_norm": 5.025504710962899, + "learning_rate": 4.1236014767404926e-05, + "loss": 0.1797, + "step": 3514 + }, + { + "epoch": 5.588235294117647, + "grad_norm": 4.4796608788917895, + "learning_rate": 4.124178771110241e-05, + "loss": 0.1708, + "step": 3515 + }, + { + "epoch": 5.589825119236884, + "grad_norm": 4.028955413443789, + "learning_rate": 4.124756023449971e-05, + "loss": 0.1355, + "step": 3516 + }, + { + "epoch": 5.591414944356121, + "grad_norm": 5.343291678228311, + "learning_rate": 4.125333233564305e-05, + "loss": 0.1673, + "step": 3517 + }, + { + "epoch": 5.593004769475358, + "grad_norm": 3.3760154092971026, + "learning_rate": 4.125910401257877e-05, + "loss": 0.1639, + "step": 3518 + }, + { + "epoch": 5.594594594594595, + "grad_norm": 4.265866276845395, + "learning_rate": 4.1264875263353375e-05, + "loss": 0.1854, + "step": 3519 + }, + { + "epoch": 5.596184419713832, + "grad_norm": 4.233028644070434, + "learning_rate": 4.127064608601351e-05, + "loss": 0.1335, + "step": 3520 + }, + { + "epoch": 5.597774244833069, + "grad_norm": 2.540447137073106, + "learning_rate": 4.1276416478605945e-05, + "loss": 0.1658, + "step": 3521 + }, + { + "epoch": 5.599364069952305, + "grad_norm": 5.280107269674043, + "learning_rate": 4.128218643917763e-05, + "loss": 0.2377, + "step": 3522 + }, + { + "epoch": 5.600953895071542, + "grad_norm": 5.135579492734996, + "learning_rate": 4.128795596577563e-05, + "loss": 0.2164, + "step": 3523 + }, + { + "epoch": 5.602543720190779, + "grad_norm": 2.768460961072922, + "learning_rate": 4.129372505644717e-05, + "loss": 0.1126, + "step": 3524 + }, + { + "epoch": 5.604133545310016, + "grad_norm": 2.2740281553732395, + "learning_rate": 4.129949370923963e-05, + "loss": 0.1401, + "step": 3525 + }, + { + "epoch": 5.605723370429253, + "grad_norm": 3.723624952247425, + "learning_rate": 4.1305261922200514e-05, + "loss": 0.1469, + "step": 3526 + }, + { + "epoch": 5.60731319554849, + "grad_norm": 2.0350564121791783, + "learning_rate": 4.131102969337751e-05, + "loss": 0.2077, + "step": 3527 + }, + { + "epoch": 5.608903020667727, + "grad_norm": 3.3865523363243315, + "learning_rate": 4.1316797020818426e-05, + "loss": 0.1417, + "step": 3528 + }, + { + "epoch": 5.610492845786963, + "grad_norm": 5.037453867759402, + "learning_rate": 4.132256390257123e-05, + "loss": 0.1559, + "step": 3529 + }, + { + "epoch": 5.6120826709062, + "grad_norm": 3.7287961061159094, + "learning_rate": 4.132833033668404e-05, + "loss": 0.1704, + "step": 3530 + }, + { + "epoch": 5.613672496025437, + "grad_norm": 7.850719308824216, + "learning_rate": 4.133409632120513e-05, + "loss": 0.119, + "step": 3531 + }, + { + "epoch": 5.615262321144674, + "grad_norm": 4.862917991230707, + "learning_rate": 4.133986185418292e-05, + "loss": 0.1487, + "step": 3532 + }, + { + "epoch": 5.616852146263911, + "grad_norm": 3.0442252194192285, + "learning_rate": 4.134562693366599e-05, + "loss": 0.137, + "step": 3533 + }, + { + "epoch": 5.618441971383148, + "grad_norm": 3.8491770703181136, + "learning_rate": 4.135139155770307e-05, + "loss": 0.1897, + "step": 3534 + }, + { + "epoch": 5.620031796502385, + "grad_norm": 4.0024578731714175, + "learning_rate": 4.1357155724343045e-05, + "loss": 0.1691, + "step": 3535 + }, + { + "epoch": 5.621621621621622, + "grad_norm": 4.682485120463948, + "learning_rate": 4.136291943163495e-05, + "loss": 0.1519, + "step": 3536 + }, + { + "epoch": 5.623211446740859, + "grad_norm": 2.458240983344537, + "learning_rate": 4.136868267762797e-05, + "loss": 0.1609, + "step": 3537 + }, + { + "epoch": 5.624801271860095, + "grad_norm": 9.68324067321005, + "learning_rate": 4.137444546037147e-05, + "loss": 0.1657, + "step": 3538 + }, + { + "epoch": 5.626391096979332, + "grad_norm": 3.215412148341009, + "learning_rate": 4.138020777791495e-05, + "loss": 0.1722, + "step": 3539 + }, + { + "epoch": 5.627980922098569, + "grad_norm": 6.878029685423107, + "learning_rate": 4.138596962830806e-05, + "loss": 0.2372, + "step": 3540 + }, + { + "epoch": 5.629570747217806, + "grad_norm": 4.115927930756329, + "learning_rate": 4.1391731009600655e-05, + "loss": 0.1461, + "step": 3541 + }, + { + "epoch": 5.631160572337043, + "grad_norm": 4.951904190929962, + "learning_rate": 4.139749191984269e-05, + "loss": 0.2153, + "step": 3542 + }, + { + "epoch": 5.63275039745628, + "grad_norm": 13.418561158520081, + "learning_rate": 4.1403252357084316e-05, + "loss": 0.3356, + "step": 3543 + }, + { + "epoch": 5.634340222575517, + "grad_norm": 3.2824328690176996, + "learning_rate": 4.140901231937583e-05, + "loss": 0.1495, + "step": 3544 + }, + { + "epoch": 5.635930047694753, + "grad_norm": 6.529084884491945, + "learning_rate": 4.141477180476769e-05, + "loss": 0.1622, + "step": 3545 + }, + { + "epoch": 5.63751987281399, + "grad_norm": 6.981152502970928, + "learning_rate": 4.142053081131053e-05, + "loss": 0.2736, + "step": 3546 + }, + { + "epoch": 5.639109697933227, + "grad_norm": 2.5822003361734747, + "learning_rate": 4.1426289337055115e-05, + "loss": 0.143, + "step": 3547 + }, + { + "epoch": 5.640699523052464, + "grad_norm": 5.861439918078993, + "learning_rate": 4.1432047380052415e-05, + "loss": 0.1654, + "step": 3548 + }, + { + "epoch": 5.642289348171701, + "grad_norm": 13.288406931426305, + "learning_rate": 4.143780493835353e-05, + "loss": 1.2675, + "step": 3549 + }, + { + "epoch": 5.643879173290938, + "grad_norm": 5.2120525299928575, + "learning_rate": 4.144356201000973e-05, + "loss": 0.143, + "step": 3550 + }, + { + "epoch": 5.645468998410175, + "grad_norm": 8.893154766437872, + "learning_rate": 4.144931859307247e-05, + "loss": 0.2851, + "step": 3551 + }, + { + "epoch": 5.647058823529412, + "grad_norm": 7.137810325615889, + "learning_rate": 4.1455074685593344e-05, + "loss": 0.2875, + "step": 3552 + }, + { + "epoch": 5.648648648648649, + "grad_norm": 8.881645599930087, + "learning_rate": 4.146083028562412e-05, + "loss": 0.1904, + "step": 3553 + }, + { + "epoch": 5.650238473767885, + "grad_norm": 4.62432636268524, + "learning_rate": 4.1466585391216735e-05, + "loss": 0.1551, + "step": 3554 + }, + { + "epoch": 5.651828298887122, + "grad_norm": 5.171092264271625, + "learning_rate": 4.1472340000423315e-05, + "loss": 0.2183, + "step": 3555 + }, + { + "epoch": 5.653418124006359, + "grad_norm": 3.9362504483298464, + "learning_rate": 4.1478094111296106e-05, + "loss": 0.1671, + "step": 3556 + }, + { + "epoch": 5.655007949125596, + "grad_norm": 2.701500458143243, + "learning_rate": 4.148384772188757e-05, + "loss": 0.159, + "step": 3557 + }, + { + "epoch": 5.656597774244833, + "grad_norm": 5.981794997363775, + "learning_rate": 4.148960083025031e-05, + "loss": 0.2941, + "step": 3558 + }, + { + "epoch": 5.65818759936407, + "grad_norm": 3.532599263157308, + "learning_rate": 4.14953534344371e-05, + "loss": 0.1597, + "step": 3559 + }, + { + "epoch": 5.659777424483307, + "grad_norm": 4.042307834639393, + "learning_rate": 4.15011055325009e-05, + "loss": 0.1455, + "step": 3560 + }, + { + "epoch": 5.661367249602543, + "grad_norm": 5.934255130100581, + "learning_rate": 4.150685712249483e-05, + "loss": 0.1878, + "step": 3561 + }, + { + "epoch": 5.66295707472178, + "grad_norm": 4.493916496764399, + "learning_rate": 4.1512608202472196e-05, + "loss": 0.2118, + "step": 3562 + }, + { + "epoch": 5.664546899841017, + "grad_norm": 7.626155022752274, + "learning_rate": 4.151835877048645e-05, + "loss": 0.2025, + "step": 3563 + }, + { + "epoch": 5.666136724960254, + "grad_norm": 4.808823162394276, + "learning_rate": 4.152410882459124e-05, + "loss": 0.2459, + "step": 3564 + }, + { + "epoch": 5.667726550079491, + "grad_norm": 5.684340727044658, + "learning_rate": 4.152985836284038e-05, + "loss": 0.1667, + "step": 3565 + }, + { + "epoch": 5.669316375198728, + "grad_norm": 5.310663674205226, + "learning_rate": 4.153560738328786e-05, + "loss": 0.2217, + "step": 3566 + }, + { + "epoch": 5.670906200317965, + "grad_norm": 4.285982633120311, + "learning_rate": 4.154135588398785e-05, + "loss": 0.1844, + "step": 3567 + }, + { + "epoch": 5.672496025437201, + "grad_norm": 5.853740228976494, + "learning_rate": 4.154710386299468e-05, + "loss": 0.3292, + "step": 3568 + }, + { + "epoch": 5.674085850556438, + "grad_norm": 2.4129165594917357, + "learning_rate": 4.155285131836288e-05, + "loss": 0.1655, + "step": 3569 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 2.84498988791173, + "learning_rate": 4.155859824814713e-05, + "loss": 0.1474, + "step": 3570 + }, + { + "epoch": 5.677265500794912, + "grad_norm": 5.441554907570572, + "learning_rate": 4.156434465040231e-05, + "loss": 0.2917, + "step": 3571 + }, + { + "epoch": 5.678855325914149, + "grad_norm": 3.7338267088847683, + "learning_rate": 4.1570090523183476e-05, + "loss": 0.2343, + "step": 3572 + }, + { + "epoch": 5.680445151033386, + "grad_norm": 4.866911027686962, + "learning_rate": 4.1575835864545846e-05, + "loss": 0.1897, + "step": 3573 + }, + { + "epoch": 5.682034976152623, + "grad_norm": 4.540194655482006, + "learning_rate": 4.158158067254484e-05, + "loss": 0.1764, + "step": 3574 + }, + { + "epoch": 5.68362480127186, + "grad_norm": 4.6418071580685165, + "learning_rate": 4.158732494523604e-05, + "loss": 0.179, + "step": 3575 + }, + { + "epoch": 5.685214626391097, + "grad_norm": 6.466668194920761, + "learning_rate": 4.159306868067522e-05, + "loss": 0.2702, + "step": 3576 + }, + { + "epoch": 5.6868044515103335, + "grad_norm": 4.354666345274069, + "learning_rate": 4.159881187691835e-05, + "loss": 0.2326, + "step": 3577 + }, + { + "epoch": 5.6883942766295705, + "grad_norm": 6.5937533384199085, + "learning_rate": 4.160455453202154e-05, + "loss": 0.1824, + "step": 3578 + }, + { + "epoch": 5.6899841017488075, + "grad_norm": 4.6294136006138515, + "learning_rate": 4.1610296644041135e-05, + "loss": 0.1773, + "step": 3579 + }, + { + "epoch": 5.6915739268680445, + "grad_norm": 5.155299628758655, + "learning_rate": 4.161603821103361e-05, + "loss": 0.2244, + "step": 3580 + }, + { + "epoch": 5.6931637519872815, + "grad_norm": 6.6661039647989995, + "learning_rate": 4.162177923105567e-05, + "loss": 0.1957, + "step": 3581 + }, + { + "epoch": 5.6947535771065185, + "grad_norm": 4.906444088300017, + "learning_rate": 4.162751970216419e-05, + "loss": 0.1311, + "step": 3582 + }, + { + "epoch": 5.6963434022257555, + "grad_norm": 4.373817815093437, + "learning_rate": 4.163325962241622e-05, + "loss": 0.1196, + "step": 3583 + }, + { + "epoch": 5.697933227344992, + "grad_norm": 4.302527051624377, + "learning_rate": 4.1638998989869015e-05, + "loss": 0.1549, + "step": 3584 + }, + { + "epoch": 5.699523052464229, + "grad_norm": 5.1294306981653275, + "learning_rate": 4.1644737802579986e-05, + "loss": 0.265, + "step": 3585 + }, + { + "epoch": 5.701112877583466, + "grad_norm": 4.2444931529491186, + "learning_rate": 4.165047605860678e-05, + "loss": 0.1624, + "step": 3586 + }, + { + "epoch": 5.702702702702703, + "grad_norm": 2.8026001866276324, + "learning_rate": 4.165621375600719e-05, + "loss": 0.1405, + "step": 3587 + }, + { + "epoch": 5.70429252782194, + "grad_norm": 6.991211036086354, + "learning_rate": 4.166195089283921e-05, + "loss": 0.1556, + "step": 3588 + }, + { + "epoch": 5.705882352941177, + "grad_norm": 3.83167452029789, + "learning_rate": 4.1667687467161024e-05, + "loss": 0.1986, + "step": 3589 + }, + { + "epoch": 5.707472178060414, + "grad_norm": 3.4441179490614333, + "learning_rate": 4.167342347703102e-05, + "loss": 0.1844, + "step": 3590 + }, + { + "epoch": 5.709062003179651, + "grad_norm": 6.941645808711311, + "learning_rate": 4.1679158920507774e-05, + "loss": 0.2263, + "step": 3591 + }, + { + "epoch": 5.710651828298887, + "grad_norm": 4.929715151184763, + "learning_rate": 4.168489379565002e-05, + "loss": 0.1374, + "step": 3592 + }, + { + "epoch": 5.712241653418124, + "grad_norm": 4.437708232779739, + "learning_rate": 4.169062810051674e-05, + "loss": 0.2626, + "step": 3593 + }, + { + "epoch": 5.713831478537361, + "grad_norm": 4.2083631330075075, + "learning_rate": 4.169636183316706e-05, + "loss": 0.1629, + "step": 3594 + }, + { + "epoch": 5.715421303656598, + "grad_norm": 6.213039573154681, + "learning_rate": 4.170209499166033e-05, + "loss": 0.2183, + "step": 3595 + }, + { + "epoch": 5.717011128775835, + "grad_norm": 3.983177329738075, + "learning_rate": 4.170782757405607e-05, + "loss": 0.1587, + "step": 3596 + }, + { + "epoch": 5.718600953895072, + "grad_norm": 3.734816843258369, + "learning_rate": 4.171355957841401e-05, + "loss": 0.1399, + "step": 3597 + }, + { + "epoch": 5.720190779014309, + "grad_norm": 4.507721694416351, + "learning_rate": 4.17192910027941e-05, + "loss": 0.1915, + "step": 3598 + }, + { + "epoch": 5.721780604133546, + "grad_norm": 2.90596554102352, + "learning_rate": 4.172502184525642e-05, + "loss": 0.1824, + "step": 3599 + }, + { + "epoch": 5.723370429252782, + "grad_norm": 2.7374824302826006, + "learning_rate": 4.173075210386132e-05, + "loss": 0.1884, + "step": 3600 + }, + { + "epoch": 5.724960254372019, + "grad_norm": 16.4312468200261, + "learning_rate": 4.173648177666931e-05, + "loss": 9.0676, + "step": 3601 + }, + { + "epoch": 5.726550079491256, + "grad_norm": 6.095352735102084, + "learning_rate": 4.174221086174108e-05, + "loss": 0.186, + "step": 3602 + }, + { + "epoch": 5.728139904610493, + "grad_norm": 5.866689213660126, + "learning_rate": 4.1747939357137565e-05, + "loss": 0.148, + "step": 3603 + }, + { + "epoch": 5.72972972972973, + "grad_norm": 2.7980511466346267, + "learning_rate": 4.175366726091987e-05, + "loss": 0.16, + "step": 3604 + }, + { + "epoch": 5.731319554848967, + "grad_norm": 10.811645603901455, + "learning_rate": 4.175939457114931e-05, + "loss": 0.2012, + "step": 3605 + }, + { + "epoch": 5.732909379968204, + "grad_norm": 4.505592112826139, + "learning_rate": 4.176512128588739e-05, + "loss": 0.1632, + "step": 3606 + }, + { + "epoch": 5.73449920508744, + "grad_norm": 5.965046141745751, + "learning_rate": 4.177084740319584e-05, + "loss": 0.1502, + "step": 3607 + }, + { + "epoch": 5.736089030206677, + "grad_norm": 10.814215843387212, + "learning_rate": 4.177657292113655e-05, + "loss": 0.3067, + "step": 3608 + }, + { + "epoch": 5.737678855325914, + "grad_norm": 7.647187139727761, + "learning_rate": 4.1782297837771665e-05, + "loss": 0.3066, + "step": 3609 + }, + { + "epoch": 5.739268680445151, + "grad_norm": 8.99357238066339, + "learning_rate": 4.17880221511635e-05, + "loss": 0.2102, + "step": 3610 + }, + { + "epoch": 5.740858505564388, + "grad_norm": 9.40344301353052, + "learning_rate": 4.179374585937458e-05, + "loss": 0.1646, + "step": 3611 + }, + { + "epoch": 5.742448330683625, + "grad_norm": 4.42341678843568, + "learning_rate": 4.179946896046763e-05, + "loss": 0.2147, + "step": 3612 + }, + { + "epoch": 5.744038155802862, + "grad_norm": 5.6226941767604, + "learning_rate": 4.18051914525056e-05, + "loss": 0.1859, + "step": 3613 + }, + { + "epoch": 5.745627980922099, + "grad_norm": 8.704121780433018, + "learning_rate": 4.181091333355163e-05, + "loss": 0.163, + "step": 3614 + }, + { + "epoch": 5.747217806041336, + "grad_norm": 4.222689601477409, + "learning_rate": 4.181663460166907e-05, + "loss": 0.1378, + "step": 3615 + }, + { + "epoch": 5.748807631160572, + "grad_norm": 3.6905314494685157, + "learning_rate": 4.1822355254921475e-05, + "loss": 0.159, + "step": 3616 + }, + { + "epoch": 5.750397456279809, + "grad_norm": 7.292567704846656, + "learning_rate": 4.182807529137262e-05, + "loss": 0.2096, + "step": 3617 + }, + { + "epoch": 5.751987281399046, + "grad_norm": 5.133862597059989, + "learning_rate": 4.183379470908646e-05, + "loss": 0.1501, + "step": 3618 + }, + { + "epoch": 5.753577106518283, + "grad_norm": 4.682314580058747, + "learning_rate": 4.1839513506127204e-05, + "loss": 0.204, + "step": 3619 + }, + { + "epoch": 5.75516693163752, + "grad_norm": 8.007037441243162, + "learning_rate": 4.184523168055923e-05, + "loss": 0.1654, + "step": 3620 + }, + { + "epoch": 5.756756756756757, + "grad_norm": 3.8517680696524415, + "learning_rate": 4.185094923044715e-05, + "loss": 0.2107, + "step": 3621 + }, + { + "epoch": 5.758346581875994, + "grad_norm": 3.5611696403569746, + "learning_rate": 4.185666615385577e-05, + "loss": 0.201, + "step": 3622 + }, + { + "epoch": 5.75993640699523, + "grad_norm": 5.301085064891199, + "learning_rate": 4.1862382448850136e-05, + "loss": 0.1614, + "step": 3623 + }, + { + "epoch": 5.761526232114467, + "grad_norm": 4.190785583028749, + "learning_rate": 4.186809811349548e-05, + "loss": 0.1476, + "step": 3624 + }, + { + "epoch": 5.763116057233704, + "grad_norm": 2.5783138992477928, + "learning_rate": 4.187381314585725e-05, + "loss": 0.1681, + "step": 3625 + }, + { + "epoch": 5.764705882352941, + "grad_norm": 4.841903769849468, + "learning_rate": 4.187952754400112e-05, + "loss": 0.1415, + "step": 3626 + }, + { + "epoch": 5.766295707472178, + "grad_norm": 5.644910761060418, + "learning_rate": 4.188524130599298e-05, + "loss": 0.1385, + "step": 3627 + }, + { + "epoch": 5.767885532591415, + "grad_norm": 3.7197968032903277, + "learning_rate": 4.189095442989892e-05, + "loss": 0.1648, + "step": 3628 + }, + { + "epoch": 5.769475357710652, + "grad_norm": 4.933122926630562, + "learning_rate": 4.1896666913785244e-05, + "loss": 0.1943, + "step": 3629 + }, + { + "epoch": 5.771065182829888, + "grad_norm": 3.0394631460685404, + "learning_rate": 4.190237875571851e-05, + "loss": 0.1561, + "step": 3630 + }, + { + "epoch": 5.772655007949125, + "grad_norm": 2.6675420105977388, + "learning_rate": 4.190808995376545e-05, + "loss": 0.1707, + "step": 3631 + }, + { + "epoch": 5.774244833068362, + "grad_norm": 3.0741117857822764, + "learning_rate": 4.1913800505993026e-05, + "loss": 0.1354, + "step": 3632 + }, + { + "epoch": 5.775834658187599, + "grad_norm": 2.170031227202194, + "learning_rate": 4.191951041046844e-05, + "loss": 0.1616, + "step": 3633 + }, + { + "epoch": 5.777424483306836, + "grad_norm": 2.6378062907382405, + "learning_rate": 4.1925219665259075e-05, + "loss": 0.1781, + "step": 3634 + }, + { + "epoch": 5.779014308426073, + "grad_norm": 4.130640877044012, + "learning_rate": 4.1930928268432566e-05, + "loss": 0.1895, + "step": 3635 + }, + { + "epoch": 5.78060413354531, + "grad_norm": 2.718405567404681, + "learning_rate": 4.1936636218056766e-05, + "loss": 0.159, + "step": 3636 + }, + { + "epoch": 5.782193958664547, + "grad_norm": 3.6614055085148958, + "learning_rate": 4.1942343512199716e-05, + "loss": 0.1385, + "step": 3637 + }, + { + "epoch": 5.783783783783784, + "grad_norm": 2.2212284178480983, + "learning_rate": 4.194805014892973e-05, + "loss": 0.151, + "step": 3638 + }, + { + "epoch": 5.78537360890302, + "grad_norm": 4.752488312801556, + "learning_rate": 4.195375612631531e-05, + "loss": 0.1485, + "step": 3639 + }, + { + "epoch": 5.786963434022257, + "grad_norm": 5.233987900513094, + "learning_rate": 4.195946144242518e-05, + "loss": 0.1385, + "step": 3640 + }, + { + "epoch": 5.788553259141494, + "grad_norm": 3.835924410297876, + "learning_rate": 4.196516609532831e-05, + "loss": 0.1855, + "step": 3641 + }, + { + "epoch": 5.790143084260731, + "grad_norm": 7.3630206708722605, + "learning_rate": 4.1970870083093864e-05, + "loss": 0.1937, + "step": 3642 + }, + { + "epoch": 5.791732909379968, + "grad_norm": 5.103351926101738, + "learning_rate": 4.1976573403791265e-05, + "loss": 0.1901, + "step": 3643 + }, + { + "epoch": 5.793322734499205, + "grad_norm": 2.0332053867266997, + "learning_rate": 4.198227605549014e-05, + "loss": 0.1696, + "step": 3644 + }, + { + "epoch": 5.794912559618442, + "grad_norm": 5.134492673278592, + "learning_rate": 4.198797803626035e-05, + "loss": 0.1853, + "step": 3645 + }, + { + "epoch": 5.796502384737678, + "grad_norm": 4.020376674562339, + "learning_rate": 4.199367934417198e-05, + "loss": 0.1604, + "step": 3646 + }, + { + "epoch": 5.798092209856915, + "grad_norm": 6.150763198713712, + "learning_rate": 4.199937997729533e-05, + "loss": 0.192, + "step": 3647 + }, + { + "epoch": 5.799682034976152, + "grad_norm": 1.5029670089694867, + "learning_rate": 4.200507993370097e-05, + "loss": 0.1646, + "step": 3648 + }, + { + "epoch": 5.801271860095389, + "grad_norm": 5.136485816862838, + "learning_rate": 4.2010779211459644e-05, + "loss": 0.168, + "step": 3649 + }, + { + "epoch": 5.802861685214626, + "grad_norm": 4.096748123695933, + "learning_rate": 4.2016477808642375e-05, + "loss": 0.1734, + "step": 3650 + }, + { + "epoch": 5.804451510333863, + "grad_norm": 4.1015645073249125, + "learning_rate": 4.2022175723320374e-05, + "loss": 0.1435, + "step": 3651 + }, + { + "epoch": 5.8060413354531, + "grad_norm": 2.8565002553040713, + "learning_rate": 4.202787295356512e-05, + "loss": 0.1578, + "step": 3652 + }, + { + "epoch": 5.807631160572337, + "grad_norm": 6.172245962402591, + "learning_rate": 4.2033569497448307e-05, + "loss": 0.21, + "step": 3653 + }, + { + "epoch": 5.809220985691574, + "grad_norm": 4.586271363448587, + "learning_rate": 4.203926535304185e-05, + "loss": 0.1613, + "step": 3654 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 3.6669606277815334, + "learning_rate": 4.20449605184179e-05, + "loss": 0.184, + "step": 3655 + }, + { + "epoch": 5.8124006359300475, + "grad_norm": 2.3453732874220212, + "learning_rate": 4.2050654991648876e-05, + "loss": 0.1688, + "step": 3656 + }, + { + "epoch": 5.8139904610492845, + "grad_norm": 3.15105188634312, + "learning_rate": 4.2056348770807386e-05, + "loss": 0.1918, + "step": 3657 + }, + { + "epoch": 5.8155802861685215, + "grad_norm": 1.3710226305801365, + "learning_rate": 4.2062041853966295e-05, + "loss": 0.1335, + "step": 3658 + }, + { + "epoch": 5.8171701112877585, + "grad_norm": 5.571447556836932, + "learning_rate": 4.2067734239198706e-05, + "loss": 19.3559, + "step": 3659 + }, + { + "epoch": 5.8187599364069955, + "grad_norm": 3.754204877966765, + "learning_rate": 4.207342592457795e-05, + "loss": 0.158, + "step": 3660 + }, + { + "epoch": 5.8203497615262325, + "grad_norm": 1.7876462456491027, + "learning_rate": 4.20791169081776e-05, + "loss": 0.189, + "step": 3661 + }, + { + "epoch": 5.821939586645469, + "grad_norm": 5.157772226611338, + "learning_rate": 4.2084807188071455e-05, + "loss": 0.1695, + "step": 3662 + }, + { + "epoch": 5.823529411764706, + "grad_norm": 2.2879042327694927, + "learning_rate": 4.2090496762333564e-05, + "loss": 0.2395, + "step": 3663 + }, + { + "epoch": 5.825119236883943, + "grad_norm": 6.207792802245383, + "learning_rate": 4.209618562903822e-05, + "loss": 0.1475, + "step": 3664 + }, + { + "epoch": 5.82670906200318, + "grad_norm": 4.548627381998895, + "learning_rate": 4.210187378625994e-05, + "loss": 0.2324, + "step": 3665 + }, + { + "epoch": 5.828298887122417, + "grad_norm": 4.089463655503371, + "learning_rate": 4.210756123207349e-05, + "loss": 0.156, + "step": 3666 + }, + { + "epoch": 5.829888712241654, + "grad_norm": 4.545709586328353, + "learning_rate": 4.211324796455389e-05, + "loss": 0.1718, + "step": 3667 + }, + { + "epoch": 5.831478537360891, + "grad_norm": 4.1825321890928695, + "learning_rate": 4.2118933981776365e-05, + "loss": 0.1843, + "step": 3668 + }, + { + "epoch": 5.833068362480127, + "grad_norm": 4.68811710804842, + "learning_rate": 4.212461928181641e-05, + "loss": 0.1868, + "step": 3669 + }, + { + "epoch": 5.834658187599364, + "grad_norm": 2.957886344049319, + "learning_rate": 4.2130303862749766e-05, + "loss": 0.1636, + "step": 3670 + }, + { + "epoch": 5.836248012718601, + "grad_norm": 2.5055723589093764, + "learning_rate": 4.21359877226524e-05, + "loss": 0.203, + "step": 3671 + }, + { + "epoch": 5.837837837837838, + "grad_norm": 5.503628711945985, + "learning_rate": 4.214167085960053e-05, + "loss": 0.1728, + "step": 3672 + }, + { + "epoch": 5.839427662957075, + "grad_norm": 2.4761138214406757, + "learning_rate": 4.214735327167063e-05, + "loss": 0.1749, + "step": 3673 + }, + { + "epoch": 5.841017488076312, + "grad_norm": 3.8324593024155953, + "learning_rate": 4.21530349569394e-05, + "loss": 0.2022, + "step": 3674 + }, + { + "epoch": 5.842607313195549, + "grad_norm": 3.3779456908078527, + "learning_rate": 4.21587159134838e-05, + "loss": 0.2066, + "step": 3675 + }, + { + "epoch": 5.844197138314786, + "grad_norm": 4.142300893011948, + "learning_rate": 4.2164396139381035e-05, + "loss": 0.2548, + "step": 3676 + }, + { + "epoch": 5.845786963434023, + "grad_norm": 55.74061589934113, + "learning_rate": 4.2170075632708536e-05, + "loss": 3.7771, + "step": 3677 + }, + { + "epoch": 5.847376788553259, + "grad_norm": 4.722009271551694, + "learning_rate": 4.217575439154402e-05, + "loss": 0.2466, + "step": 3678 + }, + { + "epoch": 5.848966613672496, + "grad_norm": 112.5796298611852, + "learning_rate": 4.218143241396543e-05, + "loss": 5.5487, + "step": 3679 + }, + { + "epoch": 5.850556438791733, + "grad_norm": 1.8379519740167987, + "learning_rate": 4.218710969805095e-05, + "loss": 0.2294, + "step": 3680 + }, + { + "epoch": 5.85214626391097, + "grad_norm": 5.350662107833079, + "learning_rate": 4.2192786241879035e-05, + "loss": 0.1786, + "step": 3681 + }, + { + "epoch": 5.853736089030207, + "grad_norm": 4.441891500131396, + "learning_rate": 4.219846204352838e-05, + "loss": 0.1824, + "step": 3682 + }, + { + "epoch": 5.855325914149444, + "grad_norm": 2.235357280709743, + "learning_rate": 4.220413710107792e-05, + "loss": 0.2013, + "step": 3683 + }, + { + "epoch": 5.856915739268681, + "grad_norm": 3.377067360510304, + "learning_rate": 4.220981141260687e-05, + "loss": 0.2125, + "step": 3684 + }, + { + "epoch": 5.858505564387917, + "grad_norm": 6.0018675676175945, + "learning_rate": 4.2215484976194676e-05, + "loss": 0.1867, + "step": 3685 + }, + { + "epoch": 5.860095389507154, + "grad_norm": 5.46688675069841, + "learning_rate": 4.222115778992103e-05, + "loss": 0.1488, + "step": 3686 + }, + { + "epoch": 5.861685214626391, + "grad_norm": 4.312011571430304, + "learning_rate": 4.2226829851865914e-05, + "loss": 0.2006, + "step": 3687 + }, + { + "epoch": 5.863275039745628, + "grad_norm": 4.081494671227908, + "learning_rate": 4.223250116010952e-05, + "loss": 0.1897, + "step": 3688 + }, + { + "epoch": 5.864864864864865, + "grad_norm": 6.5192452463739965, + "learning_rate": 4.2238171712732315e-05, + "loss": 0.2936, + "step": 3689 + }, + { + "epoch": 5.866454689984102, + "grad_norm": 7.854059671076194, + "learning_rate": 4.224384150781504e-05, + "loss": 0.2286, + "step": 3690 + }, + { + "epoch": 5.868044515103339, + "grad_norm": 12.282106122123663, + "learning_rate": 4.224951054343865e-05, + "loss": 38.4781, + "step": 3691 + }, + { + "epoch": 5.869634340222575, + "grad_norm": 3.7273867264211713, + "learning_rate": 4.22551788176844e-05, + "loss": 0.1426, + "step": 3692 + }, + { + "epoch": 5.871224165341812, + "grad_norm": 3.237353340352837, + "learning_rate": 4.226084632863379e-05, + "loss": 0.1644, + "step": 3693 + }, + { + "epoch": 5.872813990461049, + "grad_norm": 4.3123572075119405, + "learning_rate": 4.226651307436855e-05, + "loss": 0.3162, + "step": 3694 + }, + { + "epoch": 5.874403815580286, + "grad_norm": 4.65363587225841, + "learning_rate": 4.227217905297071e-05, + "loss": 0.2123, + "step": 3695 + }, + { + "epoch": 5.875993640699523, + "grad_norm": 7.113432791231884, + "learning_rate": 4.227784426252253e-05, + "loss": 0.1511, + "step": 3696 + }, + { + "epoch": 5.87758346581876, + "grad_norm": 6.228660445769009, + "learning_rate": 4.2283508701106556e-05, + "loss": 0.1795, + "step": 3697 + }, + { + "epoch": 5.879173290937997, + "grad_norm": 4.419004441665432, + "learning_rate": 4.2289172366805576e-05, + "loss": 0.1769, + "step": 3698 + }, + { + "epoch": 5.880763116057234, + "grad_norm": 77.04134937815041, + "learning_rate": 4.229483525770263e-05, + "loss": 9.7127, + "step": 3699 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 5.204151768428491, + "learning_rate": 4.2300497371881045e-05, + "loss": 0.1744, + "step": 3700 + }, + { + "epoch": 5.883942766295707, + "grad_norm": 557.6987847378355, + "learning_rate": 4.2306158707424404e-05, + "loss": 10.3648, + "step": 3701 + }, + { + "epoch": 5.885532591414944, + "grad_norm": 4.0083445869943555, + "learning_rate": 4.231181926241654e-05, + "loss": 0.2367, + "step": 3702 + }, + { + "epoch": 5.887122416534181, + "grad_norm": 5.390395128645863, + "learning_rate": 4.231747903494158e-05, + "loss": 0.2592, + "step": 3703 + }, + { + "epoch": 5.888712241653418, + "grad_norm": 5.07342740750419, + "learning_rate": 4.232313802308386e-05, + "loss": 0.2216, + "step": 3704 + }, + { + "epoch": 5.890302066772655, + "grad_norm": 3.873942790563805, + "learning_rate": 4.232879622492806e-05, + "loss": 0.1298, + "step": 3705 + }, + { + "epoch": 5.891891891891892, + "grad_norm": 3.515298669432799, + "learning_rate": 4.2334453638559054e-05, + "loss": 0.2394, + "step": 3706 + }, + { + "epoch": 5.893481717011129, + "grad_norm": 3.6253910965596354, + "learning_rate": 4.2340110262062025e-05, + "loss": 0.181, + "step": 3707 + }, + { + "epoch": 5.895071542130365, + "grad_norm": 5.126090489685165, + "learning_rate": 4.234576609352241e-05, + "loss": 0.1671, + "step": 3708 + }, + { + "epoch": 5.896661367249602, + "grad_norm": 2.976235701475839, + "learning_rate": 4.235142113102591e-05, + "loss": 0.1726, + "step": 3709 + }, + { + "epoch": 5.898251192368839, + "grad_norm": 2.8892368960462314, + "learning_rate": 4.2357075372658494e-05, + "loss": 0.2211, + "step": 3710 + }, + { + "epoch": 5.899841017488076, + "grad_norm": 2.3610001074176266, + "learning_rate": 4.236272881650642e-05, + "loss": 0.1848, + "step": 3711 + }, + { + "epoch": 5.901430842607313, + "grad_norm": 2.8092422261131005, + "learning_rate": 4.2368381460656185e-05, + "loss": 0.12, + "step": 3712 + }, + { + "epoch": 5.90302066772655, + "grad_norm": 2.6907451327296195, + "learning_rate": 4.23740333031946e-05, + "loss": 0.2126, + "step": 3713 + }, + { + "epoch": 5.904610492845787, + "grad_norm": 3.80747826109959, + "learning_rate": 4.2379684342208697e-05, + "loss": 0.2311, + "step": 3714 + }, + { + "epoch": 5.906200317965024, + "grad_norm": 3.5710296811269915, + "learning_rate": 4.238533457578581e-05, + "loss": 0.1813, + "step": 3715 + }, + { + "epoch": 5.907790143084261, + "grad_norm": 4.327553754116943, + "learning_rate": 4.2390984002013544e-05, + "loss": 0.1797, + "step": 3716 + }, + { + "epoch": 5.909379968203497, + "grad_norm": 2.30806952481913, + "learning_rate": 4.239663261897977e-05, + "loss": 0.1729, + "step": 3717 + }, + { + "epoch": 5.910969793322734, + "grad_norm": 2.8982249634874213, + "learning_rate": 4.2402280424772635e-05, + "loss": 0.1522, + "step": 3718 + }, + { + "epoch": 5.912559618441971, + "grad_norm": 2.8675575433437497, + "learning_rate": 4.240792741748056e-05, + "loss": 0.1956, + "step": 3719 + }, + { + "epoch": 5.914149443561208, + "grad_norm": 3.476578792593314, + "learning_rate": 4.2413573595192254e-05, + "loss": 0.1578, + "step": 3720 + }, + { + "epoch": 5.915739268680445, + "grad_norm": 2.85438464188752, + "learning_rate": 4.241921895599668e-05, + "loss": 0.1821, + "step": 3721 + }, + { + "epoch": 5.917329093799682, + "grad_norm": 3.9875756057812177, + "learning_rate": 4.2424863497983084e-05, + "loss": 0.149, + "step": 3722 + }, + { + "epoch": 5.918918918918919, + "grad_norm": 4.796303298476455, + "learning_rate": 4.2430507219241e-05, + "loss": 0.1605, + "step": 3723 + }, + { + "epoch": 5.920508744038155, + "grad_norm": 2.957871735305267, + "learning_rate": 4.2436150117860225e-05, + "loss": 0.1578, + "step": 3724 + }, + { + "epoch": 5.922098569157392, + "grad_norm": 4.831291360747747, + "learning_rate": 4.244179219193085e-05, + "loss": 0.206, + "step": 3725 + }, + { + "epoch": 5.923688394276629, + "grad_norm": 4.004169953641362, + "learning_rate": 4.244743343954324e-05, + "loss": 0.15, + "step": 3726 + }, + { + "epoch": 5.925278219395866, + "grad_norm": 3.041288320257261, + "learning_rate": 4.2453073858788024e-05, + "loss": 0.156, + "step": 3727 + }, + { + "epoch": 5.926868044515103, + "grad_norm": 4.102219068126756, + "learning_rate": 4.245871344775614e-05, + "loss": 0.1791, + "step": 3728 + }, + { + "epoch": 5.92845786963434, + "grad_norm": 4.0877470988890785, + "learning_rate": 4.246435220453878e-05, + "loss": 0.1721, + "step": 3729 + }, + { + "epoch": 5.930047694753577, + "grad_norm": 51.12812197381091, + "learning_rate": 4.246999012722743e-05, + "loss": 2.4659, + "step": 3730 + }, + { + "epoch": 5.9316375198728135, + "grad_norm": 5.882338398829938, + "learning_rate": 4.247562721391386e-05, + "loss": 0.1512, + "step": 3731 + }, + { + "epoch": 5.9332273449920505, + "grad_norm": 4.494236819111463, + "learning_rate": 4.248126346269012e-05, + "loss": 0.2701, + "step": 3732 + }, + { + "epoch": 5.9348171701112875, + "grad_norm": 3.7634633745708266, + "learning_rate": 4.2486898871648554e-05, + "loss": 0.1951, + "step": 3733 + }, + { + "epoch": 5.9364069952305245, + "grad_norm": 7.040683414468272, + "learning_rate": 4.249253343888176e-05, + "loss": 0.1439, + "step": 3734 + }, + { + "epoch": 5.9379968203497615, + "grad_norm": 3.9761027567711085, + "learning_rate": 4.249816716248265e-05, + "loss": 0.1799, + "step": 3735 + }, + { + "epoch": 5.9395866454689985, + "grad_norm": 3.084035835740625, + "learning_rate": 4.2503800040544416e-05, + "loss": 0.1663, + "step": 3736 + }, + { + "epoch": 5.9411764705882355, + "grad_norm": 3.116249908625141, + "learning_rate": 4.250943207116053e-05, + "loss": 0.1905, + "step": 3737 + }, + { + "epoch": 5.9427662957074725, + "grad_norm": 4.624749649184074, + "learning_rate": 4.251506325242475e-05, + "loss": 0.1802, + "step": 3738 + }, + { + "epoch": 5.9443561208267095, + "grad_norm": 3.4751079001942182, + "learning_rate": 4.252069358243114e-05, + "loss": 0.2014, + "step": 3739 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 2.572043675234543, + "learning_rate": 4.252632305927402e-05, + "loss": 0.1075, + "step": 3740 + }, + { + "epoch": 5.947535771065183, + "grad_norm": 2.8649430851578006, + "learning_rate": 4.253195168104802e-05, + "loss": 0.2527, + "step": 3741 + }, + { + "epoch": 5.94912559618442, + "grad_norm": 3.653292125997511, + "learning_rate": 4.253757944584806e-05, + "loss": 0.2116, + "step": 3742 + }, + { + "epoch": 5.950715421303657, + "grad_norm": 5.120975617348639, + "learning_rate": 4.254320635176934e-05, + "loss": 0.1657, + "step": 3743 + }, + { + "epoch": 5.952305246422894, + "grad_norm": 2.939695898245953, + "learning_rate": 4.254883239690736e-05, + "loss": 0.1413, + "step": 3744 + }, + { + "epoch": 5.953895071542131, + "grad_norm": 2.4483540505135637, + "learning_rate": 4.255445757935791e-05, + "loss": 0.1663, + "step": 3745 + }, + { + "epoch": 5.955484896661368, + "grad_norm": 4.8001667011050415, + "learning_rate": 4.2560081897217055e-05, + "loss": 0.147, + "step": 3746 + }, + { + "epoch": 5.957074721780604, + "grad_norm": 4.3750835110291355, + "learning_rate": 4.256570534858119e-05, + "loss": 0.2163, + "step": 3747 + }, + { + "epoch": 5.958664546899841, + "grad_norm": 3.9059794962963736, + "learning_rate": 4.257132793154696e-05, + "loss": 0.1692, + "step": 3748 + }, + { + "epoch": 5.960254372019078, + "grad_norm": 1.8419324093293465, + "learning_rate": 4.2576949644211345e-05, + "loss": 0.1245, + "step": 3749 + }, + { + "epoch": 5.961844197138315, + "grad_norm": 2.2664057914440763, + "learning_rate": 4.258257048467157e-05, + "loss": 0.1471, + "step": 3750 + }, + { + "epoch": 5.963434022257552, + "grad_norm": 3.9948490752862997, + "learning_rate": 4.258819045102521e-05, + "loss": 0.2003, + "step": 3751 + }, + { + "epoch": 5.965023847376789, + "grad_norm": 1.506121889002566, + "learning_rate": 4.25938095413701e-05, + "loss": 0.1513, + "step": 3752 + }, + { + "epoch": 5.966613672496026, + "grad_norm": 1.8804431970219035, + "learning_rate": 4.259942775380438e-05, + "loss": 0.1951, + "step": 3753 + }, + { + "epoch": 5.968203497615263, + "grad_norm": 3.0869523543033974, + "learning_rate": 4.2605045086426484e-05, + "loss": 0.1521, + "step": 3754 + }, + { + "epoch": 5.9697933227345, + "grad_norm": 3.3455771580419897, + "learning_rate": 4.2610661537335166e-05, + "loss": 0.1543, + "step": 3755 + }, + { + "epoch": 5.971383147853736, + "grad_norm": 2.7361885839594513, + "learning_rate": 4.261627710462944e-05, + "loss": 0.1956, + "step": 3756 + }, + { + "epoch": 5.972972972972973, + "grad_norm": 2.2746978027066094, + "learning_rate": 4.2621891786408646e-05, + "loss": 0.1316, + "step": 3757 + }, + { + "epoch": 5.97456279809221, + "grad_norm": 2.3136108858193105, + "learning_rate": 4.262750558077243e-05, + "loss": 0.1511, + "step": 3758 + }, + { + "epoch": 5.976152623211447, + "grad_norm": 2.191026673350536, + "learning_rate": 4.263311848582071e-05, + "loss": 0.1526, + "step": 3759 + }, + { + "epoch": 5.977742448330684, + "grad_norm": 1.773620167310186, + "learning_rate": 4.263873049965373e-05, + "loss": 0.1086, + "step": 3760 + }, + { + "epoch": 5.979332273449921, + "grad_norm": 3.1303147144288346, + "learning_rate": 4.2644341620372026e-05, + "loss": 0.1216, + "step": 3761 + }, + { + "epoch": 5.980922098569158, + "grad_norm": 2.554524742452868, + "learning_rate": 4.264995184607642e-05, + "loss": 0.1268, + "step": 3762 + }, + { + "epoch": 5.982511923688394, + "grad_norm": 3.4511912303530687, + "learning_rate": 4.2655561174868094e-05, + "loss": 0.1607, + "step": 3763 + }, + { + "epoch": 5.984101748807631, + "grad_norm": 2.049998293096109, + "learning_rate": 4.266116960484845e-05, + "loss": 0.1454, + "step": 3764 + }, + { + "epoch": 5.985691573926868, + "grad_norm": 1.9038803610007013, + "learning_rate": 4.2666777134119265e-05, + "loss": 0.1614, + "step": 3765 + }, + { + "epoch": 5.987281399046105, + "grad_norm": 2.549695902163392, + "learning_rate": 4.267238376078257e-05, + "loss": 0.1444, + "step": 3766 + }, + { + "epoch": 5.988871224165342, + "grad_norm": 4.7439712943432735, + "learning_rate": 4.2677989482940745e-05, + "loss": 0.1939, + "step": 3767 + }, + { + "epoch": 5.990461049284579, + "grad_norm": 1.8812991892629263, + "learning_rate": 4.2683594298696454e-05, + "loss": 0.1992, + "step": 3768 + }, + { + "epoch": 5.992050874403816, + "grad_norm": 2.931392826241659, + "learning_rate": 4.268919820615266e-05, + "loss": 0.2233, + "step": 3769 + }, + { + "epoch": 5.993640699523052, + "grad_norm": 5.387775404104411, + "learning_rate": 4.269480120341265e-05, + "loss": 0.171, + "step": 3770 + }, + { + "epoch": 5.995230524642289, + "grad_norm": 2.298986764644481, + "learning_rate": 4.2700403288580016e-05, + "loss": 0.2234, + "step": 3771 + }, + { + "epoch": 5.996820349761526, + "grad_norm": 2.0852416473100566, + "learning_rate": 4.270600445975863e-05, + "loss": 0.1084, + "step": 3772 + }, + { + "epoch": 5.998410174880763, + "grad_norm": 3.6764515676553198, + "learning_rate": 4.2711604715052736e-05, + "loss": 0.1454, + "step": 3773 + }, + { + "epoch": 6.0, + "grad_norm": 5.318312095701312, + "learning_rate": 4.271720405256683e-05, + "loss": 0.1595, + "step": 3774 + }, + { + "epoch": 6.001589825119237, + "grad_norm": 3.4884402915311212, + "learning_rate": 4.272280247040575e-05, + "loss": 0.1269, + "step": 3775 + }, + { + "epoch": 6.003179650238474, + "grad_norm": 4.243773906660166, + "learning_rate": 4.272839996667461e-05, + "loss": 0.2005, + "step": 3776 + }, + { + "epoch": 6.004769475357711, + "grad_norm": 3.9229473405231774, + "learning_rate": 4.2733996539478886e-05, + "loss": 0.1372, + "step": 3777 + }, + { + "epoch": 6.006359300476947, + "grad_norm": 2.794396756281207, + "learning_rate": 4.2739592186924327e-05, + "loss": 0.095, + "step": 3778 + }, + { + "epoch": 6.007949125596184, + "grad_norm": 6.262069749219166, + "learning_rate": 4.274518690711701e-05, + "loss": 0.1596, + "step": 3779 + }, + { + "epoch": 6.009538950715421, + "grad_norm": 3.0122092617712553, + "learning_rate": 4.275078069816334e-05, + "loss": 0.1602, + "step": 3780 + }, + { + "epoch": 6.011128775834658, + "grad_norm": 4.726415659973259, + "learning_rate": 4.2756373558169995e-05, + "loss": 0.1146, + "step": 3781 + }, + { + "epoch": 6.012718600953895, + "grad_norm": 2.006408911366836, + "learning_rate": 4.2761965485244006e-05, + "loss": 0.2206, + "step": 3782 + }, + { + "epoch": 6.014308426073132, + "grad_norm": 3.75415602853274, + "learning_rate": 4.2767556477492727e-05, + "loss": 0.2018, + "step": 3783 + }, + { + "epoch": 6.015898251192369, + "grad_norm": 2.936446543471104, + "learning_rate": 4.2773146533023784e-05, + "loss": 0.1493, + "step": 3784 + }, + { + "epoch": 6.017488076311606, + "grad_norm": 2.857405260675501, + "learning_rate": 4.2778735649945145e-05, + "loss": 0.1678, + "step": 3785 + }, + { + "epoch": 6.019077901430842, + "grad_norm": 3.226097668427512, + "learning_rate": 4.278432382636511e-05, + "loss": 0.1649, + "step": 3786 + }, + { + "epoch": 6.020667726550079, + "grad_norm": 3.2867825026679647, + "learning_rate": 4.2789911060392296e-05, + "loss": 0.2736, + "step": 3787 + }, + { + "epoch": 6.022257551669316, + "grad_norm": 2.7559008930501903, + "learning_rate": 4.2795497350135596e-05, + "loss": 0.2252, + "step": 3788 + }, + { + "epoch": 6.023847376788553, + "grad_norm": 1.8436379578720412, + "learning_rate": 4.2801082693704266e-05, + "loss": 0.1713, + "step": 3789 + }, + { + "epoch": 6.02543720190779, + "grad_norm": 1.6217653421494564, + "learning_rate": 4.280666708920788e-05, + "loss": 0.1475, + "step": 3790 + }, + { + "epoch": 6.027027027027027, + "grad_norm": 6.250305306558637, + "learning_rate": 4.281225053475631e-05, + "loss": 0.1969, + "step": 3791 + }, + { + "epoch": 6.028616852146264, + "grad_norm": 2.5982143918651195, + "learning_rate": 4.2817833028459764e-05, + "loss": 0.1261, + "step": 3792 + }, + { + "epoch": 6.030206677265501, + "grad_norm": 3.8688566625873055, + "learning_rate": 4.2823414568428767e-05, + "loss": 0.1747, + "step": 3793 + }, + { + "epoch": 6.031796502384737, + "grad_norm": 3.37110905728922, + "learning_rate": 4.2828995152774175e-05, + "loss": 0.1707, + "step": 3794 + }, + { + "epoch": 6.033386327503974, + "grad_norm": 3.0048936340114394, + "learning_rate": 4.283457477960716e-05, + "loss": 0.185, + "step": 3795 + }, + { + "epoch": 6.034976152623211, + "grad_norm": 4.196159303763704, + "learning_rate": 4.284015344703923e-05, + "loss": 0.1623, + "step": 3796 + }, + { + "epoch": 6.036565977742448, + "grad_norm": 2.678654712535462, + "learning_rate": 4.284573115318219e-05, + "loss": 0.1648, + "step": 3797 + }, + { + "epoch": 6.038155802861685, + "grad_norm": 4.783984180879707, + "learning_rate": 4.28513078961482e-05, + "loss": 0.1303, + "step": 3798 + }, + { + "epoch": 6.039745627980922, + "grad_norm": 4.980025871326622, + "learning_rate": 4.285688367404974e-05, + "loss": 0.2358, + "step": 3799 + }, + { + "epoch": 6.041335453100159, + "grad_norm": 4.118072292197232, + "learning_rate": 4.28624584849996e-05, + "loss": 0.1562, + "step": 3800 + }, + { + "epoch": 6.042925278219396, + "grad_norm": 2.2256264261706815, + "learning_rate": 4.28680323271109e-05, + "loss": 0.119, + "step": 3801 + }, + { + "epoch": 6.044515103338632, + "grad_norm": 3.0027988203392533, + "learning_rate": 4.287360519849712e-05, + "loss": 0.1348, + "step": 3802 + }, + { + "epoch": 6.046104928457869, + "grad_norm": 2.718964319786782, + "learning_rate": 4.287917709727203e-05, + "loss": 0.1351, + "step": 3803 + }, + { + "epoch": 6.047694753577106, + "grad_norm": 3.140571198690672, + "learning_rate": 4.288474802154975e-05, + "loss": 0.1632, + "step": 3804 + }, + { + "epoch": 6.049284578696343, + "grad_norm": 2.3037945455579787, + "learning_rate": 4.2890317969444724e-05, + "loss": 0.1907, + "step": 3805 + }, + { + "epoch": 6.05087440381558, + "grad_norm": 3.0103910103955545, + "learning_rate": 4.289588693907171e-05, + "loss": 0.1673, + "step": 3806 + }, + { + "epoch": 6.052464228934817, + "grad_norm": 3.390010101457781, + "learning_rate": 4.290145492854583e-05, + "loss": 0.1579, + "step": 3807 + }, + { + "epoch": 6.054054054054054, + "grad_norm": 2.88111431493689, + "learning_rate": 4.290702193598253e-05, + "loss": 0.1487, + "step": 3808 + }, + { + "epoch": 6.0556438791732905, + "grad_norm": 3.1973676098415282, + "learning_rate": 4.291258795949756e-05, + "loss": 0.1349, + "step": 3809 + }, + { + "epoch": 6.0572337042925275, + "grad_norm": 3.575724119152044, + "learning_rate": 4.2918152997207024e-05, + "loss": 0.1079, + "step": 3810 + }, + { + "epoch": 6.0588235294117645, + "grad_norm": 71.33525197738116, + "learning_rate": 4.292371704722737e-05, + "loss": 10.6124, + "step": 3811 + }, + { + "epoch": 6.0604133545310015, + "grad_norm": 2.183241093832023, + "learning_rate": 4.292928010767536e-05, + "loss": 0.1934, + "step": 3812 + }, + { + "epoch": 6.0620031796502385, + "grad_norm": 1.5925657311808048, + "learning_rate": 4.2934842176668104e-05, + "loss": 0.1391, + "step": 3813 + }, + { + "epoch": 6.0635930047694755, + "grad_norm": 6.127758633144182, + "learning_rate": 4.294040325232304e-05, + "loss": 0.2086, + "step": 3814 + }, + { + "epoch": 6.0651828298887125, + "grad_norm": 2.509417515341288, + "learning_rate": 4.294596333275795e-05, + "loss": 0.1249, + "step": 3815 + }, + { + "epoch": 6.0667726550079495, + "grad_norm": 3.2945839919856383, + "learning_rate": 4.295152241609094e-05, + "loss": 0.1487, + "step": 3816 + }, + { + "epoch": 6.068362480127186, + "grad_norm": 3.444841072552306, + "learning_rate": 4.295708050044047e-05, + "loss": 0.1742, + "step": 3817 + }, + { + "epoch": 6.069952305246423, + "grad_norm": 3.834275363744185, + "learning_rate": 4.296263758392532e-05, + "loss": 0.1568, + "step": 3818 + }, + { + "epoch": 6.07154213036566, + "grad_norm": 6.642829753042326, + "learning_rate": 4.296819366466463e-05, + "loss": 0.1626, + "step": 3819 + }, + { + "epoch": 6.073131955484897, + "grad_norm": 2.8470201065656724, + "learning_rate": 4.2973748740777864e-05, + "loss": 0.201, + "step": 3820 + }, + { + "epoch": 6.074721780604134, + "grad_norm": 10.09150841123969, + "learning_rate": 4.297930281038482e-05, + "loss": 1.2174, + "step": 3821 + }, + { + "epoch": 6.076311605723371, + "grad_norm": 6.875786685178163, + "learning_rate": 4.2984855871605664e-05, + "loss": 0.1341, + "step": 3822 + }, + { + "epoch": 6.077901430842608, + "grad_norm": 5.0376635553470255, + "learning_rate": 4.299040792256086e-05, + "loss": 0.1678, + "step": 3823 + }, + { + "epoch": 6.079491255961845, + "grad_norm": 4.191154685507322, + "learning_rate": 4.299595896137127e-05, + "loss": 0.2037, + "step": 3824 + }, + { + "epoch": 6.081081081081081, + "grad_norm": 7.7529853259891, + "learning_rate": 4.300150898615806e-05, + "loss": 0.1533, + "step": 3825 + }, + { + "epoch": 6.082670906200318, + "grad_norm": 4.303897390322826, + "learning_rate": 4.300705799504273e-05, + "loss": 0.0918, + "step": 3826 + }, + { + "epoch": 6.084260731319555, + "grad_norm": 10.221934450044976, + "learning_rate": 4.301260598614716e-05, + "loss": 0.3481, + "step": 3827 + }, + { + "epoch": 6.085850556438792, + "grad_norm": 3.4908912422427583, + "learning_rate": 4.3018152957593545e-05, + "loss": 0.1298, + "step": 3828 + }, + { + "epoch": 6.087440381558029, + "grad_norm": 7.8154249835252925, + "learning_rate": 4.3023698907504446e-05, + "loss": 0.2107, + "step": 3829 + }, + { + "epoch": 6.089030206677266, + "grad_norm": 4.977298396692504, + "learning_rate": 4.302924383400275e-05, + "loss": 0.1186, + "step": 3830 + }, + { + "epoch": 6.090620031796503, + "grad_norm": 3.9760716562457334, + "learning_rate": 4.3034787735211704e-05, + "loss": 0.1621, + "step": 3831 + }, + { + "epoch": 6.09220985691574, + "grad_norm": 9.35904315254086, + "learning_rate": 4.3040330609254906e-05, + "loss": 0.1956, + "step": 3832 + }, + { + "epoch": 6.093799682034976, + "grad_norm": 3.6522555121735336, + "learning_rate": 4.3045872454256286e-05, + "loss": 0.1183, + "step": 3833 + }, + { + "epoch": 6.095389507154213, + "grad_norm": 7.651878149124375, + "learning_rate": 4.305141326834012e-05, + "loss": 0.1708, + "step": 3834 + }, + { + "epoch": 6.09697933227345, + "grad_norm": 4.502420400954156, + "learning_rate": 4.305695304963106e-05, + "loss": 0.2289, + "step": 3835 + }, + { + "epoch": 6.098569157392687, + "grad_norm": 3.5525505976336174, + "learning_rate": 4.306249179625408e-05, + "loss": 0.1679, + "step": 3836 + }, + { + "epoch": 6.100158982511924, + "grad_norm": 5.062809461364913, + "learning_rate": 4.3068029506334525e-05, + "loss": 0.1086, + "step": 3837 + }, + { + "epoch": 6.101748807631161, + "grad_norm": 150.93394429795555, + "learning_rate": 4.307356617799807e-05, + "loss": 2.1138, + "step": 3838 + }, + { + "epoch": 6.103338632750398, + "grad_norm": 5.7193338505265565, + "learning_rate": 4.307910180937076e-05, + "loss": 0.1904, + "step": 3839 + }, + { + "epoch": 6.104928457869635, + "grad_norm": 13.067778126022434, + "learning_rate": 4.308463639857898e-05, + "loss": 1.6698, + "step": 3840 + }, + { + "epoch": 6.106518282988871, + "grad_norm": 6.513797903891124, + "learning_rate": 4.309016994374948e-05, + "loss": 1.3768, + "step": 3841 + }, + { + "epoch": 6.108108108108108, + "grad_norm": 4.648228525682147, + "learning_rate": 4.309570244300934e-05, + "loss": 0.2673, + "step": 3842 + }, + { + "epoch": 6.109697933227345, + "grad_norm": 2.902675380691719, + "learning_rate": 4.310123389448601e-05, + "loss": 0.1636, + "step": 3843 + }, + { + "epoch": 6.111287758346582, + "grad_norm": 3.6576679521238633, + "learning_rate": 4.310676429630732e-05, + "loss": 0.1829, + "step": 3844 + }, + { + "epoch": 6.112877583465819, + "grad_norm": 3.301364465893495, + "learning_rate": 4.31122936466014e-05, + "loss": 0.1763, + "step": 3845 + }, + { + "epoch": 6.114467408585056, + "grad_norm": 5.347002075878882, + "learning_rate": 4.311782194349678e-05, + "loss": 0.1684, + "step": 3846 + }, + { + "epoch": 6.116057233704293, + "grad_norm": 2.941263037780606, + "learning_rate": 4.3123349185122325e-05, + "loss": 0.1962, + "step": 3847 + }, + { + "epoch": 6.117647058823529, + "grad_norm": 4.588665714552765, + "learning_rate": 4.312887536960727e-05, + "loss": 0.2269, + "step": 3848 + }, + { + "epoch": 6.119236883942766, + "grad_norm": 5.391831968229123, + "learning_rate": 4.31344004950812e-05, + "loss": 0.1778, + "step": 3849 + }, + { + "epoch": 6.120826709062003, + "grad_norm": 4.862871169589613, + "learning_rate": 4.3139924559674054e-05, + "loss": 0.1299, + "step": 3850 + }, + { + "epoch": 6.12241653418124, + "grad_norm": 3.951072356042652, + "learning_rate": 4.314544756151614e-05, + "loss": 0.208, + "step": 3851 + }, + { + "epoch": 6.124006359300477, + "grad_norm": 3.6290899603033955, + "learning_rate": 4.3150969498738125e-05, + "loss": 0.2119, + "step": 3852 + }, + { + "epoch": 6.125596184419714, + "grad_norm": 7.00841532181519, + "learning_rate": 4.315649036947103e-05, + "loss": 0.2783, + "step": 3853 + }, + { + "epoch": 6.127186009538951, + "grad_norm": 3.7113663186183263, + "learning_rate": 4.316201017184623e-05, + "loss": 0.1852, + "step": 3854 + }, + { + "epoch": 6.128775834658188, + "grad_norm": 3.3012612358963604, + "learning_rate": 4.31675289039955e-05, + "loss": 0.1777, + "step": 3855 + }, + { + "epoch": 6.130365659777424, + "grad_norm": 8.017046671679289, + "learning_rate": 4.317304656405092e-05, + "loss": 0.2217, + "step": 3856 + }, + { + "epoch": 6.131955484896661, + "grad_norm": 70.68526557637794, + "learning_rate": 4.317856315014498e-05, + "loss": 3.707, + "step": 3857 + }, + { + "epoch": 6.133545310015898, + "grad_norm": 3.042082201955096, + "learning_rate": 4.3184078660410507e-05, + "loss": 0.2327, + "step": 3858 + }, + { + "epoch": 6.135135135135135, + "grad_norm": 4.84759485176823, + "learning_rate": 4.31895930929807e-05, + "loss": 0.1903, + "step": 3859 + }, + { + "epoch": 6.136724960254372, + "grad_norm": 3.6893624471946547, + "learning_rate": 4.319510644598913e-05, + "loss": 0.1567, + "step": 3860 + }, + { + "epoch": 6.138314785373609, + "grad_norm": 2.603669842919212, + "learning_rate": 4.320061871756972e-05, + "loss": 0.1913, + "step": 3861 + }, + { + "epoch": 6.139904610492846, + "grad_norm": 6.2915017321647735, + "learning_rate": 4.320612990585676e-05, + "loss": 0.1737, + "step": 3862 + }, + { + "epoch": 6.141494435612083, + "grad_norm": 5.185503510259138, + "learning_rate": 4.321164000898493e-05, + "loss": 0.2781, + "step": 3863 + }, + { + "epoch": 6.143084260731319, + "grad_norm": 3.3337601476475736, + "learning_rate": 4.321714902508925e-05, + "loss": 0.1484, + "step": 3864 + }, + { + "epoch": 6.144674085850556, + "grad_norm": 8.966649868343184, + "learning_rate": 4.322265695230511e-05, + "loss": 0.2052, + "step": 3865 + }, + { + "epoch": 6.146263910969793, + "grad_norm": 4.802419078747724, + "learning_rate": 4.3228163788768295e-05, + "loss": 0.1456, + "step": 3866 + }, + { + "epoch": 6.14785373608903, + "grad_norm": 4.239593222104979, + "learning_rate": 4.3233669532614915e-05, + "loss": 0.1216, + "step": 3867 + }, + { + "epoch": 6.149443561208267, + "grad_norm": 7.273711956229438, + "learning_rate": 4.323917418198149e-05, + "loss": 0.2475, + "step": 3868 + }, + { + "epoch": 6.151033386327504, + "grad_norm": 4.407103720587932, + "learning_rate": 4.3244677735004905e-05, + "loss": 0.1232, + "step": 3869 + }, + { + "epoch": 6.152623211446741, + "grad_norm": 3.1849859604425705, + "learning_rate": 4.325018018982239e-05, + "loss": 0.1436, + "step": 3870 + }, + { + "epoch": 6.154213036565977, + "grad_norm": 3.5664583600182356, + "learning_rate": 4.3255681544571564e-05, + "loss": 0.1683, + "step": 3871 + }, + { + "epoch": 6.155802861685214, + "grad_norm": 4.233853595089014, + "learning_rate": 4.3261181797390426e-05, + "loss": 0.1624, + "step": 3872 + }, + { + "epoch": 6.157392686804451, + "grad_norm": 2.9067073598039035, + "learning_rate": 4.3266680946417345e-05, + "loss": 0.1538, + "step": 3873 + }, + { + "epoch": 6.158982511923688, + "grad_norm": 6.710175670754754, + "learning_rate": 4.327217898979104e-05, + "loss": 0.1674, + "step": 3874 + }, + { + "epoch": 6.160572337042925, + "grad_norm": 11.514694121061552, + "learning_rate": 4.3277675925650634e-05, + "loss": 0.4228, + "step": 3875 + }, + { + "epoch": 6.162162162162162, + "grad_norm": 10.555504160079467, + "learning_rate": 4.328317175213561e-05, + "loss": 0.2889, + "step": 3876 + }, + { + "epoch": 6.163751987281399, + "grad_norm": 3.3750221255545134, + "learning_rate": 4.328866646738583e-05, + "loss": 0.1417, + "step": 3877 + }, + { + "epoch": 6.165341812400636, + "grad_norm": 2.7676116640245776, + "learning_rate": 4.329416006954154e-05, + "loss": 0.1512, + "step": 3878 + }, + { + "epoch": 6.166931637519872, + "grad_norm": 2.383154568289766, + "learning_rate": 4.329965255674334e-05, + "loss": 0.1176, + "step": 3879 + }, + { + "epoch": 6.168521462639109, + "grad_norm": 2.129204080011837, + "learning_rate": 4.3305143927132236e-05, + "loss": 0.1248, + "step": 3880 + }, + { + "epoch": 6.170111287758346, + "grad_norm": 3.5627789313469482, + "learning_rate": 4.331063417884958e-05, + "loss": 0.2035, + "step": 3881 + }, + { + "epoch": 6.171701112877583, + "grad_norm": 3.2863234685301315, + "learning_rate": 4.331612331003714e-05, + "loss": 0.1618, + "step": 3882 + }, + { + "epoch": 6.17329093799682, + "grad_norm": 3.9247289422727683, + "learning_rate": 4.332161131883703e-05, + "loss": 0.1537, + "step": 3883 + }, + { + "epoch": 6.174880763116057, + "grad_norm": 2.3924770675834552, + "learning_rate": 4.332709820339177e-05, + "loss": 0.1695, + "step": 3884 + }, + { + "epoch": 6.176470588235294, + "grad_norm": 2.2163912888938047, + "learning_rate": 4.333258396184424e-05, + "loss": 0.1422, + "step": 3885 + }, + { + "epoch": 6.178060413354531, + "grad_norm": 2.747925814600689, + "learning_rate": 4.333806859233771e-05, + "loss": 0.2105, + "step": 3886 + }, + { + "epoch": 6.1796502384737675, + "grad_norm": 3.768341278931603, + "learning_rate": 4.334355209301584e-05, + "loss": 0.1332, + "step": 3887 + }, + { + "epoch": 6.1812400635930045, + "grad_norm": 1.8780248470998868, + "learning_rate": 4.3349034462022646e-05, + "loss": 0.1549, + "step": 3888 + }, + { + "epoch": 6.1828298887122415, + "grad_norm": 3.397112397573954, + "learning_rate": 4.335451569750255e-05, + "loss": 0.1509, + "step": 3889 + }, + { + "epoch": 6.1844197138314785, + "grad_norm": 4.14598729826173, + "learning_rate": 4.3359995797600367e-05, + "loss": 0.1771, + "step": 3890 + }, + { + "epoch": 6.1860095389507155, + "grad_norm": 2.3819821296917314, + "learning_rate": 4.3365474760461266e-05, + "loss": 0.2591, + "step": 3891 + }, + { + "epoch": 6.1875993640699525, + "grad_norm": 8.589642173477007, + "learning_rate": 4.337095258423082e-05, + "loss": 0.2104, + "step": 3892 + }, + { + "epoch": 6.1891891891891895, + "grad_norm": 3.121592684422043, + "learning_rate": 4.337642926705499e-05, + "loss": 0.1396, + "step": 3893 + }, + { + "epoch": 6.1907790143084265, + "grad_norm": 4.456598861147551, + "learning_rate": 4.3381904807080114e-05, + "loss": 0.1414, + "step": 3894 + }, + { + "epoch": 6.192368839427663, + "grad_norm": 5.568981772603662, + "learning_rate": 4.3387379202452916e-05, + "loss": 0.1577, + "step": 3895 + }, + { + "epoch": 6.1939586645469, + "grad_norm": 3.159193650495683, + "learning_rate": 4.339285245132051e-05, + "loss": 0.1398, + "step": 3896 + }, + { + "epoch": 6.195548489666137, + "grad_norm": 4.968549981966246, + "learning_rate": 4.339832455183042e-05, + "loss": 0.1584, + "step": 3897 + }, + { + "epoch": 6.197138314785374, + "grad_norm": 5.384828594762002, + "learning_rate": 4.34037955021305e-05, + "loss": 0.1825, + "step": 3898 + }, + { + "epoch": 6.198728139904611, + "grad_norm": 3.884476370622544, + "learning_rate": 4.3409265300369066e-05, + "loss": 0.1376, + "step": 3899 + }, + { + "epoch": 6.200317965023848, + "grad_norm": 6.0543881244866675, + "learning_rate": 4.341473394469477e-05, + "loss": 0.1256, + "step": 3900 + }, + { + "epoch": 6.201907790143085, + "grad_norm": 3.312737600605519, + "learning_rate": 4.342020143325669e-05, + "loss": 0.13, + "step": 3901 + }, + { + "epoch": 6.203497615262322, + "grad_norm": 2.5131444217689736, + "learning_rate": 4.342566776420426e-05, + "loss": 0.1321, + "step": 3902 + }, + { + "epoch": 6.205087440381558, + "grad_norm": 2.689499730366301, + "learning_rate": 4.3431132935687345e-05, + "loss": 0.1661, + "step": 3903 + }, + { + "epoch": 6.206677265500795, + "grad_norm": 4.062436930000234, + "learning_rate": 4.343659694585616e-05, + "loss": 0.1587, + "step": 3904 + }, + { + "epoch": 6.208267090620032, + "grad_norm": 3.751191597129353, + "learning_rate": 4.344205979286136e-05, + "loss": 0.1388, + "step": 3905 + }, + { + "epoch": 6.209856915739269, + "grad_norm": 25.09997476149906, + "learning_rate": 4.3447521474853946e-05, + "loss": 2.8884, + "step": 3906 + }, + { + "epoch": 6.211446740858506, + "grad_norm": 5.1030592467879075, + "learning_rate": 4.345298198998535e-05, + "loss": 0.2002, + "step": 3907 + }, + { + "epoch": 6.213036565977743, + "grad_norm": 2.8430710119335934, + "learning_rate": 4.345844133640738e-05, + "loss": 0.2205, + "step": 3908 + }, + { + "epoch": 6.21462639109698, + "grad_norm": 3.477673301205507, + "learning_rate": 4.3463899512272245e-05, + "loss": 0.1673, + "step": 3909 + }, + { + "epoch": 6.216216216216216, + "grad_norm": 2.4435639432317577, + "learning_rate": 4.346935651573256e-05, + "loss": 0.1755, + "step": 3910 + }, + { + "epoch": 6.217806041335453, + "grad_norm": 5.640203551544657, + "learning_rate": 4.347481234494132e-05, + "loss": 0.1982, + "step": 3911 + }, + { + "epoch": 6.21939586645469, + "grad_norm": 2.621056197909092, + "learning_rate": 4.348026699805191e-05, + "loss": 0.1807, + "step": 3912 + }, + { + "epoch": 6.220985691573927, + "grad_norm": 4.201967597534844, + "learning_rate": 4.3485720473218155e-05, + "loss": 0.1649, + "step": 3913 + }, + { + "epoch": 6.222575516693164, + "grad_norm": 4.25098358469494, + "learning_rate": 4.349117276859423e-05, + "loss": 0.1553, + "step": 3914 + }, + { + "epoch": 6.224165341812401, + "grad_norm": 2.830675159770114, + "learning_rate": 4.349662388233474e-05, + "loss": 0.1976, + "step": 3915 + }, + { + "epoch": 6.225755166931638, + "grad_norm": 4.468595334849825, + "learning_rate": 4.350207381259468e-05, + "loss": 0.1684, + "step": 3916 + }, + { + "epoch": 6.227344992050875, + "grad_norm": 4.3087209837071425, + "learning_rate": 4.3507522557529436e-05, + "loss": 0.2082, + "step": 3917 + }, + { + "epoch": 6.228934817170111, + "grad_norm": 1.6172787904498622, + "learning_rate": 4.3512970115294824e-05, + "loss": 0.1722, + "step": 3918 + }, + { + "epoch": 6.230524642289348, + "grad_norm": 3.9494249201520426, + "learning_rate": 4.3518416484047024e-05, + "loss": 0.1643, + "step": 3919 + }, + { + "epoch": 6.232114467408585, + "grad_norm": 6.5709518797510045, + "learning_rate": 4.352386166194264e-05, + "loss": 0.2218, + "step": 3920 + }, + { + "epoch": 6.233704292527822, + "grad_norm": 3.4198094920248145, + "learning_rate": 4.352930564713869e-05, + "loss": 0.206, + "step": 3921 + }, + { + "epoch": 6.235294117647059, + "grad_norm": 4.866579287383695, + "learning_rate": 4.353474843779257e-05, + "loss": 0.1813, + "step": 3922 + }, + { + "epoch": 6.236883942766296, + "grad_norm": 5.707231703452576, + "learning_rate": 4.35401900320621e-05, + "loss": 0.1756, + "step": 3923 + }, + { + "epoch": 6.238473767885533, + "grad_norm": 29.071499033451776, + "learning_rate": 4.3545630428105496e-05, + "loss": 4.3931, + "step": 3924 + }, + { + "epoch": 6.24006359300477, + "grad_norm": 4.465577590757787, + "learning_rate": 4.355106962408137e-05, + "loss": 0.1738, + "step": 3925 + }, + { + "epoch": 6.241653418124006, + "grad_norm": 2.1650333085123443, + "learning_rate": 4.355650761814877e-05, + "loss": 0.1553, + "step": 3926 + }, + { + "epoch": 6.243243243243243, + "grad_norm": 2.2962628034425334, + "learning_rate": 4.356194440846712e-05, + "loss": 0.1518, + "step": 3927 + }, + { + "epoch": 6.24483306836248, + "grad_norm": 1.7792538560580018, + "learning_rate": 4.3567379993196256e-05, + "loss": 0.1259, + "step": 3928 + }, + { + "epoch": 6.246422893481717, + "grad_norm": 3.5304963914868286, + "learning_rate": 4.357281437049644e-05, + "loss": 0.1179, + "step": 3929 + }, + { + "epoch": 6.248012718600954, + "grad_norm": 2.9884946226557916, + "learning_rate": 4.357824753852833e-05, + "loss": 0.161, + "step": 3930 + }, + { + "epoch": 6.249602543720191, + "grad_norm": 2.1067982257152598, + "learning_rate": 4.3583679495453e-05, + "loss": 0.1654, + "step": 3931 + }, + { + "epoch": 6.251192368839428, + "grad_norm": 31.12845886160391, + "learning_rate": 4.3589110239431935e-05, + "loss": 4.0818, + "step": 3932 + }, + { + "epoch": 6.252782193958664, + "grad_norm": 2.597854733457868, + "learning_rate": 4.3594539768626994e-05, + "loss": 0.1427, + "step": 3933 + }, + { + "epoch": 6.254372019077901, + "grad_norm": 2.3924966673761277, + "learning_rate": 4.359996808120051e-05, + "loss": 0.1923, + "step": 3934 + }, + { + "epoch": 6.255961844197138, + "grad_norm": 1.9340228087519844, + "learning_rate": 4.360539517531519e-05, + "loss": 0.1727, + "step": 3935 + }, + { + "epoch": 6.257551669316375, + "grad_norm": 2.5076155002887695, + "learning_rate": 4.361082104913414e-05, + "loss": 0.2053, + "step": 3936 + }, + { + "epoch": 6.259141494435612, + "grad_norm": 2.6511006976190123, + "learning_rate": 4.361624570082092e-05, + "loss": 0.9781, + "step": 3937 + }, + { + "epoch": 6.260731319554849, + "grad_norm": 2.17775592772492, + "learning_rate": 4.362166912853948e-05, + "loss": 0.1698, + "step": 3938 + }, + { + "epoch": 6.262321144674086, + "grad_norm": 2.849841478849213, + "learning_rate": 4.362709133045417e-05, + "loss": 0.1176, + "step": 3939 + }, + { + "epoch": 6.263910969793323, + "grad_norm": 3.4625957848509277, + "learning_rate": 4.3632512304729785e-05, + "loss": 0.1651, + "step": 3940 + }, + { + "epoch": 6.26550079491256, + "grad_norm": 3.409503292909845, + "learning_rate": 4.363793204953151e-05, + "loss": 0.1591, + "step": 3941 + }, + { + "epoch": 6.267090620031796, + "grad_norm": 2.7455618796468944, + "learning_rate": 4.364335056302498e-05, + "loss": 0.1837, + "step": 3942 + }, + { + "epoch": 6.268680445151033, + "grad_norm": 2.299385118871967, + "learning_rate": 4.3648767843376195e-05, + "loss": 0.2417, + "step": 3943 + }, + { + "epoch": 6.27027027027027, + "grad_norm": 6.974947695290759, + "learning_rate": 4.365418388875163e-05, + "loss": 0.1618, + "step": 3944 + }, + { + "epoch": 6.271860095389507, + "grad_norm": 20.139283973390842, + "learning_rate": 4.3659598697318125e-05, + "loss": 1.8979, + "step": 3945 + }, + { + "epoch": 6.273449920508744, + "grad_norm": 4.46792457222011, + "learning_rate": 4.3665012267242977e-05, + "loss": 0.1517, + "step": 3946 + }, + { + "epoch": 6.275039745627981, + "grad_norm": 5.430668558918035, + "learning_rate": 4.3670424596693885e-05, + "loss": 0.179, + "step": 3947 + }, + { + "epoch": 6.276629570747218, + "grad_norm": 8.806930581085638, + "learning_rate": 4.367583568383897e-05, + "loss": 0.1527, + "step": 3948 + }, + { + "epoch": 6.278219395866454, + "grad_norm": 3.085863624247372, + "learning_rate": 4.368124552684678e-05, + "loss": 0.1367, + "step": 3949 + }, + { + "epoch": 6.279809220985691, + "grad_norm": 7.220349875183313, + "learning_rate": 4.368665412388628e-05, + "loss": 0.1754, + "step": 3950 + }, + { + "epoch": 6.281399046104928, + "grad_norm": 5.658028104718708, + "learning_rate": 4.369206147312685e-05, + "loss": 0.3334, + "step": 3951 + }, + { + "epoch": 6.282988871224165, + "grad_norm": 5.539576982911385, + "learning_rate": 4.369746757273829e-05, + "loss": 0.1863, + "step": 3952 + }, + { + "epoch": 6.284578696343402, + "grad_norm": 3.4115569762090647, + "learning_rate": 4.3702872420890856e-05, + "loss": 0.1841, + "step": 3953 + }, + { + "epoch": 6.286168521462639, + "grad_norm": 2.95133926036608, + "learning_rate": 4.370827601575518e-05, + "loss": 0.1684, + "step": 3954 + }, + { + "epoch": 6.287758346581876, + "grad_norm": 4.357053699679026, + "learning_rate": 4.3713678355502345e-05, + "loss": 0.1422, + "step": 3955 + }, + { + "epoch": 6.289348171701113, + "grad_norm": 4.757250158603028, + "learning_rate": 4.371907943830387e-05, + "loss": 0.1497, + "step": 3956 + }, + { + "epoch": 6.290937996820349, + "grad_norm": 2.129048609284599, + "learning_rate": 4.372447926233166e-05, + "loss": 0.1621, + "step": 3957 + }, + { + "epoch": 6.292527821939586, + "grad_norm": 4.40707957764367, + "learning_rate": 4.372987782575809e-05, + "loss": 0.1522, + "step": 3958 + }, + { + "epoch": 6.294117647058823, + "grad_norm": 3.2921704874318585, + "learning_rate": 4.373527512675593e-05, + "loss": 0.1064, + "step": 3959 + }, + { + "epoch": 6.29570747217806, + "grad_norm": 4.194949316250937, + "learning_rate": 4.37406711634984e-05, + "loss": 0.1633, + "step": 3960 + }, + { + "epoch": 6.297297297297297, + "grad_norm": 3.4304651168810727, + "learning_rate": 4.3746065934159124e-05, + "loss": 0.1651, + "step": 3961 + }, + { + "epoch": 6.298887122416534, + "grad_norm": 5.450821105527862, + "learning_rate": 4.3751459436912175e-05, + "loss": 0.1768, + "step": 3962 + }, + { + "epoch": 6.300476947535771, + "grad_norm": 2.3794906448305384, + "learning_rate": 4.3756851669932046e-05, + "loss": 0.1338, + "step": 3963 + }, + { + "epoch": 6.302066772655008, + "grad_norm": 2.077951110374839, + "learning_rate": 4.376224263139366e-05, + "loss": 0.1758, + "step": 3964 + }, + { + "epoch": 6.3036565977742445, + "grad_norm": 6.124251727409857, + "learning_rate": 4.376763231947237e-05, + "loss": 0.1776, + "step": 3965 + }, + { + "epoch": 6.3052464228934815, + "grad_norm": 2.855246983547, + "learning_rate": 4.377302073234397e-05, + "loss": 0.1882, + "step": 3966 + }, + { + "epoch": 6.3068362480127185, + "grad_norm": 2.574696234319509, + "learning_rate": 4.3778407868184675e-05, + "loss": 0.1668, + "step": 3967 + }, + { + "epoch": 6.3084260731319555, + "grad_norm": 2.766116413131408, + "learning_rate": 4.3783793725171124e-05, + "loss": 0.1244, + "step": 3968 + }, + { + "epoch": 6.3100158982511925, + "grad_norm": 4.226374302376006, + "learning_rate": 4.3789178301480416e-05, + "loss": 0.2008, + "step": 3969 + }, + { + "epoch": 6.3116057233704295, + "grad_norm": 2.5496433325477605, + "learning_rate": 4.3794561595290055e-05, + "loss": 0.1431, + "step": 3970 + }, + { + "epoch": 6.3131955484896665, + "grad_norm": 4.024387961300335, + "learning_rate": 4.379994360477799e-05, + "loss": 0.2409, + "step": 3971 + }, + { + "epoch": 6.314785373608903, + "grad_norm": 2.9855026237023234, + "learning_rate": 4.380532432812262e-05, + "loss": 0.1572, + "step": 3972 + }, + { + "epoch": 6.31637519872814, + "grad_norm": 2.504949952793185, + "learning_rate": 4.3810703763502744e-05, + "loss": 0.1943, + "step": 3973 + }, + { + "epoch": 6.317965023847377, + "grad_norm": 4.456025650969578, + "learning_rate": 4.381608190909764e-05, + "loss": 0.1773, + "step": 3974 + }, + { + "epoch": 6.319554848966614, + "grad_norm": 4.230966444694455, + "learning_rate": 4.3821458763086973e-05, + "loss": 0.1655, + "step": 3975 + }, + { + "epoch": 6.321144674085851, + "grad_norm": 3.889743287859853, + "learning_rate": 4.3826834323650894e-05, + "loss": 0.1514, + "step": 3976 + }, + { + "epoch": 6.322734499205088, + "grad_norm": 5.482141271530701, + "learning_rate": 4.383220858896997e-05, + "loss": 0.1833, + "step": 3977 + }, + { + "epoch": 6.324324324324325, + "grad_norm": 3.3728747554798133, + "learning_rate": 4.383758155722521e-05, + "loss": 0.1779, + "step": 3978 + }, + { + "epoch": 6.325914149443562, + "grad_norm": 4.013211988904892, + "learning_rate": 4.3842953226598035e-05, + "loss": 0.1982, + "step": 3979 + }, + { + "epoch": 6.327503974562799, + "grad_norm": 3.0494297197057545, + "learning_rate": 4.3848323595270355e-05, + "loss": 0.1605, + "step": 3980 + }, + { + "epoch": 6.329093799682035, + "grad_norm": 4.567958516742571, + "learning_rate": 4.385369266142448e-05, + "loss": 0.2152, + "step": 3981 + }, + { + "epoch": 6.330683624801272, + "grad_norm": 4.599613776330691, + "learning_rate": 4.3859060423243186e-05, + "loss": 0.1508, + "step": 3982 + }, + { + "epoch": 6.332273449920509, + "grad_norm": 3.4676506573444463, + "learning_rate": 4.3864426878909674e-05, + "loss": 0.1543, + "step": 3983 + }, + { + "epoch": 6.333863275039746, + "grad_norm": 4.927328704589414, + "learning_rate": 4.386979202660759e-05, + "loss": 0.1763, + "step": 3984 + }, + { + "epoch": 6.335453100158983, + "grad_norm": 4.313460723722417, + "learning_rate": 4.387515586452103e-05, + "loss": 0.2037, + "step": 3985 + }, + { + "epoch": 6.33704292527822, + "grad_norm": 4.337815171072039, + "learning_rate": 4.388051839083453e-05, + "loss": 0.1858, + "step": 3986 + }, + { + "epoch": 6.338632750397457, + "grad_norm": 3.6383564827206554, + "learning_rate": 4.388587960373307e-05, + "loss": 0.1315, + "step": 3987 + }, + { + "epoch": 6.340222575516693, + "grad_norm": 1.654123914660149, + "learning_rate": 4.3891239501402065e-05, + "loss": 0.1903, + "step": 3988 + }, + { + "epoch": 6.34181240063593, + "grad_norm": 7.734505371407981, + "learning_rate": 4.389659808202739e-05, + "loss": 0.1664, + "step": 3989 + }, + { + "epoch": 6.343402225755167, + "grad_norm": 2.724978710975828, + "learning_rate": 4.390195534379536e-05, + "loss": 0.1593, + "step": 3990 + }, + { + "epoch": 6.344992050874404, + "grad_norm": 5.500497667992429, + "learning_rate": 4.390731128489274e-05, + "loss": 0.1781, + "step": 3991 + }, + { + "epoch": 6.346581875993641, + "grad_norm": 4.034119971973808, + "learning_rate": 4.391266590350673e-05, + "loss": 0.1882, + "step": 3992 + }, + { + "epoch": 6.348171701112878, + "grad_norm": 12.54149412168064, + "learning_rate": 4.391801919782499e-05, + "loss": 1.9821, + "step": 3993 + }, + { + "epoch": 6.349761526232115, + "grad_norm": 17.3376403394392, + "learning_rate": 4.3923371166035616e-05, + "loss": 0.9703, + "step": 3994 + }, + { + "epoch": 6.351351351351352, + "grad_norm": 3.017023413090191, + "learning_rate": 4.392872180632717e-05, + "loss": 0.203, + "step": 3995 + }, + { + "epoch": 6.352941176470588, + "grad_norm": 3.296320082005013, + "learning_rate": 4.393407111688865e-05, + "loss": 0.1349, + "step": 3996 + }, + { + "epoch": 6.354531001589825, + "grad_norm": 3.994641021796883, + "learning_rate": 4.3939419095909514e-05, + "loss": 0.1319, + "step": 3997 + }, + { + "epoch": 6.356120826709062, + "grad_norm": 5.184669169997193, + "learning_rate": 4.394476574157965e-05, + "loss": 0.2358, + "step": 3998 + }, + { + "epoch": 6.357710651828299, + "grad_norm": 2.877361955874784, + "learning_rate": 4.395011105208944e-05, + "loss": 0.11, + "step": 3999 + }, + { + "epoch": 6.359300476947536, + "grad_norm": 4.2293162396193695, + "learning_rate": 4.395545502562965e-05, + "loss": 0.2166, + "step": 4000 + }, + { + "epoch": 6.360890302066773, + "grad_norm": 10.963684929100229, + "learning_rate": 4.3960797660391575e-05, + "loss": 0.1714, + "step": 4001 + }, + { + "epoch": 6.36248012718601, + "grad_norm": 5.59487236743823, + "learning_rate": 4.39661389545669e-05, + "loss": 0.2562, + "step": 4002 + }, + { + "epoch": 6.364069952305247, + "grad_norm": 10.061653467332205, + "learning_rate": 4.397147890634781e-05, + "loss": 0.171, + "step": 4003 + }, + { + "epoch": 6.365659777424483, + "grad_norm": 8.622192094674991, + "learning_rate": 4.3976817513926916e-05, + "loss": 0.2221, + "step": 4004 + }, + { + "epoch": 6.36724960254372, + "grad_norm": 5.7998971194707245, + "learning_rate": 4.398215477549728e-05, + "loss": 0.1493, + "step": 4005 + }, + { + "epoch": 6.368839427662957, + "grad_norm": 4.458834685644778, + "learning_rate": 4.3987490689252466e-05, + "loss": 0.1542, + "step": 4006 + }, + { + "epoch": 6.370429252782194, + "grad_norm": 3.3330938767392038, + "learning_rate": 4.399282525338643e-05, + "loss": 0.1468, + "step": 4007 + }, + { + "epoch": 6.372019077901431, + "grad_norm": 6.740973887431507, + "learning_rate": 4.399815846609363e-05, + "loss": 0.2361, + "step": 4008 + }, + { + "epoch": 6.373608903020668, + "grad_norm": 10.789707915495368, + "learning_rate": 4.400349032556895e-05, + "loss": 0.1725, + "step": 4009 + }, + { + "epoch": 6.375198728139905, + "grad_norm": 4.806425141030187, + "learning_rate": 4.400882083000777e-05, + "loss": 0.1707, + "step": 4010 + }, + { + "epoch": 6.376788553259141, + "grad_norm": 9.725136352271948, + "learning_rate": 4.40141499776059e-05, + "loss": 0.1953, + "step": 4011 + }, + { + "epoch": 6.378378378378378, + "grad_norm": 6.753796184065153, + "learning_rate": 4.4019477766559604e-05, + "loss": 0.1999, + "step": 4012 + }, + { + "epoch": 6.379968203497615, + "grad_norm": 8.556776131280252, + "learning_rate": 4.402480419506563e-05, + "loss": 0.1924, + "step": 4013 + }, + { + "epoch": 6.381558028616852, + "grad_norm": 5.99420785941086, + "learning_rate": 4.403012926132118e-05, + "loss": 0.184, + "step": 4014 + }, + { + "epoch": 6.383147853736089, + "grad_norm": 6.778864723894779, + "learning_rate": 4.40354529635239e-05, + "loss": 0.1947, + "step": 4015 + }, + { + "epoch": 6.384737678855326, + "grad_norm": 9.074649302615452, + "learning_rate": 4.4040775299871915e-05, + "loss": 0.1685, + "step": 4016 + }, + { + "epoch": 6.386327503974563, + "grad_norm": 8.00117527891903, + "learning_rate": 4.404609626856381e-05, + "loss": 0.1327, + "step": 4017 + }, + { + "epoch": 6.3879173290938, + "grad_norm": 5.350642809195212, + "learning_rate": 4.405141586779863e-05, + "loss": 0.1674, + "step": 4018 + }, + { + "epoch": 6.389507154213036, + "grad_norm": 2.755580722538118, + "learning_rate": 4.405673409577587e-05, + "loss": 0.1586, + "step": 4019 + }, + { + "epoch": 6.391096979332273, + "grad_norm": 14.758678022053596, + "learning_rate": 4.406205095069552e-05, + "loss": 0.2181, + "step": 4020 + }, + { + "epoch": 6.39268680445151, + "grad_norm": 5.092218507342331, + "learning_rate": 4.4067366430758e-05, + "loss": 0.1369, + "step": 4021 + }, + { + "epoch": 6.394276629570747, + "grad_norm": 4.934550163339906, + "learning_rate": 4.407268053416423e-05, + "loss": 0.225, + "step": 4022 + }, + { + "epoch": 6.395866454689984, + "grad_norm": 3.70864030634627, + "learning_rate": 4.4077993259115566e-05, + "loss": 0.1717, + "step": 4023 + }, + { + "epoch": 6.397456279809221, + "grad_norm": 17.158089789920272, + "learning_rate": 4.408330460381385e-05, + "loss": 0.22, + "step": 4024 + }, + { + "epoch": 6.399046104928458, + "grad_norm": 5.59861792737576, + "learning_rate": 4.408861456646138e-05, + "loss": 0.2167, + "step": 4025 + }, + { + "epoch": 6.400635930047695, + "grad_norm": 5.502191655174385, + "learning_rate": 4.409392314526093e-05, + "loss": 0.2082, + "step": 4026 + }, + { + "epoch": 6.402225755166931, + "grad_norm": 7.3341683384059175, + "learning_rate": 4.4099230338415726e-05, + "loss": 0.2589, + "step": 4027 + }, + { + "epoch": 6.403815580286168, + "grad_norm": 7.251789428759672, + "learning_rate": 4.410453614412949e-05, + "loss": 0.1912, + "step": 4028 + }, + { + "epoch": 6.405405405405405, + "grad_norm": 43.25777964597279, + "learning_rate": 4.4109840560606396e-05, + "loss": 1.5185, + "step": 4029 + }, + { + "epoch": 6.406995230524642, + "grad_norm": 5.571300065924355, + "learning_rate": 4.411514358605109e-05, + "loss": 0.1964, + "step": 4030 + }, + { + "epoch": 6.408585055643879, + "grad_norm": 7.2224813416629585, + "learning_rate": 4.4120445218668686e-05, + "loss": 0.1734, + "step": 4031 + }, + { + "epoch": 6.410174880763116, + "grad_norm": 3.9078555752465776, + "learning_rate": 4.4125745456664776e-05, + "loss": 0.1691, + "step": 4032 + }, + { + "epoch": 6.411764705882353, + "grad_norm": 4.271061963032607, + "learning_rate": 4.4131044298245425e-05, + "loss": 0.1303, + "step": 4033 + }, + { + "epoch": 6.413354531001589, + "grad_norm": 178.11054594536967, + "learning_rate": 4.4136341741617154e-05, + "loss": 1.4875, + "step": 4034 + }, + { + "epoch": 6.414944356120826, + "grad_norm": 5.612295686705271, + "learning_rate": 4.414163778498698e-05, + "loss": 0.2064, + "step": 4035 + }, + { + "epoch": 6.416534181240063, + "grad_norm": 6.098859908724126, + "learning_rate": 4.414693242656239e-05, + "loss": 0.151, + "step": 4036 + }, + { + "epoch": 6.4181240063593, + "grad_norm": 3.203231709855121, + "learning_rate": 4.4152225664551336e-05, + "loss": 0.1848, + "step": 4037 + }, + { + "epoch": 6.419713831478537, + "grad_norm": 7.3406589784435265, + "learning_rate": 4.4157517497162246e-05, + "loss": 0.1805, + "step": 4038 + }, + { + "epoch": 6.421303656597774, + "grad_norm": 3.000364300952209, + "learning_rate": 4.416280792260401e-05, + "loss": 0.2096, + "step": 4039 + }, + { + "epoch": 6.422893481717011, + "grad_norm": 2.4078971634233493, + "learning_rate": 4.4168096939086046e-05, + "loss": 0.2157, + "step": 4040 + }, + { + "epoch": 6.424483306836248, + "grad_norm": 4.9542284457853745, + "learning_rate": 4.417338454481818e-05, + "loss": 0.2444, + "step": 4041 + }, + { + "epoch": 6.426073131955485, + "grad_norm": 4.1261543714723, + "learning_rate": 4.417867073801077e-05, + "loss": 0.2163, + "step": 4042 + }, + { + "epoch": 6.4276629570747215, + "grad_norm": 16.22378974982509, + "learning_rate": 4.418395551687462e-05, + "loss": 1.5064, + "step": 4043 + }, + { + "epoch": 6.4292527821939585, + "grad_norm": 3.1201888770204733, + "learning_rate": 4.418923887962103e-05, + "loss": 0.2057, + "step": 4044 + }, + { + "epoch": 6.4308426073131955, + "grad_norm": 5.743872144233768, + "learning_rate": 4.4194520824461776e-05, + "loss": 0.2076, + "step": 4045 + }, + { + "epoch": 6.4324324324324325, + "grad_norm": 2.292652076744471, + "learning_rate": 4.41998013496091e-05, + "loss": 0.1991, + "step": 4046 + }, + { + "epoch": 6.4340222575516695, + "grad_norm": 62.23614271105842, + "learning_rate": 4.4205080453275736e-05, + "loss": 12.2324, + "step": 4047 + }, + { + "epoch": 6.4356120826709065, + "grad_norm": 14.816067755420178, + "learning_rate": 4.421035813367491e-05, + "loss": 1.5532, + "step": 4048 + }, + { + "epoch": 6.4372019077901435, + "grad_norm": 3.2830181903811444, + "learning_rate": 4.421563438902031e-05, + "loss": 0.2201, + "step": 4049 + }, + { + "epoch": 6.43879173290938, + "grad_norm": 3.148439388115308, + "learning_rate": 4.422090921752612e-05, + "loss": 0.2115, + "step": 4050 + }, + { + "epoch": 6.440381558028617, + "grad_norm": 1.7855128026666314, + "learning_rate": 4.4226182617406995e-05, + "loss": 0.1558, + "step": 4051 + }, + { + "epoch": 6.441971383147854, + "grad_norm": 3.9914612145466246, + "learning_rate": 4.4231454586878086e-05, + "loss": 0.2462, + "step": 4052 + }, + { + "epoch": 6.443561208267091, + "grad_norm": 5.328733875502032, + "learning_rate": 4.423672512415502e-05, + "loss": 0.1665, + "step": 4053 + }, + { + "epoch": 6.4451510333863276, + "grad_norm": 2.748623432625521, + "learning_rate": 4.4241994227453904e-05, + "loss": 0.2055, + "step": 4054 + }, + { + "epoch": 6.4467408585055646, + "grad_norm": 8.622985381943458, + "learning_rate": 4.424726189499135e-05, + "loss": 1.1694, + "step": 4055 + }, + { + "epoch": 6.4483306836248016, + "grad_norm": 4.704606660522426, + "learning_rate": 4.425252812498443e-05, + "loss": 0.2733, + "step": 4056 + }, + { + "epoch": 6.4499205087440385, + "grad_norm": 2.805896888808438, + "learning_rate": 4.425779291565073e-05, + "loss": 0.2365, + "step": 4057 + }, + { + "epoch": 6.451510333863275, + "grad_norm": 3.6055006153034563, + "learning_rate": 4.426305626520829e-05, + "loss": 0.1396, + "step": 4058 + }, + { + "epoch": 6.453100158982512, + "grad_norm": 4.269089139710243, + "learning_rate": 4.4268318171875684e-05, + "loss": 0.2135, + "step": 4059 + }, + { + "epoch": 6.454689984101749, + "grad_norm": 5.546140080024612, + "learning_rate": 4.4273578633871925e-05, + "loss": 0.1761, + "step": 4060 + }, + { + "epoch": 6.456279809220986, + "grad_norm": 3.8133186533716206, + "learning_rate": 4.4278837649416544e-05, + "loss": 0.203, + "step": 4061 + }, + { + "epoch": 6.457869634340223, + "grad_norm": 3.8188878877071226, + "learning_rate": 4.428409521672955e-05, + "loss": 0.1932, + "step": 4062 + }, + { + "epoch": 6.45945945945946, + "grad_norm": 3.9564649464204384, + "learning_rate": 4.4289351334031464e-05, + "loss": 0.1436, + "step": 4063 + }, + { + "epoch": 6.461049284578697, + "grad_norm": 2.428846125978776, + "learning_rate": 4.429460599954325e-05, + "loss": 0.1468, + "step": 4064 + }, + { + "epoch": 6.462639109697934, + "grad_norm": 2.8251006798995064, + "learning_rate": 4.429985921148643e-05, + "loss": 0.1621, + "step": 4065 + }, + { + "epoch": 6.46422893481717, + "grad_norm": 5.032954910167469, + "learning_rate": 4.430511096808295e-05, + "loss": 0.2437, + "step": 4066 + }, + { + "epoch": 6.465818759936407, + "grad_norm": 2.985456058827167, + "learning_rate": 4.43103612675553e-05, + "loss": 0.1605, + "step": 4067 + }, + { + "epoch": 6.467408585055644, + "grad_norm": 2.921707745916031, + "learning_rate": 4.4315610108126446e-05, + "loss": 0.1594, + "step": 4068 + }, + { + "epoch": 6.468998410174881, + "grad_norm": 3.363755283958462, + "learning_rate": 4.432085748801983e-05, + "loss": 0.1371, + "step": 4069 + }, + { + "epoch": 6.470588235294118, + "grad_norm": 3.2244524822955696, + "learning_rate": 4.43261034054594e-05, + "loss": 0.1337, + "step": 4070 + }, + { + "epoch": 6.472178060413355, + "grad_norm": 3.7252584576350145, + "learning_rate": 4.433134785866963e-05, + "loss": 0.1768, + "step": 4071 + }, + { + "epoch": 6.473767885532592, + "grad_norm": 3.82189591423742, + "learning_rate": 4.4336590845875444e-05, + "loss": 0.2335, + "step": 4072 + }, + { + "epoch": 6.475357710651828, + "grad_norm": 2.4914664165617113, + "learning_rate": 4.434183236530228e-05, + "loss": 0.247, + "step": 4073 + }, + { + "epoch": 6.476947535771065, + "grad_norm": 1.9220398760638604, + "learning_rate": 4.4347072415176083e-05, + "loss": 0.1562, + "step": 4074 + }, + { + "epoch": 6.478537360890302, + "grad_norm": 2.6129678566228964, + "learning_rate": 4.435231099372328e-05, + "loss": 0.1163, + "step": 4075 + }, + { + "epoch": 6.480127186009539, + "grad_norm": 3.158570473310375, + "learning_rate": 4.4357548099170795e-05, + "loss": 0.2054, + "step": 4076 + }, + { + "epoch": 6.481717011128776, + "grad_norm": 2.75595312418156, + "learning_rate": 4.436278372974607e-05, + "loss": 0.2441, + "step": 4077 + }, + { + "epoch": 6.483306836248013, + "grad_norm": 2.619607084824096, + "learning_rate": 4.436801788367702e-05, + "loss": 0.1714, + "step": 4078 + }, + { + "epoch": 6.48489666136725, + "grad_norm": 80.29574445038882, + "learning_rate": 4.437325055919209e-05, + "loss": 7.6357, + "step": 4079 + }, + { + "epoch": 6.486486486486487, + "grad_norm": 2.6933402905203927, + "learning_rate": 4.43784817545202e-05, + "loss": 0.1652, + "step": 4080 + }, + { + "epoch": 6.488076311605723, + "grad_norm": 4.3964445509262395, + "learning_rate": 4.438371146789078e-05, + "loss": 0.4119, + "step": 4081 + }, + { + "epoch": 6.48966613672496, + "grad_norm": 2.810487179656018, + "learning_rate": 4.438893969753376e-05, + "loss": 0.1489, + "step": 4082 + }, + { + "epoch": 6.491255961844197, + "grad_norm": 3.7697934540951024, + "learning_rate": 4.4394166441679573e-05, + "loss": 0.1572, + "step": 4083 + }, + { + "epoch": 6.492845786963434, + "grad_norm": 3.362985870045082, + "learning_rate": 4.439939169855915e-05, + "loss": 0.1453, + "step": 4084 + }, + { + "epoch": 6.494435612082671, + "grad_norm": 1.9330572843655938, + "learning_rate": 4.440461546640395e-05, + "loss": 0.1307, + "step": 4085 + }, + { + "epoch": 6.496025437201908, + "grad_norm": 2.2284663087291996, + "learning_rate": 4.44098377434459e-05, + "loss": 0.1303, + "step": 4086 + }, + { + "epoch": 6.497615262321145, + "grad_norm": 3.2198652724415675, + "learning_rate": 4.441505852791745e-05, + "loss": 0.1768, + "step": 4087 + }, + { + "epoch": 6.499205087440382, + "grad_norm": 4.2465171958495675, + "learning_rate": 4.442027781805156e-05, + "loss": 0.1894, + "step": 4088 + }, + { + "epoch": 6.500794912559618, + "grad_norm": 1.656733093189461, + "learning_rate": 4.442549561208169e-05, + "loss": 0.175, + "step": 4089 + }, + { + "epoch": 6.502384737678855, + "grad_norm": 4.7541366258711015, + "learning_rate": 4.44307119082418e-05, + "loss": 0.1559, + "step": 4090 + }, + { + "epoch": 6.503974562798092, + "grad_norm": 3.550223802163937, + "learning_rate": 4.443592670476636e-05, + "loss": 0.1655, + "step": 4091 + }, + { + "epoch": 6.505564387917329, + "grad_norm": 2.0190405420450666, + "learning_rate": 4.444113999989036e-05, + "loss": 0.1326, + "step": 4092 + }, + { + "epoch": 6.507154213036566, + "grad_norm": 16.778922298941115, + "learning_rate": 4.4446351791849274e-05, + "loss": 40.6076, + "step": 4093 + }, + { + "epoch": 6.508744038155803, + "grad_norm": 8.912058388609987, + "learning_rate": 4.445156207887911e-05, + "loss": 0.2417, + "step": 4094 + }, + { + "epoch": 6.51033386327504, + "grad_norm": 10.466234380750231, + "learning_rate": 4.445677085921639e-05, + "loss": 0.2404, + "step": 4095 + }, + { + "epoch": 6.511923688394276, + "grad_norm": 3.2471050920624673, + "learning_rate": 4.446197813109809e-05, + "loss": 0.1984, + "step": 4096 + }, + { + "epoch": 6.513513513513513, + "grad_norm": 6.534490689874421, + "learning_rate": 4.446718389276176e-05, + "loss": 0.2481, + "step": 4097 + }, + { + "epoch": 6.51510333863275, + "grad_norm": 7.415748213280736, + "learning_rate": 4.4472388142445455e-05, + "loss": 0.186, + "step": 4098 + }, + { + "epoch": 6.516693163751987, + "grad_norm": 6.390034897923217, + "learning_rate": 4.4477590878387696e-05, + "loss": 0.2453, + "step": 4099 + }, + { + "epoch": 6.518282988871224, + "grad_norm": 1.8076776368588225, + "learning_rate": 4.448279209882756e-05, + "loss": 0.1412, + "step": 4100 + }, + { + "epoch": 6.519872813990461, + "grad_norm": 6.73793588970458, + "learning_rate": 4.448799180200462e-05, + "loss": 0.2103, + "step": 4101 + }, + { + "epoch": 6.521462639109698, + "grad_norm": 7.246268308641273, + "learning_rate": 4.449318998615897e-05, + "loss": 0.1775, + "step": 4102 + }, + { + "epoch": 6.523052464228935, + "grad_norm": 4.199056388954719, + "learning_rate": 4.44983866495312e-05, + "loss": 0.1396, + "step": 4103 + }, + { + "epoch": 6.524642289348172, + "grad_norm": 2.666221083753239, + "learning_rate": 4.450358179036244e-05, + "loss": 0.2034, + "step": 4104 + }, + { + "epoch": 6.526232114467408, + "grad_norm": 6.818891438556802, + "learning_rate": 4.450877540689431e-05, + "loss": 0.1695, + "step": 4105 + }, + { + "epoch": 6.527821939586645, + "grad_norm": 4.801019119669153, + "learning_rate": 4.451396749736897e-05, + "loss": 0.2766, + "step": 4106 + }, + { + "epoch": 6.529411764705882, + "grad_norm": 6.050132280219142, + "learning_rate": 4.451915806002909e-05, + "loss": 0.2046, + "step": 4107 + }, + { + "epoch": 6.531001589825119, + "grad_norm": 2.3548909308987698, + "learning_rate": 4.452434709311783e-05, + "loss": 0.1924, + "step": 4108 + }, + { + "epoch": 6.532591414944356, + "grad_norm": 4.562494610782942, + "learning_rate": 4.452953459487891e-05, + "loss": 0.2507, + "step": 4109 + }, + { + "epoch": 6.534181240063593, + "grad_norm": 4.993069996354965, + "learning_rate": 4.4534720563556546e-05, + "loss": 0.1648, + "step": 4110 + }, + { + "epoch": 6.53577106518283, + "grad_norm": 5.5897542234483595, + "learning_rate": 4.4539904997395466e-05, + "loss": 0.1828, + "step": 4111 + }, + { + "epoch": 6.537360890302066, + "grad_norm": 3.7133003140479626, + "learning_rate": 4.454508789464094e-05, + "loss": 0.1886, + "step": 4112 + }, + { + "epoch": 6.538950715421303, + "grad_norm": 5.903340925157806, + "learning_rate": 4.455026925353874e-05, + "loss": 0.4549, + "step": 4113 + }, + { + "epoch": 6.54054054054054, + "grad_norm": 4.124933569149193, + "learning_rate": 4.4555449072335154e-05, + "loss": 0.1934, + "step": 4114 + }, + { + "epoch": 6.542130365659777, + "grad_norm": 7.042423286041375, + "learning_rate": 4.456062734927702e-05, + "loss": 0.2105, + "step": 4115 + }, + { + "epoch": 6.543720190779014, + "grad_norm": 1.8138529423485292, + "learning_rate": 4.4565804082611656e-05, + "loss": 0.1597, + "step": 4116 + }, + { + "epoch": 6.545310015898251, + "grad_norm": 1.932735412333997, + "learning_rate": 4.4570979270586945e-05, + "loss": 0.1222, + "step": 4117 + }, + { + "epoch": 6.546899841017488, + "grad_norm": 5.439764390808079, + "learning_rate": 4.4576152911451264e-05, + "loss": 0.1371, + "step": 4118 + }, + { + "epoch": 6.548489666136725, + "grad_norm": 3.053685340926211, + "learning_rate": 4.458132500345352e-05, + "loss": 0.1883, + "step": 4119 + }, + { + "epoch": 6.550079491255962, + "grad_norm": 4.0139912979387695, + "learning_rate": 4.4586495544843146e-05, + "loss": 0.1535, + "step": 4120 + }, + { + "epoch": 6.5516693163751984, + "grad_norm": 3.7522915852810192, + "learning_rate": 4.4591664533870125e-05, + "loss": 0.2158, + "step": 4121 + }, + { + "epoch": 6.5532591414944354, + "grad_norm": 5.286735107164596, + "learning_rate": 4.4596831968784905e-05, + "loss": 0.1543, + "step": 4122 + }, + { + "epoch": 6.5548489666136724, + "grad_norm": 24.81570323891182, + "learning_rate": 4.460199784783852e-05, + "loss": 0.6685, + "step": 4123 + }, + { + "epoch": 6.556438791732909, + "grad_norm": 1.636222219007359, + "learning_rate": 4.46071621692825e-05, + "loss": 0.2099, + "step": 4124 + }, + { + "epoch": 6.558028616852146, + "grad_norm": 2.6957889145113394, + "learning_rate": 4.4612324931368906e-05, + "loss": 0.1945, + "step": 4125 + }, + { + "epoch": 6.559618441971383, + "grad_norm": 2.98671915047828, + "learning_rate": 4.461748613235034e-05, + "loss": 0.2493, + "step": 4126 + }, + { + "epoch": 6.56120826709062, + "grad_norm": 2.230162438451041, + "learning_rate": 4.462264577047992e-05, + "loss": 0.2047, + "step": 4127 + }, + { + "epoch": 6.5627980922098565, + "grad_norm": 2.312870576446253, + "learning_rate": 4.4627803844011284e-05, + "loss": 0.192, + "step": 4128 + }, + { + "epoch": 6.5643879173290935, + "grad_norm": 6.47178567223315, + "learning_rate": 4.463296035119862e-05, + "loss": 0.7484, + "step": 4129 + }, + { + "epoch": 6.5659777424483305, + "grad_norm": 4.390119438444852, + "learning_rate": 4.463811529029664e-05, + "loss": 0.1468, + "step": 4130 + }, + { + "epoch": 6.5675675675675675, + "grad_norm": 91.16691167333389, + "learning_rate": 4.4643268659560574e-05, + "loss": 1.547, + "step": 4131 + }, + { + "epoch": 6.5691573926868045, + "grad_norm": 3.7916995779786697, + "learning_rate": 4.464842045724619e-05, + "loss": 0.1329, + "step": 4132 + }, + { + "epoch": 6.5707472178060415, + "grad_norm": 3.4851842886411224, + "learning_rate": 4.465357068160982e-05, + "loss": 0.1345, + "step": 4133 + }, + { + "epoch": 6.5723370429252785, + "grad_norm": 2.0932544097302785, + "learning_rate": 4.4658719330908266e-05, + "loss": 0.1186, + "step": 4134 + }, + { + "epoch": 6.573926868044515, + "grad_norm": 1.4160980390820952, + "learning_rate": 4.466386640339892e-05, + "loss": 0.2123, + "step": 4135 + }, + { + "epoch": 6.575516693163752, + "grad_norm": 2.80095406252407, + "learning_rate": 4.466901189733966e-05, + "loss": 0.137, + "step": 4136 + }, + { + "epoch": 6.577106518282989, + "grad_norm": 2.8024390289593004, + "learning_rate": 4.467415581098895e-05, + "loss": 0.1364, + "step": 4137 + }, + { + "epoch": 6.578696343402226, + "grad_norm": 2.5120131719083494, + "learning_rate": 4.4679298142605734e-05, + "loss": 0.1438, + "step": 4138 + }, + { + "epoch": 6.580286168521463, + "grad_norm": 3.509051078365209, + "learning_rate": 4.4684438890449545e-05, + "loss": 0.1624, + "step": 4139 + }, + { + "epoch": 6.5818759936407, + "grad_norm": 2.5540530262876016, + "learning_rate": 4.4689578052780405e-05, + "loss": 0.1431, + "step": 4140 + }, + { + "epoch": 6.583465818759937, + "grad_norm": 2.7274762298021016, + "learning_rate": 4.469471562785891e-05, + "loss": 0.1087, + "step": 4141 + }, + { + "epoch": 6.585055643879174, + "grad_norm": 1.7849404948383736, + "learning_rate": 4.469985161394617e-05, + "loss": 0.183, + "step": 4142 + }, + { + "epoch": 6.586645468998411, + "grad_norm": 1.950400597448263, + "learning_rate": 4.470498600930383e-05, + "loss": 0.1577, + "step": 4143 + }, + { + "epoch": 6.588235294117647, + "grad_norm": 2.170789275413261, + "learning_rate": 4.47101188121941e-05, + "loss": 0.1968, + "step": 4144 + }, + { + "epoch": 6.589825119236884, + "grad_norm": 1.922325604979357, + "learning_rate": 4.4715250020879706e-05, + "loss": 0.1515, + "step": 4145 + }, + { + "epoch": 6.591414944356121, + "grad_norm": 3.041581714208334, + "learning_rate": 4.472037963362391e-05, + "loss": 0.1483, + "step": 4146 + }, + { + "epoch": 6.593004769475358, + "grad_norm": 3.68883489382966, + "learning_rate": 4.472550764869054e-05, + "loss": 0.235, + "step": 4147 + }, + { + "epoch": 6.594594594594595, + "grad_norm": 1.8267905228025465, + "learning_rate": 4.473063406434394e-05, + "loss": 0.1559, + "step": 4148 + }, + { + "epoch": 6.596184419713832, + "grad_norm": 1.977957817884437, + "learning_rate": 4.473575887884901e-05, + "loss": 0.1853, + "step": 4149 + }, + { + "epoch": 6.597774244833069, + "grad_norm": 3.2718560253419247, + "learning_rate": 4.4740882090471163e-05, + "loss": 0.2213, + "step": 4150 + }, + { + "epoch": 6.599364069952305, + "grad_norm": 3.029076651509095, + "learning_rate": 4.474600369747641e-05, + "loss": 0.1025, + "step": 4151 + }, + { + "epoch": 6.600953895071542, + "grad_norm": 1.3847918791207174, + "learning_rate": 4.4751123698131245e-05, + "loss": 0.1289, + "step": 4152 + }, + { + "epoch": 6.602543720190779, + "grad_norm": 2.70714103735813, + "learning_rate": 4.475624209070276e-05, + "loss": 0.1472, + "step": 4153 + }, + { + "epoch": 6.604133545310016, + "grad_norm": 3.2641256215652135, + "learning_rate": 4.476135887345854e-05, + "loss": 0.1951, + "step": 4154 + }, + { + "epoch": 6.605723370429253, + "grad_norm": 2.803762746952952, + "learning_rate": 4.4766474044666746e-05, + "loss": 0.1639, + "step": 4155 + }, + { + "epoch": 6.60731319554849, + "grad_norm": 1.4804773844359571, + "learning_rate": 4.4771587602596086e-05, + "loss": 0.163, + "step": 4156 + }, + { + "epoch": 6.608903020667727, + "grad_norm": 2.161964297812788, + "learning_rate": 4.47766995455158e-05, + "loss": 0.1796, + "step": 4157 + }, + { + "epoch": 6.610492845786963, + "grad_norm": 2.097577918500342, + "learning_rate": 4.478180987169568e-05, + "loss": 0.135, + "step": 4158 + }, + { + "epoch": 6.6120826709062, + "grad_norm": 1.8879819006038643, + "learning_rate": 4.478691857940607e-05, + "loss": 0.1929, + "step": 4159 + }, + { + "epoch": 6.613672496025437, + "grad_norm": 1.735963788081695, + "learning_rate": 4.479202566691785e-05, + "loss": 0.1537, + "step": 4160 + }, + { + "epoch": 6.615262321144674, + "grad_norm": 1.5855322385465556, + "learning_rate": 4.479713113250246e-05, + "loss": 0.1633, + "step": 4161 + }, + { + "epoch": 6.616852146263911, + "grad_norm": 1.402199399484262, + "learning_rate": 4.4802234974431896e-05, + "loss": 0.1688, + "step": 4162 + }, + { + "epoch": 6.618441971383148, + "grad_norm": 2.702890457660564, + "learning_rate": 4.480733719097867e-05, + "loss": 0.1729, + "step": 4163 + }, + { + "epoch": 6.620031796502385, + "grad_norm": 1.2603386268854597, + "learning_rate": 4.481243778041588e-05, + "loss": 0.1766, + "step": 4164 + }, + { + "epoch": 6.621621621621622, + "grad_norm": 2.856529865742124, + "learning_rate": 4.481753674101716e-05, + "loss": 0.1712, + "step": 4165 + }, + { + "epoch": 6.623211446740859, + "grad_norm": 3.67855357477022, + "learning_rate": 4.4822634071056686e-05, + "loss": 0.1659, + "step": 4166 + }, + { + "epoch": 6.624801271860095, + "grad_norm": 2.153455469474261, + "learning_rate": 4.4827729768809214e-05, + "loss": 0.1477, + "step": 4167 + }, + { + "epoch": 6.626391096979332, + "grad_norm": 5.153891845695849, + "learning_rate": 4.4832823832550024e-05, + "loss": 0.1441, + "step": 4168 + }, + { + "epoch": 6.627980922098569, + "grad_norm": 2.214953228943529, + "learning_rate": 4.483791626055497e-05, + "loss": 0.2045, + "step": 4169 + }, + { + "epoch": 6.629570747217806, + "grad_norm": 4.326888221329487, + "learning_rate": 4.484300705110043e-05, + "loss": 0.1558, + "step": 4170 + }, + { + "epoch": 6.631160572337043, + "grad_norm": 2.0854850060483643, + "learning_rate": 4.4848096202463376e-05, + "loss": 0.1146, + "step": 4171 + }, + { + "epoch": 6.63275039745628, + "grad_norm": 1.7508905779242037, + "learning_rate": 4.48531837129213e-05, + "loss": 0.1712, + "step": 4172 + }, + { + "epoch": 6.634340222575517, + "grad_norm": 3.3784266176558533, + "learning_rate": 4.485826958075227e-05, + "loss": 0.1355, + "step": 4173 + }, + { + "epoch": 6.635930047694753, + "grad_norm": 2.434623114463907, + "learning_rate": 4.4863353804234906e-05, + "loss": 0.1648, + "step": 4174 + }, + { + "epoch": 6.63751987281399, + "grad_norm": 2.0246151551374063, + "learning_rate": 4.486843638164838e-05, + "loss": 0.202, + "step": 4175 + }, + { + "epoch": 6.639109697933227, + "grad_norm": 2.7715544550132885, + "learning_rate": 4.487351731127243e-05, + "loss": 0.1842, + "step": 4176 + }, + { + "epoch": 6.640699523052464, + "grad_norm": 3.272943522867485, + "learning_rate": 4.4878596591387335e-05, + "loss": 0.1369, + "step": 4177 + }, + { + "epoch": 6.642289348171701, + "grad_norm": 5.164290188797228, + "learning_rate": 4.488367422027394e-05, + "loss": 0.1451, + "step": 4178 + }, + { + "epoch": 6.643879173290938, + "grad_norm": 1.3550320986289068, + "learning_rate": 4.4888750196213664e-05, + "loss": 0.143, + "step": 4179 + }, + { + "epoch": 6.645468998410175, + "grad_norm": 2.0748276021394583, + "learning_rate": 4.489382451748846e-05, + "loss": 0.1865, + "step": 4180 + }, + { + "epoch": 6.647058823529412, + "grad_norm": 3.8053787930494853, + "learning_rate": 4.4898897182380874e-05, + "loss": 0.1852, + "step": 4181 + }, + { + "epoch": 6.648648648648649, + "grad_norm": 1.405798674807752, + "learning_rate": 4.4903968189173975e-05, + "loss": 0.1522, + "step": 4182 + }, + { + "epoch": 6.650238473767885, + "grad_norm": 4.3563684087195265, + "learning_rate": 4.490903753615141e-05, + "loss": 0.1578, + "step": 4183 + }, + { + "epoch": 6.651828298887122, + "grad_norm": 1.3022724142111404, + "learning_rate": 4.4914105221597396e-05, + "loss": 0.1489, + "step": 4184 + }, + { + "epoch": 6.653418124006359, + "grad_norm": 17.370899541976673, + "learning_rate": 4.4919171243796705e-05, + "loss": 2.6541, + "step": 4185 + }, + { + "epoch": 6.655007949125596, + "grad_norm": 15.322852884599909, + "learning_rate": 4.492423560103467e-05, + "loss": 2.1362, + "step": 4186 + }, + { + "epoch": 6.656597774244833, + "grad_norm": 2.636254950106106, + "learning_rate": 4.492929829159719e-05, + "loss": 0.1696, + "step": 4187 + }, + { + "epoch": 6.65818759936407, + "grad_norm": 2.5892144107281476, + "learning_rate": 4.4934359313770734e-05, + "loss": 0.0853, + "step": 4188 + }, + { + "epoch": 6.659777424483307, + "grad_norm": 3.585243968836552, + "learning_rate": 4.493941866584231e-05, + "loss": 0.1621, + "step": 4189 + }, + { + "epoch": 6.661367249602543, + "grad_norm": 5.068043929543862, + "learning_rate": 4.494447634609953e-05, + "loss": 0.1613, + "step": 4190 + }, + { + "epoch": 6.66295707472178, + "grad_norm": 4.3375980187874985, + "learning_rate": 4.4949532352830546e-05, + "loss": 0.2161, + "step": 4191 + }, + { + "epoch": 6.664546899841017, + "grad_norm": 3.9491016709397675, + "learning_rate": 4.4954586684324084e-05, + "loss": 0.1731, + "step": 4192 + }, + { + "epoch": 6.666136724960254, + "grad_norm": 3.9313800110846655, + "learning_rate": 4.4959639338869424e-05, + "loss": 0.1721, + "step": 4193 + }, + { + "epoch": 6.667726550079491, + "grad_norm": 4.609925836392233, + "learning_rate": 4.496469031475644e-05, + "loss": 0.1738, + "step": 4194 + }, + { + "epoch": 6.669316375198728, + "grad_norm": 2.419974275945473, + "learning_rate": 4.4969739610275554e-05, + "loss": 0.1931, + "step": 4195 + }, + { + "epoch": 6.670906200317965, + "grad_norm": 2.934184736063685, + "learning_rate": 4.4974787223717766e-05, + "loss": 0.1555, + "step": 4196 + }, + { + "epoch": 6.672496025437201, + "grad_norm": 4.254513886997416, + "learning_rate": 4.4979833153374644e-05, + "loss": 0.1962, + "step": 4197 + }, + { + "epoch": 6.674085850556438, + "grad_norm": 2.598687127600904, + "learning_rate": 4.4984877397538306e-05, + "loss": 0.2055, + "step": 4198 + }, + { + "epoch": 6.675675675675675, + "grad_norm": 3.5935872176578365, + "learning_rate": 4.498991995450147e-05, + "loss": 0.1868, + "step": 4199 + }, + { + "epoch": 6.677265500794912, + "grad_norm": 2.500992619321777, + "learning_rate": 4.4994960822557425e-05, + "loss": 0.1168, + "step": 4200 + }, + { + "epoch": 6.678855325914149, + "grad_norm": 2.6862905780065924, + "learning_rate": 4.5e-05, + "loss": 0.1639, + "step": 4201 + }, + { + "epoch": 6.680445151033386, + "grad_norm": 4.832893228649769, + "learning_rate": 4.500503748512363e-05, + "loss": 0.1402, + "step": 4202 + }, + { + "epoch": 6.682034976152623, + "grad_norm": 4.157398719065339, + "learning_rate": 4.5010073276223296e-05, + "loss": 0.1548, + "step": 4203 + }, + { + "epoch": 6.68362480127186, + "grad_norm": 2.4766670360841734, + "learning_rate": 4.5015107371594575e-05, + "loss": 0.1744, + "step": 4204 + }, + { + "epoch": 6.685214626391097, + "grad_norm": 2.7552527786534777, + "learning_rate": 4.5020139769533606e-05, + "loss": 0.1739, + "step": 4205 + }, + { + "epoch": 6.6868044515103335, + "grad_norm": 4.45181114027743, + "learning_rate": 4.50251704683371e-05, + "loss": 0.2036, + "step": 4206 + }, + { + "epoch": 6.6883942766295705, + "grad_norm": 5.509601070433082, + "learning_rate": 4.5030199466302356e-05, + "loss": 0.2051, + "step": 4207 + }, + { + "epoch": 6.6899841017488075, + "grad_norm": 2.3072765541269975, + "learning_rate": 4.5035226761727226e-05, + "loss": 0.1374, + "step": 4208 + }, + { + "epoch": 6.6915739268680445, + "grad_norm": 2.8937502254205936, + "learning_rate": 4.504025235291017e-05, + "loss": 0.1831, + "step": 4209 + }, + { + "epoch": 6.6931637519872815, + "grad_norm": 5.099268008900632, + "learning_rate": 4.50452762381502e-05, + "loss": 0.221, + "step": 4210 + }, + { + "epoch": 6.6947535771065185, + "grad_norm": 2.737278740735534, + "learning_rate": 4.5050298415746904e-05, + "loss": 0.1235, + "step": 4211 + }, + { + "epoch": 6.6963434022257555, + "grad_norm": 2.934869150419045, + "learning_rate": 4.5055318884000465e-05, + "loss": 0.1407, + "step": 4212 + }, + { + "epoch": 6.697933227344992, + "grad_norm": 2.3096486699797416, + "learning_rate": 4.506033764121164e-05, + "loss": 0.1469, + "step": 4213 + }, + { + "epoch": 6.699523052464229, + "grad_norm": 5.501162671204545, + "learning_rate": 4.506535468568176e-05, + "loss": 0.2199, + "step": 4214 + }, + { + "epoch": 6.701112877583466, + "grad_norm": 4.385089568425522, + "learning_rate": 4.5070370015712725e-05, + "loss": 0.1697, + "step": 4215 + }, + { + "epoch": 6.702702702702703, + "grad_norm": 4.4453416900484255, + "learning_rate": 4.507538362960704e-05, + "loss": 0.1079, + "step": 4216 + }, + { + "epoch": 6.70429252782194, + "grad_norm": 6.759986990227837, + "learning_rate": 4.508039552566778e-05, + "loss": 0.1811, + "step": 4217 + }, + { + "epoch": 6.705882352941177, + "grad_norm": 3.5159415059676835, + "learning_rate": 4.5085405702198595e-05, + "loss": 0.2381, + "step": 4218 + }, + { + "epoch": 6.707472178060414, + "grad_norm": 5.5315579210461925, + "learning_rate": 4.509041415750372e-05, + "loss": 0.191, + "step": 4219 + }, + { + "epoch": 6.709062003179651, + "grad_norm": 4.728651012535913, + "learning_rate": 4.5095420889887966e-05, + "loss": 0.1841, + "step": 4220 + }, + { + "epoch": 6.710651828298887, + "grad_norm": 4.274269797975611, + "learning_rate": 4.510042589765676e-05, + "loss": 0.1214, + "step": 4221 + }, + { + "epoch": 6.712241653418124, + "grad_norm": 3.931606624782243, + "learning_rate": 4.510542917911606e-05, + "loss": 0.1772, + "step": 4222 + }, + { + "epoch": 6.713831478537361, + "grad_norm": 6.196075958716857, + "learning_rate": 4.511043073257246e-05, + "loss": 0.3037, + "step": 4223 + }, + { + "epoch": 6.715421303656598, + "grad_norm": 5.222759100952918, + "learning_rate": 4.51154305563331e-05, + "loss": 0.235, + "step": 4224 + }, + { + "epoch": 6.717011128775835, + "grad_norm": 3.32644777557435, + "learning_rate": 4.512042864870572e-05, + "loss": 0.1294, + "step": 4225 + }, + { + "epoch": 6.718600953895072, + "grad_norm": 2.9568315292353033, + "learning_rate": 4.5125425007998656e-05, + "loss": 0.1719, + "step": 4226 + }, + { + "epoch": 6.720190779014309, + "grad_norm": 8.207785904820122, + "learning_rate": 4.513041963252082e-05, + "loss": 0.177, + "step": 4227 + }, + { + "epoch": 6.721780604133546, + "grad_norm": 2.388638636840881, + "learning_rate": 4.51354125205817e-05, + "loss": 0.1841, + "step": 4228 + }, + { + "epoch": 6.723370429252782, + "grad_norm": 5.455312028458112, + "learning_rate": 4.514040367049141e-05, + "loss": 0.2682, + "step": 4229 + }, + { + "epoch": 6.724960254372019, + "grad_norm": 5.399574841592194, + "learning_rate": 4.5145393080560596e-05, + "loss": 0.1825, + "step": 4230 + }, + { + "epoch": 6.726550079491256, + "grad_norm": 3.2279298308830953, + "learning_rate": 4.515038074910055e-05, + "loss": 0.1722, + "step": 4231 + }, + { + "epoch": 6.728139904610493, + "grad_norm": 2.2987957091093056, + "learning_rate": 4.515536667442311e-05, + "loss": 0.15, + "step": 4232 + }, + { + "epoch": 6.72972972972973, + "grad_norm": 4.5221904119019225, + "learning_rate": 4.516035085484072e-05, + "loss": 0.1829, + "step": 4233 + }, + { + "epoch": 6.731319554848967, + "grad_norm": 3.9881168553638973, + "learning_rate": 4.516533328866642e-05, + "loss": 0.1808, + "step": 4234 + }, + { + "epoch": 6.732909379968204, + "grad_norm": 5.250577837520792, + "learning_rate": 4.5170313974213846e-05, + "loss": 0.2266, + "step": 4235 + }, + { + "epoch": 6.73449920508744, + "grad_norm": 3.761486198326603, + "learning_rate": 4.51752929097972e-05, + "loss": 0.1721, + "step": 4236 + }, + { + "epoch": 6.736089030206677, + "grad_norm": 2.7902909318562514, + "learning_rate": 4.51802700937313e-05, + "loss": 0.163, + "step": 4237 + }, + { + "epoch": 6.737678855325914, + "grad_norm": 2.296660053029683, + "learning_rate": 4.518524552433156e-05, + "loss": 0.2288, + "step": 4238 + }, + { + "epoch": 6.739268680445151, + "grad_norm": 3.3586708776925844, + "learning_rate": 4.5190219199913963e-05, + "loss": 0.1861, + "step": 4239 + }, + { + "epoch": 6.740858505564388, + "grad_norm": 2.341420325013109, + "learning_rate": 4.5195191118795096e-05, + "loss": 0.1609, + "step": 4240 + }, + { + "epoch": 6.742448330683625, + "grad_norm": 3.3639614830927393, + "learning_rate": 4.5200161279292155e-05, + "loss": 0.7839, + "step": 4241 + }, + { + "epoch": 6.744038155802862, + "grad_norm": 2.4842270030998814, + "learning_rate": 4.520512967972292e-05, + "loss": 0.2218, + "step": 4242 + }, + { + "epoch": 6.745627980922099, + "grad_norm": 4.571691286436985, + "learning_rate": 4.521009631840576e-05, + "loss": 0.1635, + "step": 4243 + }, + { + "epoch": 6.747217806041336, + "grad_norm": 6.129616391721861, + "learning_rate": 4.5215061193659664e-05, + "loss": 0.2904, + "step": 4244 + }, + { + "epoch": 6.748807631160572, + "grad_norm": 3.6105297440774478, + "learning_rate": 4.5220024303804185e-05, + "loss": 0.1336, + "step": 4245 + }, + { + "epoch": 6.750397456279809, + "grad_norm": 2.4811524700858616, + "learning_rate": 4.522498564715949e-05, + "loss": 0.2214, + "step": 4246 + }, + { + "epoch": 6.751987281399046, + "grad_norm": 2.843715580604871, + "learning_rate": 4.5229945222046355e-05, + "loss": 0.2092, + "step": 4247 + }, + { + "epoch": 6.753577106518283, + "grad_norm": 10.102758532659559, + "learning_rate": 4.5234903026786134e-05, + "loss": 0.2153, + "step": 4248 + }, + { + "epoch": 6.75516693163752, + "grad_norm": 3.2370048772050226, + "learning_rate": 4.523985905970079e-05, + "loss": 0.1999, + "step": 4249 + }, + { + "epoch": 6.756756756756757, + "grad_norm": 2.3467772695946514, + "learning_rate": 4.52448133191129e-05, + "loss": 0.1515, + "step": 4250 + }, + { + "epoch": 6.758346581875994, + "grad_norm": 2.8136175582492573, + "learning_rate": 4.52497658033456e-05, + "loss": 0.2036, + "step": 4251 + }, + { + "epoch": 6.75993640699523, + "grad_norm": 6.027317266342272, + "learning_rate": 4.525471651072268e-05, + "loss": 0.1909, + "step": 4252 + }, + { + "epoch": 6.761526232114467, + "grad_norm": 4.682268661443079, + "learning_rate": 4.525966543956849e-05, + "loss": 0.1509, + "step": 4253 + }, + { + "epoch": 6.763116057233704, + "grad_norm": 4.06777493323999, + "learning_rate": 4.5264612588207996e-05, + "loss": 0.2328, + "step": 4254 + }, + { + "epoch": 6.764705882352941, + "grad_norm": 2.527735934494563, + "learning_rate": 4.526955795496678e-05, + "loss": 0.173, + "step": 4255 + }, + { + "epoch": 6.766295707472178, + "grad_norm": 9.625407897919226, + "learning_rate": 4.5274501538171e-05, + "loss": 0.3158, + "step": 4256 + }, + { + "epoch": 6.767885532591415, + "grad_norm": 5.445454892596262, + "learning_rate": 4.5279443336147434e-05, + "loss": 0.2017, + "step": 4257 + }, + { + "epoch": 6.769475357710652, + "grad_norm": 4.944569783427609, + "learning_rate": 4.5284383347223474e-05, + "loss": 0.2005, + "step": 4258 + }, + { + "epoch": 6.771065182829888, + "grad_norm": 2.237953711332626, + "learning_rate": 4.52893215697271e-05, + "loss": 0.1583, + "step": 4259 + }, + { + "epoch": 6.772655007949125, + "grad_norm": 4.544457996370721, + "learning_rate": 4.529425800198689e-05, + "loss": 0.2304, + "step": 4260 + }, + { + "epoch": 6.774244833068362, + "grad_norm": 4.140267830458746, + "learning_rate": 4.5299192642332046e-05, + "loss": 0.1413, + "step": 4261 + }, + { + "epoch": 6.775834658187599, + "grad_norm": 4.083390595783748, + "learning_rate": 4.530412548909239e-05, + "loss": 0.1449, + "step": 4262 + }, + { + "epoch": 6.777424483306836, + "grad_norm": 3.714350425437931, + "learning_rate": 4.530905654059831e-05, + "loss": 0.1614, + "step": 4263 + }, + { + "epoch": 6.779014308426073, + "grad_norm": 30.794175521195413, + "learning_rate": 4.5313985795180835e-05, + "loss": 0.5791, + "step": 4264 + }, + { + "epoch": 6.78060413354531, + "grad_norm": 2.6406991720028823, + "learning_rate": 4.531891325117158e-05, + "loss": 0.1726, + "step": 4265 + }, + { + "epoch": 6.782193958664547, + "grad_norm": 2.9349091080933793, + "learning_rate": 4.5323838906902786e-05, + "loss": 0.2118, + "step": 4266 + }, + { + "epoch": 6.783783783783784, + "grad_norm": 4.094321752173411, + "learning_rate": 4.53287627607073e-05, + "loss": 0.1605, + "step": 4267 + }, + { + "epoch": 6.78537360890302, + "grad_norm": 3.032077374341222, + "learning_rate": 4.533368481091858e-05, + "loss": 0.19, + "step": 4268 + }, + { + "epoch": 6.786963434022257, + "grad_norm": 3.3632791445552117, + "learning_rate": 4.533860505587067e-05, + "loss": 0.1788, + "step": 4269 + }, + { + "epoch": 6.788553259141494, + "grad_norm": 4.389297209223793, + "learning_rate": 4.5343523493898264e-05, + "loss": 0.209, + "step": 4270 + }, + { + "epoch": 6.790143084260731, + "grad_norm": 2.3514727887599713, + "learning_rate": 4.534844012333665e-05, + "loss": 0.2007, + "step": 4271 + }, + { + "epoch": 6.791732909379968, + "grad_norm": 51.06739310629877, + "learning_rate": 4.53533549425217e-05, + "loss": 6.4069, + "step": 4272 + }, + { + "epoch": 6.793322734499205, + "grad_norm": 4.980417288008446, + "learning_rate": 4.535826794978997e-05, + "loss": 0.1832, + "step": 4273 + }, + { + "epoch": 6.794912559618442, + "grad_norm": 3.0695349264769654, + "learning_rate": 4.5363179143478554e-05, + "loss": 0.1907, + "step": 4274 + }, + { + "epoch": 6.796502384737678, + "grad_norm": 5.374833210228057, + "learning_rate": 4.536808852192519e-05, + "loss": 0.2006, + "step": 4275 + }, + { + "epoch": 6.798092209856915, + "grad_norm": 3.8594680338551015, + "learning_rate": 4.537299608346824e-05, + "loss": 0.1276, + "step": 4276 + }, + { + "epoch": 6.799682034976152, + "grad_norm": 3.629918102998424, + "learning_rate": 4.537790182644667e-05, + "loss": 0.1877, + "step": 4277 + }, + { + "epoch": 6.801271860095389, + "grad_norm": 3.2901623352864706, + "learning_rate": 4.538280574920007e-05, + "loss": 0.1619, + "step": 4278 + }, + { + "epoch": 6.802861685214626, + "grad_norm": 4.572962645668463, + "learning_rate": 4.538770785006863e-05, + "loss": 0.1019, + "step": 4279 + }, + { + "epoch": 6.804451510333863, + "grad_norm": 2.842924626115517, + "learning_rate": 4.539260812739317e-05, + "loss": 0.1859, + "step": 4280 + }, + { + "epoch": 6.8060413354531, + "grad_norm": 2.3743410985348965, + "learning_rate": 4.5397506579515124e-05, + "loss": 0.1867, + "step": 4281 + }, + { + "epoch": 6.807631160572337, + "grad_norm": 2.3935101520924036, + "learning_rate": 4.540240320477655e-05, + "loss": 0.1821, + "step": 4282 + }, + { + "epoch": 6.809220985691574, + "grad_norm": 6.388679439023524, + "learning_rate": 4.540729800152011e-05, + "loss": 0.2968, + "step": 4283 + }, + { + "epoch": 6.8108108108108105, + "grad_norm": 3.6507247505750353, + "learning_rate": 4.5412190968089085e-05, + "loss": 0.1913, + "step": 4284 + }, + { + "epoch": 6.8124006359300475, + "grad_norm": 2.384285220432607, + "learning_rate": 4.54170821028274e-05, + "loss": 0.1533, + "step": 4285 + }, + { + "epoch": 6.8139904610492845, + "grad_norm": 3.9267428597959815, + "learning_rate": 4.5421971404079575e-05, + "loss": 0.2098, + "step": 4286 + }, + { + "epoch": 6.8155802861685215, + "grad_norm": 2.1829105065184695, + "learning_rate": 4.542685887019075e-05, + "loss": 0.119, + "step": 4287 + }, + { + "epoch": 6.8171701112877585, + "grad_norm": 2.1415477130501004, + "learning_rate": 4.5431744499506714e-05, + "loss": 0.1225, + "step": 4288 + }, + { + "epoch": 6.8187599364069955, + "grad_norm": 4.522357963564039, + "learning_rate": 4.543662829037383e-05, + "loss": 0.1602, + "step": 4289 + }, + { + "epoch": 6.8203497615262325, + "grad_norm": 4.371273099884999, + "learning_rate": 4.544151024113914e-05, + "loss": 0.167, + "step": 4290 + }, + { + "epoch": 6.821939586645469, + "grad_norm": 2.4110927872583803, + "learning_rate": 4.544639035015027e-05, + "loss": 0.1381, + "step": 4291 + }, + { + "epoch": 6.823529411764706, + "grad_norm": 2.229744073403196, + "learning_rate": 4.545126861575548e-05, + "loss": 0.1652, + "step": 4292 + }, + { + "epoch": 6.825119236883943, + "grad_norm": 6.7911831435048, + "learning_rate": 4.545614503630365e-05, + "loss": 0.1837, + "step": 4293 + }, + { + "epoch": 6.82670906200318, + "grad_norm": 3.3756677187424606, + "learning_rate": 4.546101961014429e-05, + "loss": 0.1496, + "step": 4294 + }, + { + "epoch": 6.828298887122417, + "grad_norm": 4.4074980558484205, + "learning_rate": 4.546589233562754e-05, + "loss": 0.1842, + "step": 4295 + }, + { + "epoch": 6.829888712241654, + "grad_norm": 5.057782872498222, + "learning_rate": 4.547076321110415e-05, + "loss": 0.7432, + "step": 4296 + }, + { + "epoch": 6.831478537360891, + "grad_norm": 4.949139918646627, + "learning_rate": 4.54756322349255e-05, + "loss": 0.1609, + "step": 4297 + }, + { + "epoch": 6.833068362480127, + "grad_norm": 4.4383495922621155, + "learning_rate": 4.548049940544362e-05, + "loss": 0.146, + "step": 4298 + }, + { + "epoch": 6.834658187599364, + "grad_norm": 3.133071970652931, + "learning_rate": 4.548536472101114e-05, + "loss": 0.1566, + "step": 4299 + }, + { + "epoch": 6.836248012718601, + "grad_norm": 6.3056890865951605, + "learning_rate": 4.5490228179981325e-05, + "loss": 0.2553, + "step": 4300 + }, + { + "epoch": 6.837837837837838, + "grad_norm": 2.3891160107537237, + "learning_rate": 4.549508978070806e-05, + "loss": 0.1564, + "step": 4301 + }, + { + "epoch": 6.839427662957075, + "grad_norm": 2.613451198051224, + "learning_rate": 4.549994952154589e-05, + "loss": 0.1453, + "step": 4302 + }, + { + "epoch": 6.841017488076312, + "grad_norm": 2.6797437097388714, + "learning_rate": 4.550480740084996e-05, + "loss": 0.2843, + "step": 4303 + }, + { + "epoch": 6.842607313195549, + "grad_norm": 1.7930253974671073, + "learning_rate": 4.5509663416976045e-05, + "loss": 0.1174, + "step": 4304 + }, + { + "epoch": 6.844197138314786, + "grad_norm": 2.8362761467484563, + "learning_rate": 4.551451756828058e-05, + "loss": 0.199, + "step": 4305 + }, + { + "epoch": 6.845786963434023, + "grad_norm": 2.267866454877451, + "learning_rate": 4.5519369853120585e-05, + "loss": 0.2242, + "step": 4306 + }, + { + "epoch": 6.847376788553259, + "grad_norm": 3.1216264561110316, + "learning_rate": 4.5524220269853755e-05, + "loss": 0.1918, + "step": 4307 + }, + { + "epoch": 6.848966613672496, + "grad_norm": 2.6818501923676243, + "learning_rate": 4.552906881683839e-05, + "loss": 0.2023, + "step": 4308 + }, + { + "epoch": 6.850556438791733, + "grad_norm": 3.2857185659601233, + "learning_rate": 4.553391549243344e-05, + "loss": 0.2123, + "step": 4309 + }, + { + "epoch": 6.85214626391097, + "grad_norm": 3.349193875983658, + "learning_rate": 4.553876029499848e-05, + "loss": 0.1364, + "step": 4310 + }, + { + "epoch": 6.853736089030207, + "grad_norm": 2.7059401189543677, + "learning_rate": 4.5543603222893715e-05, + "loss": 0.1409, + "step": 4311 + }, + { + "epoch": 6.855325914149444, + "grad_norm": 3.9090392039715423, + "learning_rate": 4.554844427447999e-05, + "loss": 0.265, + "step": 4312 + }, + { + "epoch": 6.856915739268681, + "grad_norm": 4.245928169312547, + "learning_rate": 4.555328344811879e-05, + "loss": 0.3164, + "step": 4313 + }, + { + "epoch": 6.858505564387917, + "grad_norm": 2.986860661809424, + "learning_rate": 4.5558120742172235e-05, + "loss": 0.1299, + "step": 4314 + }, + { + "epoch": 6.860095389507154, + "grad_norm": 3.8900717088854915, + "learning_rate": 4.5562956155003055e-05, + "loss": 0.1875, + "step": 4315 + }, + { + "epoch": 6.861685214626391, + "grad_norm": 1.7722948444032445, + "learning_rate": 4.5567789684974644e-05, + "loss": 0.1188, + "step": 4316 + }, + { + "epoch": 6.863275039745628, + "grad_norm": 3.590403468719313, + "learning_rate": 4.557262133045105e-05, + "loss": 0.1276, + "step": 4317 + }, + { + "epoch": 6.864864864864865, + "grad_norm": 3.0243520817716476, + "learning_rate": 4.5577451089796905e-05, + "loss": 0.1848, + "step": 4318 + }, + { + "epoch": 6.866454689984102, + "grad_norm": 2.9257651026889007, + "learning_rate": 4.558227896137753e-05, + "loss": 0.14, + "step": 4319 + }, + { + "epoch": 6.868044515103339, + "grad_norm": 2.4600606204485387, + "learning_rate": 4.558710494355886e-05, + "loss": 0.1377, + "step": 4320 + }, + { + "epoch": 6.869634340222575, + "grad_norm": 2.0427304241410353, + "learning_rate": 4.5591929034707465e-05, + "loss": 0.1713, + "step": 4321 + }, + { + "epoch": 6.871224165341812, + "grad_norm": 4.692690811229895, + "learning_rate": 4.5596751233190586e-05, + "loss": 0.2372, + "step": 4322 + }, + { + "epoch": 6.872813990461049, + "grad_norm": 2.409305668481138, + "learning_rate": 4.560157153737607e-05, + "loss": 0.1504, + "step": 4323 + }, + { + "epoch": 6.874403815580286, + "grad_norm": 1.7477798440277965, + "learning_rate": 4.560638994563242e-05, + "loss": 0.1207, + "step": 4324 + }, + { + "epoch": 6.875993640699523, + "grad_norm": 2.1574900068327767, + "learning_rate": 4.561120645632878e-05, + "loss": 0.1841, + "step": 4325 + }, + { + "epoch": 6.87758346581876, + "grad_norm": 2.964160212310907, + "learning_rate": 4.561602106783493e-05, + "loss": 0.1716, + "step": 4326 + }, + { + "epoch": 6.879173290937997, + "grad_norm": 2.627997605819045, + "learning_rate": 4.56208337785213e-05, + "loss": 0.1461, + "step": 4327 + }, + { + "epoch": 6.880763116057234, + "grad_norm": 2.620367548782156, + "learning_rate": 4.562564458675898e-05, + "loss": 0.1811, + "step": 4328 + }, + { + "epoch": 6.882352941176471, + "grad_norm": 1.804058886365865, + "learning_rate": 4.563045349091967e-05, + "loss": 0.1349, + "step": 4329 + }, + { + "epoch": 6.883942766295707, + "grad_norm": 4.502258630834874, + "learning_rate": 4.5635260489375715e-05, + "loss": 0.155, + "step": 4330 + }, + { + "epoch": 6.885532591414944, + "grad_norm": 1.795624252467874, + "learning_rate": 4.564006558050015e-05, + "loss": 0.1532, + "step": 4331 + }, + { + "epoch": 6.887122416534181, + "grad_norm": 2.7049792818053437, + "learning_rate": 4.56448687626666e-05, + "loss": 0.1847, + "step": 4332 + }, + { + "epoch": 6.888712241653418, + "grad_norm": 3.0312981410024817, + "learning_rate": 4.564967003424938e-05, + "loss": 0.1483, + "step": 4333 + }, + { + "epoch": 6.890302066772655, + "grad_norm": 2.6143849248285536, + "learning_rate": 4.565446939362343e-05, + "loss": 0.1399, + "step": 4334 + }, + { + "epoch": 6.891891891891892, + "grad_norm": 2.756393685075757, + "learning_rate": 4.5659266839164335e-05, + "loss": 0.2123, + "step": 4335 + }, + { + "epoch": 6.893481717011129, + "grad_norm": 2.797721396692664, + "learning_rate": 4.5664062369248324e-05, + "loss": 0.2126, + "step": 4336 + }, + { + "epoch": 6.895071542130365, + "grad_norm": 1.8284739094170694, + "learning_rate": 4.5668855982252314e-05, + "loss": 0.2238, + "step": 4337 + }, + { + "epoch": 6.896661367249602, + "grad_norm": 1.6322523983765866, + "learning_rate": 4.567364767655381e-05, + "loss": 0.0908, + "step": 4338 + }, + { + "epoch": 6.898251192368839, + "grad_norm": 3.8948783100518494, + "learning_rate": 4.567843745053101e-05, + "loss": 0.2126, + "step": 4339 + }, + { + "epoch": 6.899841017488076, + "grad_norm": 2.308803879861451, + "learning_rate": 4.5683225302562756e-05, + "loss": 0.1385, + "step": 4340 + }, + { + "epoch": 6.901430842607313, + "grad_norm": 1.9766678525387713, + "learning_rate": 4.568801123102852e-05, + "loss": 0.2095, + "step": 4341 + }, + { + "epoch": 6.90302066772655, + "grad_norm": 3.126412347455305, + "learning_rate": 4.569279523430844e-05, + "loss": 0.1705, + "step": 4342 + }, + { + "epoch": 6.904610492845787, + "grad_norm": 1.8700961476505393, + "learning_rate": 4.569757731078332e-05, + "loss": 0.158, + "step": 4343 + }, + { + "epoch": 6.906200317965024, + "grad_norm": 2.0929624950407475, + "learning_rate": 4.570235745883458e-05, + "loss": 0.1595, + "step": 4344 + }, + { + "epoch": 6.907790143084261, + "grad_norm": 2.4512566135424154, + "learning_rate": 4.570713567684431e-05, + "loss": 0.1365, + "step": 4345 + }, + { + "epoch": 6.909379968203497, + "grad_norm": 2.7950146792060724, + "learning_rate": 4.571191196319529e-05, + "loss": 0.0956, + "step": 4346 + }, + { + "epoch": 6.910969793322734, + "grad_norm": 2.3457808396877655, + "learning_rate": 4.5716686316270885e-05, + "loss": 0.161, + "step": 4347 + }, + { + "epoch": 6.912559618441971, + "grad_norm": 2.0457646872665274, + "learning_rate": 4.5721458734455165e-05, + "loss": 0.1356, + "step": 4348 + }, + { + "epoch": 6.914149443561208, + "grad_norm": 2.935246167895616, + "learning_rate": 4.572622921613284e-05, + "loss": 0.1651, + "step": 4349 + }, + { + "epoch": 6.915739268680445, + "grad_norm": 3.3088219843452036, + "learning_rate": 4.573099775968926e-05, + "loss": 0.1215, + "step": 4350 + }, + { + "epoch": 6.917329093799682, + "grad_norm": 3.051272148340767, + "learning_rate": 4.573576436351046e-05, + "loss": 0.2047, + "step": 4351 + }, + { + "epoch": 6.918918918918919, + "grad_norm": 2.6660671100869835, + "learning_rate": 4.574052902598312e-05, + "loss": 0.1658, + "step": 4352 + }, + { + "epoch": 6.920508744038155, + "grad_norm": 18.312242504997883, + "learning_rate": 4.574529174549456e-05, + "loss": 4.2673, + "step": 4353 + }, + { + "epoch": 6.922098569157392, + "grad_norm": 2.006590439935249, + "learning_rate": 4.575005252043279e-05, + "loss": 0.1773, + "step": 4354 + }, + { + "epoch": 6.923688394276629, + "grad_norm": 5.997622103161415, + "learning_rate": 4.575481134918645e-05, + "loss": 0.1614, + "step": 4355 + }, + { + "epoch": 6.925278219395866, + "grad_norm": 2.060800157233172, + "learning_rate": 4.5759568230144836e-05, + "loss": 0.166, + "step": 4356 + }, + { + "epoch": 6.926868044515103, + "grad_norm": 2.6705672162478544, + "learning_rate": 4.5764323161697934e-05, + "loss": 0.2583, + "step": 4357 + }, + { + "epoch": 6.92845786963434, + "grad_norm": 5.309504896189564, + "learning_rate": 4.576907614223637e-05, + "loss": 0.2566, + "step": 4358 + }, + { + "epoch": 6.930047694753577, + "grad_norm": 2.4965523809698427, + "learning_rate": 4.577382717015143e-05, + "loss": 0.1188, + "step": 4359 + }, + { + "epoch": 6.9316375198728135, + "grad_norm": 4.635435485717884, + "learning_rate": 4.577857624383506e-05, + "loss": 0.173, + "step": 4360 + }, + { + "epoch": 6.9332273449920505, + "grad_norm": 3.1016496451273863, + "learning_rate": 4.5783323361679865e-05, + "loss": 0.2129, + "step": 4361 + }, + { + "epoch": 6.9348171701112875, + "grad_norm": 3.7841862035443903, + "learning_rate": 4.5788068522079134e-05, + "loss": 0.1225, + "step": 4362 + }, + { + "epoch": 6.9364069952305245, + "grad_norm": 2.596130237009024, + "learning_rate": 4.579281172342679e-05, + "loss": 0.1778, + "step": 4363 + }, + { + "epoch": 6.9379968203497615, + "grad_norm": 3.22668710238125, + "learning_rate": 4.5797552964117436e-05, + "loss": 0.1713, + "step": 4364 + }, + { + "epoch": 6.9395866454689985, + "grad_norm": 5.741533631003023, + "learning_rate": 4.580229224254633e-05, + "loss": 0.1422, + "step": 4365 + }, + { + "epoch": 6.9411764705882355, + "grad_norm": 2.6977819659551185, + "learning_rate": 4.58070295571094e-05, + "loss": 0.1603, + "step": 4366 + }, + { + "epoch": 6.9427662957074725, + "grad_norm": 4.114255220532861, + "learning_rate": 4.5811764906203236e-05, + "loss": 0.1409, + "step": 4367 + }, + { + "epoch": 6.9443561208267095, + "grad_norm": 3.6080354154907397, + "learning_rate": 4.581649828822509e-05, + "loss": 0.2079, + "step": 4368 + }, + { + "epoch": 6.945945945945946, + "grad_norm": 2.4449513024466434, + "learning_rate": 4.5821229701572894e-05, + "loss": 0.1348, + "step": 4369 + }, + { + "epoch": 6.947535771065183, + "grad_norm": 3.4226311384991783, + "learning_rate": 4.5825959144645234e-05, + "loss": 0.1838, + "step": 4370 + }, + { + "epoch": 6.94912559618442, + "grad_norm": 3.4660953849956, + "learning_rate": 4.583068661584135e-05, + "loss": 0.1693, + "step": 4371 + }, + { + "epoch": 6.950715421303657, + "grad_norm": 2.5245847713834575, + "learning_rate": 4.5835412113561176e-05, + "loss": 0.1629, + "step": 4372 + }, + { + "epoch": 6.952305246422894, + "grad_norm": 1.7529822562961754, + "learning_rate": 4.5840135636205306e-05, + "loss": 0.1331, + "step": 4373 + }, + { + "epoch": 6.953895071542131, + "grad_norm": 2.8279357368346245, + "learning_rate": 4.584485718217499e-05, + "loss": 0.1539, + "step": 4374 + }, + { + "epoch": 6.955484896661368, + "grad_norm": 2.9353023050915423, + "learning_rate": 4.584957674987215e-05, + "loss": 0.1765, + "step": 4375 + }, + { + "epoch": 6.957074721780604, + "grad_norm": 3.9166450703605307, + "learning_rate": 4.585429433769941e-05, + "loss": 0.1555, + "step": 4376 + }, + { + "epoch": 6.958664546899841, + "grad_norm": 3.617644257049665, + "learning_rate": 4.585900994406001e-05, + "loss": 0.1482, + "step": 4377 + }, + { + "epoch": 6.960254372019078, + "grad_norm": 1.334397987442756, + "learning_rate": 4.58637235673579e-05, + "loss": 0.1292, + "step": 4378 + }, + { + "epoch": 6.961844197138315, + "grad_norm": 4.386647232528946, + "learning_rate": 4.586843520599768e-05, + "loss": 0.3152, + "step": 4379 + }, + { + "epoch": 6.963434022257552, + "grad_norm": 2.4032855336139445, + "learning_rate": 4.587314485838464e-05, + "loss": 0.1583, + "step": 4380 + }, + { + "epoch": 6.965023847376789, + "grad_norm": 2.418217534989904, + "learning_rate": 4.587785252292473e-05, + "loss": 0.121, + "step": 4381 + }, + { + "epoch": 6.966613672496026, + "grad_norm": 2.844770569230615, + "learning_rate": 4.588255819802458e-05, + "loss": 0.226, + "step": 4382 + }, + { + "epoch": 6.968203497615263, + "grad_norm": 3.4541924175132688, + "learning_rate": 4.588726188209149e-05, + "loss": 0.144, + "step": 4383 + }, + { + "epoch": 6.9697933227345, + "grad_norm": 1.7675646516164376, + "learning_rate": 4.589196357353343e-05, + "loss": 0.816, + "step": 4384 + }, + { + "epoch": 6.971383147853736, + "grad_norm": 30.255251619940378, + "learning_rate": 4.589666327075904e-05, + "loss": 1.8304, + "step": 4385 + }, + { + "epoch": 6.972972972972973, + "grad_norm": 1.9616277655494896, + "learning_rate": 4.5901360972177645e-05, + "loss": 0.1718, + "step": 4386 + }, + { + "epoch": 6.97456279809221, + "grad_norm": 2.8000848053725957, + "learning_rate": 4.5906056676199253e-05, + "loss": 0.1079, + "step": 4387 + }, + { + "epoch": 6.976152623211447, + "grad_norm": 3.6697587363326387, + "learning_rate": 4.591075038123454e-05, + "loss": 0.1254, + "step": 4388 + }, + { + "epoch": 6.977742448330684, + "grad_norm": 2.459458957843938, + "learning_rate": 4.591544208569484e-05, + "loss": 0.144, + "step": 4389 + }, + { + "epoch": 6.979332273449921, + "grad_norm": 2.510459502298899, + "learning_rate": 4.59201317879922e-05, + "loss": 0.1521, + "step": 4390 + }, + { + "epoch": 6.980922098569158, + "grad_norm": 2.804673072802171, + "learning_rate": 4.5924819486539306e-05, + "loss": 0.1493, + "step": 4391 + }, + { + "epoch": 6.982511923688394, + "grad_norm": 4.787218101497528, + "learning_rate": 4.592950517974956e-05, + "loss": 0.2671, + "step": 4392 + }, + { + "epoch": 6.984101748807631, + "grad_norm": 2.1654109483200656, + "learning_rate": 4.593418886603702e-05, + "loss": 0.1451, + "step": 4393 + }, + { + "epoch": 6.985691573926868, + "grad_norm": 39.85436083876924, + "learning_rate": 4.593887054381641e-05, + "loss": 5.2202, + "step": 4394 + }, + { + "epoch": 6.987281399046105, + "grad_norm": 3.0615244642623236, + "learning_rate": 4.594355021150317e-05, + "loss": 0.1731, + "step": 4395 + }, + { + "epoch": 6.988871224165342, + "grad_norm": 2.473062325600342, + "learning_rate": 4.594822786751341e-05, + "loss": 0.1317, + "step": 4396 + }, + { + "epoch": 6.990461049284579, + "grad_norm": 2.024385544797275, + "learning_rate": 4.59529035102639e-05, + "loss": 0.1622, + "step": 4397 + }, + { + "epoch": 6.992050874403816, + "grad_norm": 1.3244823476591756, + "learning_rate": 4.59575771381721e-05, + "loss": 0.1428, + "step": 4398 + }, + { + "epoch": 6.993640699523052, + "grad_norm": 3.2145959096281578, + "learning_rate": 4.596224874965616e-05, + "loss": 0.1296, + "step": 4399 + }, + { + "epoch": 6.995230524642289, + "grad_norm": 3.106402634986056, + "learning_rate": 4.596691834313491e-05, + "loss": 0.1391, + "step": 4400 + }, + { + "epoch": 6.996820349761526, + "grad_norm": 2.642970761003322, + "learning_rate": 4.5971585917027865e-05, + "loss": 0.1522, + "step": 4401 + }, + { + "epoch": 6.998410174880763, + "grad_norm": 2.439585989951723, + "learning_rate": 4.597625146975521e-05, + "loss": 0.1956, + "step": 4402 + }, + { + "epoch": 7.0, + "grad_norm": 2.2644959959799493, + "learning_rate": 4.598091499973784e-05, + "loss": 0.1246, + "step": 4403 + }, + { + "epoch": 7.001589825119237, + "grad_norm": 2.8063127097032483, + "learning_rate": 4.598557650539731e-05, + "loss": 0.2117, + "step": 4404 + }, + { + "epoch": 7.003179650238474, + "grad_norm": 3.262544497764086, + "learning_rate": 4.599023598515586e-05, + "loss": 0.1711, + "step": 4405 + }, + { + "epoch": 7.004769475357711, + "grad_norm": 3.1992101213967628, + "learning_rate": 4.599489343743644e-05, + "loss": 0.139, + "step": 4406 + }, + { + "epoch": 7.006359300476947, + "grad_norm": 2.9548266851897567, + "learning_rate": 4.5999548860662666e-05, + "loss": 0.1376, + "step": 4407 + }, + { + "epoch": 7.007949125596184, + "grad_norm": 2.058194666306494, + "learning_rate": 4.600420225325885e-05, + "loss": 0.098, + "step": 4408 + }, + { + "epoch": 7.009538950715421, + "grad_norm": 2.8322112027718696, + "learning_rate": 4.600885361364997e-05, + "loss": 0.1298, + "step": 4409 + }, + { + "epoch": 7.011128775834658, + "grad_norm": 3.384376411773975, + "learning_rate": 4.6013502940261725e-05, + "loss": 0.1196, + "step": 4410 + }, + { + "epoch": 7.012718600953895, + "grad_norm": 2.2573931578408577, + "learning_rate": 4.601815023152049e-05, + "loss": 0.1349, + "step": 4411 + }, + { + "epoch": 7.014308426073132, + "grad_norm": 1.4201740950932924, + "learning_rate": 4.602279548585331e-05, + "loss": 0.1522, + "step": 4412 + }, + { + "epoch": 7.015898251192369, + "grad_norm": 4.663154812206325, + "learning_rate": 4.602743870168794e-05, + "loss": 0.1891, + "step": 4413 + }, + { + "epoch": 7.017488076311606, + "grad_norm": 1.8636234624581416, + "learning_rate": 4.6032079877452826e-05, + "loss": 0.1185, + "step": 4414 + }, + { + "epoch": 7.019077901430842, + "grad_norm": 4.571617784434723, + "learning_rate": 4.603671901157709e-05, + "loss": 0.305, + "step": 4415 + }, + { + "epoch": 7.020667726550079, + "grad_norm": 3.6193585557374814, + "learning_rate": 4.604135610249057e-05, + "loss": 0.1599, + "step": 4416 + }, + { + "epoch": 7.022257551669316, + "grad_norm": 5.631572725751323, + "learning_rate": 4.6045991148623757e-05, + "loss": 0.2197, + "step": 4417 + }, + { + "epoch": 7.023847376788553, + "grad_norm": 2.2466828085930466, + "learning_rate": 4.605062414840786e-05, + "loss": 0.1778, + "step": 4418 + }, + { + "epoch": 7.02543720190779, + "grad_norm": 5.6730956783342625, + "learning_rate": 4.6055255100274785e-05, + "loss": 0.2279, + "step": 4419 + }, + { + "epoch": 7.027027027027027, + "grad_norm": 2.732934122086105, + "learning_rate": 4.6059884002657114e-05, + "loss": 0.1328, + "step": 4420 + }, + { + "epoch": 7.028616852146264, + "grad_norm": 3.498541347769334, + "learning_rate": 4.606451085398814e-05, + "loss": 0.2121, + "step": 4421 + }, + { + "epoch": 7.030206677265501, + "grad_norm": 3.753298087283855, + "learning_rate": 4.606913565270183e-05, + "loss": 0.2023, + "step": 4422 + }, + { + "epoch": 7.031796502384737, + "grad_norm": 2.759505137271759, + "learning_rate": 4.607375839723287e-05, + "loss": 0.1741, + "step": 4423 + }, + { + "epoch": 7.033386327503974, + "grad_norm": 2.844621855127885, + "learning_rate": 4.607837908601662e-05, + "loss": 0.1697, + "step": 4424 + }, + { + "epoch": 7.034976152623211, + "grad_norm": 3.286500935932206, + "learning_rate": 4.608299771748915e-05, + "loss": 0.1745, + "step": 4425 + }, + { + "epoch": 7.036565977742448, + "grad_norm": 3.9546466249966064, + "learning_rate": 4.608761429008721e-05, + "loss": 0.1229, + "step": 4426 + }, + { + "epoch": 7.038155802861685, + "grad_norm": 2.8562797807706612, + "learning_rate": 4.609222880224827e-05, + "loss": 0.1414, + "step": 4427 + }, + { + "epoch": 7.039745627980922, + "grad_norm": 2.8704343978626445, + "learning_rate": 4.609684125241047e-05, + "loss": 0.2424, + "step": 4428 + }, + { + "epoch": 7.041335453100159, + "grad_norm": 3.4505018146096313, + "learning_rate": 4.610145163901268e-05, + "loss": 0.1262, + "step": 4429 + }, + { + "epoch": 7.042925278219396, + "grad_norm": 2.9065501601682526, + "learning_rate": 4.610605996049444e-05, + "loss": 0.202, + "step": 4430 + }, + { + "epoch": 7.044515103338632, + "grad_norm": 4.27505240120418, + "learning_rate": 4.6110666215296e-05, + "loss": 0.2304, + "step": 4431 + }, + { + "epoch": 7.046104928457869, + "grad_norm": 3.3345193428045206, + "learning_rate": 4.6115270401858316e-05, + "loss": 0.1183, + "step": 4432 + }, + { + "epoch": 7.047694753577106, + "grad_norm": 3.5790686366209923, + "learning_rate": 4.611987251862303e-05, + "loss": 0.1733, + "step": 4433 + }, + { + "epoch": 7.049284578696343, + "grad_norm": 2.7048849995532307, + "learning_rate": 4.6124472564032496e-05, + "loss": 0.1658, + "step": 4434 + }, + { + "epoch": 7.05087440381558, + "grad_norm": 1.5311640051696793, + "learning_rate": 4.612907053652977e-05, + "loss": 0.2297, + "step": 4435 + }, + { + "epoch": 7.052464228934817, + "grad_norm": 2.5752891669948843, + "learning_rate": 4.6133666434558594e-05, + "loss": 0.2322, + "step": 4436 + }, + { + "epoch": 7.054054054054054, + "grad_norm": 2.0000478096881205, + "learning_rate": 4.613826025656343e-05, + "loss": 0.1277, + "step": 4437 + }, + { + "epoch": 7.0556438791732905, + "grad_norm": 4.319689695264964, + "learning_rate": 4.614285200098943e-05, + "loss": 0.101, + "step": 4438 + }, + { + "epoch": 7.0572337042925275, + "grad_norm": 2.6696702845722027, + "learning_rate": 4.614744166628247e-05, + "loss": 0.1587, + "step": 4439 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 3.25174830051645, + "learning_rate": 4.61520292508891e-05, + "loss": 0.1781, + "step": 4440 + }, + { + "epoch": 7.0604133545310015, + "grad_norm": 2.4414248427287144, + "learning_rate": 4.6156614753256584e-05, + "loss": 0.141, + "step": 4441 + }, + { + "epoch": 7.0620031796502385, + "grad_norm": 3.4884001010139745, + "learning_rate": 4.616119817183291e-05, + "loss": 0.1188, + "step": 4442 + }, + { + "epoch": 7.0635930047694755, + "grad_norm": 4.099324516951509, + "learning_rate": 4.616577950506675e-05, + "loss": 0.2052, + "step": 4443 + }, + { + "epoch": 7.0651828298887125, + "grad_norm": 2.225131594211299, + "learning_rate": 4.617035875140749e-05, + "loss": 0.1428, + "step": 4444 + }, + { + "epoch": 7.0667726550079495, + "grad_norm": 3.0379355326249655, + "learning_rate": 4.6174935909305224e-05, + "loss": 0.1969, + "step": 4445 + }, + { + "epoch": 7.068362480127186, + "grad_norm": 4.443941278791309, + "learning_rate": 4.617951097721073e-05, + "loss": 0.2175, + "step": 4446 + }, + { + "epoch": 7.069952305246423, + "grad_norm": 3.699733690276889, + "learning_rate": 4.618408395357554e-05, + "loss": 0.1024, + "step": 4447 + }, + { + "epoch": 7.07154213036566, + "grad_norm": 2.2239882767532873, + "learning_rate": 4.618865483685186e-05, + "loss": 0.1843, + "step": 4448 + }, + { + "epoch": 7.073131955484897, + "grad_norm": 3.947541433472577, + "learning_rate": 4.61932236254926e-05, + "loss": 0.1952, + "step": 4449 + }, + { + "epoch": 7.074721780604134, + "grad_norm": 1.2906295939423535, + "learning_rate": 4.61977903179514e-05, + "loss": 0.1855, + "step": 4450 + }, + { + "epoch": 7.076311605723371, + "grad_norm": 3.364772687511871, + "learning_rate": 4.6202354912682606e-05, + "loss": 0.2123, + "step": 4451 + }, + { + "epoch": 7.077901430842608, + "grad_norm": 6.664777331963277, + "learning_rate": 4.6206917408141246e-05, + "loss": 0.2366, + "step": 4452 + }, + { + "epoch": 7.079491255961845, + "grad_norm": 2.919927563316208, + "learning_rate": 4.6211477802783106e-05, + "loss": 0.1788, + "step": 4453 + }, + { + "epoch": 7.081081081081081, + "grad_norm": 5.613842693189876, + "learning_rate": 4.621603609506465e-05, + "loss": 0.2068, + "step": 4454 + }, + { + "epoch": 7.082670906200318, + "grad_norm": 4.076631500072799, + "learning_rate": 4.622059228344304e-05, + "loss": 0.1968, + "step": 4455 + }, + { + "epoch": 7.084260731319555, + "grad_norm": 8.450672796717233, + "learning_rate": 4.62251463663762e-05, + "loss": 0.2146, + "step": 4456 + }, + { + "epoch": 7.085850556438792, + "grad_norm": 4.506870293920581, + "learning_rate": 4.6229698342322724e-05, + "loss": 0.2742, + "step": 4457 + }, + { + "epoch": 7.087440381558029, + "grad_norm": 2.498264932644508, + "learning_rate": 4.623424820974193e-05, + "loss": 0.151, + "step": 4458 + }, + { + "epoch": 7.089030206677266, + "grad_norm": 5.699029986574716, + "learning_rate": 4.623879596709386e-05, + "loss": 0.3973, + "step": 4459 + }, + { + "epoch": 7.090620031796503, + "grad_norm": 5.582698358253199, + "learning_rate": 4.6243341612839264e-05, + "loss": 0.1182, + "step": 4460 + }, + { + "epoch": 7.09220985691574, + "grad_norm": 5.686112855405954, + "learning_rate": 4.62478851454396e-05, + "loss": 0.1874, + "step": 4461 + }, + { + "epoch": 7.093799682034976, + "grad_norm": 1.7910649715417066, + "learning_rate": 4.6252426563357055e-05, + "loss": 0.1296, + "step": 4462 + }, + { + "epoch": 7.095389507154213, + "grad_norm": 2.2442840128536274, + "learning_rate": 4.6256965865054514e-05, + "loss": 0.1852, + "step": 4463 + }, + { + "epoch": 7.09697933227345, + "grad_norm": 4.7837453557363965, + "learning_rate": 4.626150304899559e-05, + "loss": 0.2075, + "step": 4464 + }, + { + "epoch": 7.098569157392687, + "grad_norm": 5.090094857483868, + "learning_rate": 4.62660381136446e-05, + "loss": 0.1595, + "step": 4465 + }, + { + "epoch": 7.100158982511924, + "grad_norm": 3.0695592364630557, + "learning_rate": 4.627057105746662e-05, + "loss": 0.1393, + "step": 4466 + }, + { + "epoch": 7.101748807631161, + "grad_norm": 3.472145957957069, + "learning_rate": 4.627510187892738e-05, + "loss": 0.1652, + "step": 4467 + }, + { + "epoch": 7.103338632750398, + "grad_norm": 3.1978564098920783, + "learning_rate": 4.627963057649338e-05, + "loss": 0.2059, + "step": 4468 + }, + { + "epoch": 7.104928457869635, + "grad_norm": 4.277511868676584, + "learning_rate": 4.6284157148631814e-05, + "loss": 0.2404, + "step": 4469 + }, + { + "epoch": 7.106518282988871, + "grad_norm": 4.054286437909406, + "learning_rate": 4.6288681593810595e-05, + "loss": 0.2484, + "step": 4470 + }, + { + "epoch": 7.108108108108108, + "grad_norm": 3.366165667185676, + "learning_rate": 4.6293203910498376e-05, + "loss": 0.1928, + "step": 4471 + }, + { + "epoch": 7.109697933227345, + "grad_norm": 2.115767120476511, + "learning_rate": 4.6297724097164506e-05, + "loss": 0.1959, + "step": 4472 + }, + { + "epoch": 7.111287758346582, + "grad_norm": 2.838743539691398, + "learning_rate": 4.630224215227907e-05, + "loss": 0.1633, + "step": 4473 + }, + { + "epoch": 7.112877583465819, + "grad_norm": 4.781675279326575, + "learning_rate": 4.630675807431286e-05, + "loss": 0.1667, + "step": 4474 + }, + { + "epoch": 7.114467408585056, + "grad_norm": 2.631361946208207, + "learning_rate": 4.631127186173742e-05, + "loss": 0.1937, + "step": 4475 + }, + { + "epoch": 7.116057233704293, + "grad_norm": 2.9041738967228303, + "learning_rate": 4.6315783513024974e-05, + "loss": 0.1734, + "step": 4476 + }, + { + "epoch": 7.117647058823529, + "grad_norm": 2.614016178727657, + "learning_rate": 4.6320293026648516e-05, + "loss": 0.2086, + "step": 4477 + }, + { + "epoch": 7.119236883942766, + "grad_norm": 2.6491194761109194, + "learning_rate": 4.632480040108171e-05, + "loss": 0.146, + "step": 4478 + }, + { + "epoch": 7.120826709062003, + "grad_norm": 2.972856423711915, + "learning_rate": 4.6329305634799e-05, + "loss": 0.213, + "step": 4479 + }, + { + "epoch": 7.12241653418124, + "grad_norm": 2.0786906037732327, + "learning_rate": 4.63338087262755e-05, + "loss": 0.125, + "step": 4480 + }, + { + "epoch": 7.124006359300477, + "grad_norm": 3.142548082285499, + "learning_rate": 4.6338309673987106e-05, + "loss": 0.1819, + "step": 4481 + }, + { + "epoch": 7.125596184419714, + "grad_norm": 2.1055150817651214, + "learning_rate": 4.634280847641039e-05, + "loss": 0.1843, + "step": 4482 + }, + { + "epoch": 7.127186009538951, + "grad_norm": 2.056976264873237, + "learning_rate": 4.634730513202268e-05, + "loss": 0.1224, + "step": 4483 + }, + { + "epoch": 7.128775834658188, + "grad_norm": 2.2468688521273537, + "learning_rate": 4.635179963930201e-05, + "loss": 0.1819, + "step": 4484 + }, + { + "epoch": 7.130365659777424, + "grad_norm": 2.0236323052429412, + "learning_rate": 4.6356291996727166e-05, + "loss": 0.2021, + "step": 4485 + }, + { + "epoch": 7.131955484896661, + "grad_norm": 1.742066523207537, + "learning_rate": 4.6360782202777644e-05, + "loss": 0.1904, + "step": 4486 + }, + { + "epoch": 7.133545310015898, + "grad_norm": 2.6685780084474047, + "learning_rate": 4.636527025593366e-05, + "loss": 0.144, + "step": 4487 + }, + { + "epoch": 7.135135135135135, + "grad_norm": 2.0984672572286844, + "learning_rate": 4.636975615467618e-05, + "loss": 0.2154, + "step": 4488 + }, + { + "epoch": 7.136724960254372, + "grad_norm": 2.1125565860310225, + "learning_rate": 4.63742398974869e-05, + "loss": 0.1504, + "step": 4489 + }, + { + "epoch": 7.138314785373609, + "grad_norm": 1.0770710762230538, + "learning_rate": 4.637872148284821e-05, + "loss": 0.1163, + "step": 4490 + }, + { + "epoch": 7.139904610492846, + "grad_norm": 2.0214139607607136, + "learning_rate": 4.638320090924328e-05, + "loss": 0.1236, + "step": 4491 + }, + { + "epoch": 7.141494435612083, + "grad_norm": 2.133888777348414, + "learning_rate": 4.638767817515598e-05, + "loss": 0.1889, + "step": 4492 + }, + { + "epoch": 7.143084260731319, + "grad_norm": 2.2862250273158886, + "learning_rate": 4.639215327907091e-05, + "loss": 0.1889, + "step": 4493 + }, + { + "epoch": 7.144674085850556, + "grad_norm": 1.6071193960483956, + "learning_rate": 4.639662621947341e-05, + "loss": 0.1834, + "step": 4494 + }, + { + "epoch": 7.146263910969793, + "grad_norm": 2.48355387305992, + "learning_rate": 4.6401096994849556e-05, + "loss": 0.2011, + "step": 4495 + }, + { + "epoch": 7.14785373608903, + "grad_norm": 2.757952979321054, + "learning_rate": 4.6405565603686154e-05, + "loss": 0.1253, + "step": 4496 + }, + { + "epoch": 7.149443561208267, + "grad_norm": 2.0412122511600232, + "learning_rate": 4.641003204447073e-05, + "loss": 0.2129, + "step": 4497 + }, + { + "epoch": 7.151033386327504, + "grad_norm": 2.3407493754177686, + "learning_rate": 4.641449631569158e-05, + "loss": 0.2297, + "step": 4498 + }, + { + "epoch": 7.152623211446741, + "grad_norm": 2.243899694451982, + "learning_rate": 4.641895841583769e-05, + "loss": 0.154, + "step": 4499 + }, + { + "epoch": 7.154213036565977, + "grad_norm": 2.0682966870310544, + "learning_rate": 4.64234183433988e-05, + "loss": 0.1617, + "step": 4500 + }, + { + "epoch": 7.155802861685214, + "grad_norm": 2.875190177929625, + "learning_rate": 4.64278760968654e-05, + "loss": 0.2694, + "step": 4501 + }, + { + "epoch": 7.157392686804451, + "grad_norm": 1.4901338481348805, + "learning_rate": 4.643233167472868e-05, + "loss": 0.1428, + "step": 4502 + }, + { + "epoch": 7.158982511923688, + "grad_norm": 2.6961165905314775, + "learning_rate": 4.6436785075480605e-05, + "loss": 0.2041, + "step": 4503 + }, + { + "epoch": 7.160572337042925, + "grad_norm": 1.6903660025169973, + "learning_rate": 4.6441236297613866e-05, + "loss": 0.2268, + "step": 4504 + }, + { + "epoch": 7.162162162162162, + "grad_norm": 3.356744206964435, + "learning_rate": 4.6445685339621865e-05, + "loss": 0.178, + "step": 4505 + }, + { + "epoch": 7.163751987281399, + "grad_norm": 1.752625616973835, + "learning_rate": 4.645013219999878e-05, + "loss": 0.1615, + "step": 4506 + }, + { + "epoch": 7.165341812400636, + "grad_norm": 4.335386723596389, + "learning_rate": 4.645457687723951e-05, + "loss": 0.3305, + "step": 4507 + }, + { + "epoch": 7.166931637519872, + "grad_norm": 3.124655167018783, + "learning_rate": 4.645901936983968e-05, + "loss": 0.1385, + "step": 4508 + }, + { + "epoch": 7.168521462639109, + "grad_norm": 2.3954837278173526, + "learning_rate": 4.646345967629567e-05, + "loss": 0.2005, + "step": 4509 + }, + { + "epoch": 7.170111287758346, + "grad_norm": 3.2038118335673436, + "learning_rate": 4.64678977951046e-05, + "loss": 0.1328, + "step": 4510 + }, + { + "epoch": 7.171701112877583, + "grad_norm": 5.0895508949764645, + "learning_rate": 4.647233372476433e-05, + "loss": 0.228, + "step": 4511 + }, + { + "epoch": 7.17329093799682, + "grad_norm": 1.5332138122199175, + "learning_rate": 4.647676746377345e-05, + "loss": 0.1768, + "step": 4512 + }, + { + "epoch": 7.174880763116057, + "grad_norm": 5.624882137856937, + "learning_rate": 4.648119901063131e-05, + "loss": 0.1855, + "step": 4513 + }, + { + "epoch": 7.176470588235294, + "grad_norm": 3.313121846398208, + "learning_rate": 4.6485628363837986e-05, + "loss": 0.1637, + "step": 4514 + }, + { + "epoch": 7.178060413354531, + "grad_norm": 3.757883602821559, + "learning_rate": 4.64900555218943e-05, + "loss": 0.2054, + "step": 4515 + }, + { + "epoch": 7.1796502384737675, + "grad_norm": 4.132407897837306, + "learning_rate": 4.649448048330183e-05, + "loss": 0.2097, + "step": 4516 + }, + { + "epoch": 7.1812400635930045, + "grad_norm": 4.357569013118494, + "learning_rate": 4.6498903246562886e-05, + "loss": 0.1245, + "step": 4517 + }, + { + "epoch": 7.1828298887122415, + "grad_norm": 3.976686184216191, + "learning_rate": 4.650332381018051e-05, + "loss": 0.2748, + "step": 4518 + }, + { + "epoch": 7.1844197138314785, + "grad_norm": 4.293871509086424, + "learning_rate": 4.650774217265851e-05, + "loss": 0.1487, + "step": 4519 + }, + { + "epoch": 7.1860095389507155, + "grad_norm": 3.5536632924492566, + "learning_rate": 4.6512158332501425e-05, + "loss": 0.1902, + "step": 4520 + }, + { + "epoch": 7.1875993640699525, + "grad_norm": 1.7266418325776576, + "learning_rate": 4.651657228821455e-05, + "loss": 0.1493, + "step": 4521 + }, + { + "epoch": 7.1891891891891895, + "grad_norm": 4.474490394279598, + "learning_rate": 4.652098403830393e-05, + "loss": 0.2612, + "step": 4522 + }, + { + "epoch": 7.1907790143084265, + "grad_norm": 4.417410224420178, + "learning_rate": 4.652539358127632e-05, + "loss": 0.1635, + "step": 4523 + }, + { + "epoch": 7.192368839427663, + "grad_norm": 2.4883423154002555, + "learning_rate": 4.652980091563927e-05, + "loss": 0.2763, + "step": 4524 + }, + { + "epoch": 7.1939586645469, + "grad_norm": 29.478519091687346, + "learning_rate": 4.653420603990106e-05, + "loss": 7.5321, + "step": 4525 + }, + { + "epoch": 7.195548489666137, + "grad_norm": 2.4686447863051715, + "learning_rate": 4.65386089525707e-05, + "loss": 0.2099, + "step": 4526 + }, + { + "epoch": 7.197138314785374, + "grad_norm": 1.6642467885580376, + "learning_rate": 4.654300965215797e-05, + "loss": 0.0947, + "step": 4527 + }, + { + "epoch": 7.198728139904611, + "grad_norm": 2.758781500507689, + "learning_rate": 4.6547408137173404e-05, + "loss": 0.258, + "step": 4528 + }, + { + "epoch": 7.200317965023848, + "grad_norm": 2.781808904715662, + "learning_rate": 4.655180440612825e-05, + "loss": 0.1615, + "step": 4529 + }, + { + "epoch": 7.201907790143085, + "grad_norm": 3.981821797076212, + "learning_rate": 4.655619845753456e-05, + "loss": 0.1963, + "step": 4530 + }, + { + "epoch": 7.203497615262322, + "grad_norm": 2.0165405847180686, + "learning_rate": 4.656059028990507e-05, + "loss": 0.187, + "step": 4531 + }, + { + "epoch": 7.205087440381558, + "grad_norm": 4.24364445646243, + "learning_rate": 4.6564979901753344e-05, + "loss": 0.157, + "step": 4532 + }, + { + "epoch": 7.206677265500795, + "grad_norm": 2.904758145169337, + "learning_rate": 4.6569367291593624e-05, + "loss": 0.2314, + "step": 4533 + }, + { + "epoch": 7.208267090620032, + "grad_norm": 2.565064631691079, + "learning_rate": 4.657375245794096e-05, + "loss": 0.1332, + "step": 4534 + }, + { + "epoch": 7.209856915739269, + "grad_norm": 3.683076427285052, + "learning_rate": 4.6578135399311114e-05, + "loss": 0.2722, + "step": 4535 + }, + { + "epoch": 7.211446740858506, + "grad_norm": 54.16715047219509, + "learning_rate": 4.658251611422064e-05, + "loss": 3.4737, + "step": 4536 + }, + { + "epoch": 7.213036565977743, + "grad_norm": 3.10510466285117, + "learning_rate": 4.658689460118681e-05, + "loss": 0.154, + "step": 4537 + }, + { + "epoch": 7.21462639109698, + "grad_norm": 2.5116271294917323, + "learning_rate": 4.659127085872766e-05, + "loss": 0.1641, + "step": 4538 + }, + { + "epoch": 7.216216216216216, + "grad_norm": 48.16702850486449, + "learning_rate": 4.6595644885362e-05, + "loss": 5.3855, + "step": 4539 + }, + { + "epoch": 7.217806041335453, + "grad_norm": 3.501931745885533, + "learning_rate": 4.660001667960937e-05, + "loss": 0.2109, + "step": 4540 + }, + { + "epoch": 7.21939586645469, + "grad_norm": 2.3998930765295463, + "learning_rate": 4.6604386239990074e-05, + "loss": 0.2027, + "step": 4541 + }, + { + "epoch": 7.220985691573927, + "grad_norm": 3.6958479719269293, + "learning_rate": 4.660875356502519e-05, + "loss": 0.2279, + "step": 4542 + }, + { + "epoch": 7.222575516693164, + "grad_norm": 2.344503837369767, + "learning_rate": 4.661311865323652e-05, + "loss": 0.2033, + "step": 4543 + }, + { + "epoch": 7.224165341812401, + "grad_norm": 1.5747479415327035, + "learning_rate": 4.6617481503146644e-05, + "loss": 0.1453, + "step": 4544 + }, + { + "epoch": 7.225755166931638, + "grad_norm": 2.615810965808244, + "learning_rate": 4.6621842113278896e-05, + "loss": 0.1692, + "step": 4545 + }, + { + "epoch": 7.227344992050875, + "grad_norm": 3.511344066573996, + "learning_rate": 4.6626200482157375e-05, + "loss": 0.1524, + "step": 4546 + }, + { + "epoch": 7.228934817170111, + "grad_norm": 1.9898296669332898, + "learning_rate": 4.663055660830692e-05, + "loss": 0.1211, + "step": 4547 + }, + { + "epoch": 7.230524642289348, + "grad_norm": 3.318262980739692, + "learning_rate": 4.6634910490253146e-05, + "loss": 0.1752, + "step": 4548 + }, + { + "epoch": 7.232114467408585, + "grad_norm": 46.65507460527682, + "learning_rate": 4.6639262126522425e-05, + "loss": 2.5714, + "step": 4549 + }, + { + "epoch": 7.233704292527822, + "grad_norm": 3.099797153453075, + "learning_rate": 4.664361151564186e-05, + "loss": 0.1549, + "step": 4550 + }, + { + "epoch": 7.235294117647059, + "grad_norm": 1.9333214538393355, + "learning_rate": 4.6647958656139385e-05, + "loss": 0.1059, + "step": 4551 + }, + { + "epoch": 7.236883942766296, + "grad_norm": 2.518635857167061, + "learning_rate": 4.6652303546543614e-05, + "loss": 0.1104, + "step": 4552 + }, + { + "epoch": 7.238473767885533, + "grad_norm": 1.998129639297619, + "learning_rate": 4.6656646185383964e-05, + "loss": 0.1105, + "step": 4553 + }, + { + "epoch": 7.24006359300477, + "grad_norm": 2.034320725694436, + "learning_rate": 4.6660986571190625e-05, + "loss": 0.165, + "step": 4554 + }, + { + "epoch": 7.241653418124006, + "grad_norm": 2.525889972094502, + "learning_rate": 4.666532470249453e-05, + "loss": 0.2118, + "step": 4555 + }, + { + "epoch": 7.243243243243243, + "grad_norm": 3.1451120278009235, + "learning_rate": 4.666966057782736e-05, + "loss": 0.2127, + "step": 4556 + }, + { + "epoch": 7.24483306836248, + "grad_norm": 2.1200454228492838, + "learning_rate": 4.66739941957216e-05, + "loss": 0.177, + "step": 4557 + }, + { + "epoch": 7.246422893481717, + "grad_norm": 2.1571405463157487, + "learning_rate": 4.6678325554710464e-05, + "loss": 0.2235, + "step": 4558 + }, + { + "epoch": 7.248012718600954, + "grad_norm": 1.766101336864548, + "learning_rate": 4.668265465332796e-05, + "loss": 0.1308, + "step": 4559 + }, + { + "epoch": 7.249602543720191, + "grad_norm": 1.4953758259828855, + "learning_rate": 4.6686981490108825e-05, + "loss": 0.171, + "step": 4560 + }, + { + "epoch": 7.251192368839428, + "grad_norm": 2.3193787160007115, + "learning_rate": 4.669130606358858e-05, + "loss": 0.1828, + "step": 4561 + }, + { + "epoch": 7.252782193958664, + "grad_norm": 1.7040094721142731, + "learning_rate": 4.669562837230354e-05, + "loss": 0.1829, + "step": 4562 + }, + { + "epoch": 7.254372019077901, + "grad_norm": 2.4303965512409773, + "learning_rate": 4.6699948414790734e-05, + "loss": 0.1362, + "step": 4563 + }, + { + "epoch": 7.255961844197138, + "grad_norm": 1.8243774351961926, + "learning_rate": 4.670426618958799e-05, + "loss": 0.1051, + "step": 4564 + }, + { + "epoch": 7.257551669316375, + "grad_norm": 1.9669435625129836, + "learning_rate": 4.670858169523391e-05, + "loss": 0.156, + "step": 4565 + }, + { + "epoch": 7.259141494435612, + "grad_norm": 3.0069495962431123, + "learning_rate": 4.671289493026784e-05, + "loss": 0.1728, + "step": 4566 + }, + { + "epoch": 7.260731319554849, + "grad_norm": 2.1773407115075503, + "learning_rate": 4.67172058932299e-05, + "loss": 0.1501, + "step": 4567 + }, + { + "epoch": 7.262321144674086, + "grad_norm": 1.9622327169783964, + "learning_rate": 4.672151458266101e-05, + "loss": 0.1545, + "step": 4568 + }, + { + "epoch": 7.263910969793323, + "grad_norm": 2.1373689819462247, + "learning_rate": 4.6725820997102805e-05, + "loss": 0.2674, + "step": 4569 + }, + { + "epoch": 7.26550079491256, + "grad_norm": 1.5441598754945203, + "learning_rate": 4.6730125135097733e-05, + "loss": 0.163, + "step": 4570 + }, + { + "epoch": 7.267090620031796, + "grad_norm": 2.4432448792908215, + "learning_rate": 4.673442699518901e-05, + "loss": 0.1452, + "step": 4571 + }, + { + "epoch": 7.268680445151033, + "grad_norm": 3.015236339153733, + "learning_rate": 4.673872657592059e-05, + "loss": 0.207, + "step": 4572 + }, + { + "epoch": 7.27027027027027, + "grad_norm": 4.251574198317745, + "learning_rate": 4.674302387583724e-05, + "loss": 0.2539, + "step": 4573 + }, + { + "epoch": 7.271860095389507, + "grad_norm": 3.0047200768079905, + "learning_rate": 4.674731889348446e-05, + "loss": 0.1247, + "step": 4574 + }, + { + "epoch": 7.273449920508744, + "grad_norm": 3.418098328142938, + "learning_rate": 4.6751611627408564e-05, + "loss": 0.2613, + "step": 4575 + }, + { + "epoch": 7.275039745627981, + "grad_norm": 1.5499181878844996, + "learning_rate": 4.6755902076156606e-05, + "loss": 0.1251, + "step": 4576 + }, + { + "epoch": 7.276629570747218, + "grad_norm": 4.368871661218968, + "learning_rate": 4.6760190238276425e-05, + "loss": 0.1487, + "step": 4577 + }, + { + "epoch": 7.278219395866454, + "grad_norm": 1.592053376494794, + "learning_rate": 4.676447611231663e-05, + "loss": 0.1436, + "step": 4578 + }, + { + "epoch": 7.279809220985691, + "grad_norm": 3.0537116890969402, + "learning_rate": 4.676875969682661e-05, + "loss": 0.1261, + "step": 4579 + }, + { + "epoch": 7.281399046104928, + "grad_norm": 4.222513577643903, + "learning_rate": 4.677304099035653e-05, + "loss": 0.2222, + "step": 4580 + }, + { + "epoch": 7.282988871224165, + "grad_norm": 2.1954841909472114, + "learning_rate": 4.6777319991457325e-05, + "loss": 0.1677, + "step": 4581 + }, + { + "epoch": 7.284578696343402, + "grad_norm": 3.9065208069911943, + "learning_rate": 4.6781596698680705e-05, + "loss": 0.1077, + "step": 4582 + }, + { + "epoch": 7.286168521462639, + "grad_norm": 2.830472452397222, + "learning_rate": 4.6785871110579165e-05, + "loss": 0.1731, + "step": 4583 + }, + { + "epoch": 7.287758346581876, + "grad_norm": 2.1980108570697316, + "learning_rate": 4.679014322570597e-05, + "loss": 0.183, + "step": 4584 + }, + { + "epoch": 7.289348171701113, + "grad_norm": 2.4333176520245265, + "learning_rate": 4.6794413042615165e-05, + "loss": 0.1908, + "step": 4585 + }, + { + "epoch": 7.290937996820349, + "grad_norm": 3.23592945826406, + "learning_rate": 4.6798680559861566e-05, + "loss": 0.1674, + "step": 4586 + }, + { + "epoch": 7.292527821939586, + "grad_norm": 1.4252833125469218, + "learning_rate": 4.680294577600078e-05, + "loss": 0.1904, + "step": 4587 + }, + { + "epoch": 7.294117647058823, + "grad_norm": 1.6806088353766642, + "learning_rate": 4.680720868958918e-05, + "loss": 0.1284, + "step": 4588 + }, + { + "epoch": 7.29570747217806, + "grad_norm": 91.5272230814426, + "learning_rate": 4.681146929918392e-05, + "loss": 6.4066, + "step": 4589 + }, + { + "epoch": 7.297297297297297, + "grad_norm": 2.4930198078774057, + "learning_rate": 4.681572760334296e-05, + "loss": 0.1557, + "step": 4590 + }, + { + "epoch": 7.298887122416534, + "grad_norm": 2.914012020511383, + "learning_rate": 4.681998360062499e-05, + "loss": 0.1534, + "step": 4591 + }, + { + "epoch": 7.300476947535771, + "grad_norm": 1.3041471798428454, + "learning_rate": 4.6824237289589525e-05, + "loss": 0.1732, + "step": 4592 + }, + { + "epoch": 7.302066772655008, + "grad_norm": 2.209648591398421, + "learning_rate": 4.682848866879683e-05, + "loss": 0.1564, + "step": 4593 + }, + { + "epoch": 7.3036565977742445, + "grad_norm": 2.7413448973005368, + "learning_rate": 4.6832737736808e-05, + "loss": 0.2145, + "step": 4594 + }, + { + "epoch": 7.3052464228934815, + "grad_norm": 1.8186951921757886, + "learning_rate": 4.683698449218484e-05, + "loss": 0.1396, + "step": 4595 + }, + { + "epoch": 7.3068362480127185, + "grad_norm": 63.057714270396716, + "learning_rate": 4.6841228933490005e-05, + "loss": 2.6173, + "step": 4596 + }, + { + "epoch": 7.3084260731319555, + "grad_norm": 61.49505222251891, + "learning_rate": 4.684547105928689e-05, + "loss": 7.8309, + "step": 4597 + }, + { + "epoch": 7.3100158982511925, + "grad_norm": 228.71112477038872, + "learning_rate": 4.6849710868139694e-05, + "loss": 19.6757, + "step": 4598 + }, + { + "epoch": 7.3116057233704295, + "grad_norm": 4.250878005395424, + "learning_rate": 4.6853948358613394e-05, + "loss": 0.1567, + "step": 4599 + }, + { + "epoch": 7.3131955484896665, + "grad_norm": 15.046421577595163, + "learning_rate": 4.6858183529273767e-05, + "loss": 1.7316, + "step": 4600 + }, + { + "epoch": 7.314785373608903, + "grad_norm": 1.5836214299328233, + "learning_rate": 4.686241637868734e-05, + "loss": 0.1196, + "step": 4601 + }, + { + "epoch": 7.31637519872814, + "grad_norm": 2.228720962393585, + "learning_rate": 4.686664690542145e-05, + "loss": 0.1423, + "step": 4602 + }, + { + "epoch": 7.317965023847377, + "grad_norm": 1.7984626796609704, + "learning_rate": 4.687087510804423e-05, + "loss": 0.159, + "step": 4603 + }, + { + "epoch": 7.319554848966614, + "grad_norm": 2.2236302397086654, + "learning_rate": 4.687510098512458e-05, + "loss": 0.1883, + "step": 4604 + }, + { + "epoch": 7.321144674085851, + "grad_norm": 1.9213632995119332, + "learning_rate": 4.687932453523219e-05, + "loss": 0.171, + "step": 4605 + }, + { + "epoch": 7.322734499205088, + "grad_norm": 2.66377980090811, + "learning_rate": 4.6883545756937545e-05, + "loss": 0.1901, + "step": 4606 + }, + { + "epoch": 7.324324324324325, + "grad_norm": 1.68682227718761, + "learning_rate": 4.6887764648811906e-05, + "loss": 0.1156, + "step": 4607 + }, + { + "epoch": 7.325914149443562, + "grad_norm": 1.5477636587500019, + "learning_rate": 4.6891981209427343e-05, + "loss": 0.1623, + "step": 4608 + }, + { + "epoch": 7.327503974562799, + "grad_norm": 2.77993159213339, + "learning_rate": 4.689619543735671e-05, + "loss": 0.2013, + "step": 4609 + }, + { + "epoch": 7.329093799682035, + "grad_norm": 2.2536050789889392, + "learning_rate": 4.690040733117361e-05, + "loss": 0.1772, + "step": 4610 + }, + { + "epoch": 7.330683624801272, + "grad_norm": 1.6287876871319098, + "learning_rate": 4.69046168894525e-05, + "loss": 0.2, + "step": 4611 + }, + { + "epoch": 7.332273449920509, + "grad_norm": 1.8028030178081373, + "learning_rate": 4.6908824110768585e-05, + "loss": 0.1228, + "step": 4612 + }, + { + "epoch": 7.333863275039746, + "grad_norm": 1826.6907209016326, + "learning_rate": 4.691302899369788e-05, + "loss": 5.8109, + "step": 4613 + }, + { + "epoch": 7.335453100158983, + "grad_norm": 834.6858736953902, + "learning_rate": 4.6917231536817176e-05, + "loss": 7.4766, + "step": 4614 + }, + { + "epoch": 7.33704292527822, + "grad_norm": 2.200708696499558, + "learning_rate": 4.692143173870407e-05, + "loss": 0.1707, + "step": 4615 + }, + { + "epoch": 7.338632750397457, + "grad_norm": 2.5438678070580107, + "learning_rate": 4.692562959793694e-05, + "loss": 0.1418, + "step": 4616 + }, + { + "epoch": 7.340222575516693, + "grad_norm": 2.653523601142218, + "learning_rate": 4.692982511309498e-05, + "loss": 0.2261, + "step": 4617 + }, + { + "epoch": 7.34181240063593, + "grad_norm": 1.9092335906747837, + "learning_rate": 4.6934018282758135e-05, + "loss": 0.1243, + "step": 4618 + }, + { + "epoch": 7.343402225755167, + "grad_norm": 2.6153331513227958, + "learning_rate": 4.6938209105507185e-05, + "loss": 0.1641, + "step": 4619 + }, + { + "epoch": 7.344992050874404, + "grad_norm": 3.342657781777808, + "learning_rate": 4.694239757992368e-05, + "loss": 0.2476, + "step": 4620 + }, + { + "epoch": 7.346581875993641, + "grad_norm": 2.1995604934121284, + "learning_rate": 4.6946583704589973e-05, + "loss": 0.1753, + "step": 4621 + }, + { + "epoch": 7.348171701112878, + "grad_norm": 3.7877139236335178, + "learning_rate": 4.695076747808923e-05, + "loss": 0.252, + "step": 4622 + }, + { + "epoch": 7.349761526232115, + "grad_norm": 5.11658136382788, + "learning_rate": 4.695494889900536e-05, + "loss": 0.1697, + "step": 4623 + }, + { + "epoch": 7.351351351351352, + "grad_norm": 3.333462480028068, + "learning_rate": 4.6959127965923145e-05, + "loss": 0.1513, + "step": 4624 + }, + { + "epoch": 7.352941176470588, + "grad_norm": 3.442470478617867, + "learning_rate": 4.69633046774281e-05, + "loss": 0.2073, + "step": 4625 + }, + { + "epoch": 7.354531001589825, + "grad_norm": 2.153235332540995, + "learning_rate": 4.696747903210655e-05, + "loss": 0.1435, + "step": 4626 + }, + { + "epoch": 7.356120826709062, + "grad_norm": 3.7531558836767203, + "learning_rate": 4.697165102854565e-05, + "loss": 0.1764, + "step": 4627 + }, + { + "epoch": 7.357710651828299, + "grad_norm": 1.905212809648395, + "learning_rate": 4.6975820665333314e-05, + "loss": 0.1972, + "step": 4628 + }, + { + "epoch": 7.359300476947536, + "grad_norm": 3.1282838603787813, + "learning_rate": 4.697998794105827e-05, + "loss": 0.214, + "step": 4629 + }, + { + "epoch": 7.360890302066773, + "grad_norm": 3.986941257135869, + "learning_rate": 4.6984152854310057e-05, + "loss": 0.1663, + "step": 4630 + }, + { + "epoch": 7.36248012718601, + "grad_norm": 2.5789731102464764, + "learning_rate": 4.6988315403679e-05, + "loss": 0.2061, + "step": 4631 + }, + { + "epoch": 7.364069952305247, + "grad_norm": 3.478378498736349, + "learning_rate": 4.699247558775622e-05, + "loss": 0.1679, + "step": 4632 + }, + { + "epoch": 7.365659777424483, + "grad_norm": 3.1052506188008584, + "learning_rate": 4.6996633405133657e-05, + "loss": 0.1934, + "step": 4633 + }, + { + "epoch": 7.36724960254372, + "grad_norm": 1.836535369394707, + "learning_rate": 4.7000788854404024e-05, + "loss": 0.1763, + "step": 4634 + }, + { + "epoch": 7.368839427662957, + "grad_norm": 2.9097823684859665, + "learning_rate": 4.700494193416087e-05, + "loss": 0.2084, + "step": 4635 + }, + { + "epoch": 7.370429252782194, + "grad_norm": 3.8700946065124473, + "learning_rate": 4.7009092642998514e-05, + "loss": 0.1972, + "step": 4636 + }, + { + "epoch": 7.372019077901431, + "grad_norm": 4.103456867416233, + "learning_rate": 4.7013240979512094e-05, + "loss": 0.2149, + "step": 4637 + }, + { + "epoch": 7.373608903020668, + "grad_norm": 2.7190520942942986, + "learning_rate": 4.701738694229755e-05, + "loss": 0.1682, + "step": 4638 + }, + { + "epoch": 7.375198728139905, + "grad_norm": 3.456693748029552, + "learning_rate": 4.702153052995163e-05, + "loss": 0.1748, + "step": 4639 + }, + { + "epoch": 7.376788553259141, + "grad_norm": 3.123671088805879, + "learning_rate": 4.702567174107186e-05, + "loss": 0.1895, + "step": 4640 + }, + { + "epoch": 7.378378378378378, + "grad_norm": 3.06799377971188, + "learning_rate": 4.702981057425662e-05, + "loss": 0.161, + "step": 4641 + }, + { + "epoch": 7.379968203497615, + "grad_norm": 3.1111318143528375, + "learning_rate": 4.703394702810504e-05, + "loss": 0.1696, + "step": 4642 + }, + { + "epoch": 7.381558028616852, + "grad_norm": 3.938155534936393, + "learning_rate": 4.703808110121709e-05, + "loss": 0.1665, + "step": 4643 + }, + { + "epoch": 7.383147853736089, + "grad_norm": 4.789806348871144, + "learning_rate": 4.7042212792193535e-05, + "loss": 0.2065, + "step": 4644 + }, + { + "epoch": 7.384737678855326, + "grad_norm": 4.330853948569379, + "learning_rate": 4.704634209963595e-05, + "loss": 0.2924, + "step": 4645 + }, + { + "epoch": 7.386327503974563, + "grad_norm": 4.276456178120317, + "learning_rate": 4.705046902214671e-05, + "loss": 0.2048, + "step": 4646 + }, + { + "epoch": 7.3879173290938, + "grad_norm": 4.8027921699504255, + "learning_rate": 4.705459355832899e-05, + "loss": 0.2169, + "step": 4647 + }, + { + "epoch": 7.389507154213036, + "grad_norm": 3.1552031406340415, + "learning_rate": 4.705871570678681e-05, + "loss": 0.1258, + "step": 4648 + }, + { + "epoch": 7.391096979332273, + "grad_norm": 1.5820221026350458, + "learning_rate": 4.706283546612496e-05, + "loss": 0.1533, + "step": 4649 + }, + { + "epoch": 7.39268680445151, + "grad_norm": 3.5949954804990085, + "learning_rate": 4.7066952834949044e-05, + "loss": 0.2635, + "step": 4650 + }, + { + "epoch": 7.394276629570747, + "grad_norm": 2.435206769591396, + "learning_rate": 4.7071067811865475e-05, + "loss": 0.2243, + "step": 4651 + }, + { + "epoch": 7.395866454689984, + "grad_norm": 2.0229113379514305, + "learning_rate": 4.7075180395481504e-05, + "loss": 0.1718, + "step": 4652 + }, + { + "epoch": 7.397456279809221, + "grad_norm": 3.3185192368035623, + "learning_rate": 4.707929058440516e-05, + "loss": 0.1916, + "step": 4653 + }, + { + "epoch": 7.399046104928458, + "grad_norm": 4.629987550376275, + "learning_rate": 4.708339837724529e-05, + "loss": 0.1829, + "step": 4654 + }, + { + "epoch": 7.400635930047695, + "grad_norm": 2.625168033516052, + "learning_rate": 4.7087503772611556e-05, + "loss": 0.1833, + "step": 4655 + }, + { + "epoch": 7.402225755166931, + "grad_norm": 2.6160258975796093, + "learning_rate": 4.709160676911444e-05, + "loss": 0.1739, + "step": 4656 + }, + { + "epoch": 7.403815580286168, + "grad_norm": 2.010846845540932, + "learning_rate": 4.709570736536521e-05, + "loss": 0.1583, + "step": 4657 + }, + { + "epoch": 7.405405405405405, + "grad_norm": 3.6716892066463918, + "learning_rate": 4.7099805559975975e-05, + "loss": 0.1493, + "step": 4658 + }, + { + "epoch": 7.406995230524642, + "grad_norm": 2.993739993523424, + "learning_rate": 4.710390135155964e-05, + "loss": 0.145, + "step": 4659 + }, + { + "epoch": 7.408585055643879, + "grad_norm": 3.5255149338371963, + "learning_rate": 4.7107994738729926e-05, + "loss": 0.1443, + "step": 4660 + }, + { + "epoch": 7.410174880763116, + "grad_norm": 2.947498540360932, + "learning_rate": 4.711208572010137e-05, + "loss": 0.1519, + "step": 4661 + }, + { + "epoch": 7.411764705882353, + "grad_norm": 1.4237328886242548, + "learning_rate": 4.7116174294289336e-05, + "loss": 0.1358, + "step": 4662 + }, + { + "epoch": 7.413354531001589, + "grad_norm": 2.2217210986354923, + "learning_rate": 4.712026045990997e-05, + "loss": 0.1626, + "step": 4663 + }, + { + "epoch": 7.414944356120826, + "grad_norm": 1.7180844920916618, + "learning_rate": 4.712434421558026e-05, + "loss": 0.1533, + "step": 4664 + }, + { + "epoch": 7.416534181240063, + "grad_norm": 4.908859392498175, + "learning_rate": 4.712842555991801e-05, + "loss": 0.1947, + "step": 4665 + }, + { + "epoch": 7.4181240063593, + "grad_norm": 4.269436482654356, + "learning_rate": 4.713250449154181e-05, + "loss": 0.157, + "step": 4666 + }, + { + "epoch": 7.419713831478537, + "grad_norm": 3.124987618485747, + "learning_rate": 4.7136581009071127e-05, + "loss": 0.1687, + "step": 4667 + }, + { + "epoch": 7.421303656597774, + "grad_norm": 1.514799967021865, + "learning_rate": 4.714065511112618e-05, + "loss": 0.129, + "step": 4668 + }, + { + "epoch": 7.422893481717011, + "grad_norm": 3.3932292283602203, + "learning_rate": 4.714472679632803e-05, + "loss": 0.1591, + "step": 4669 + }, + { + "epoch": 7.424483306836248, + "grad_norm": 2.6906369998068262, + "learning_rate": 4.714879606329858e-05, + "loss": 0.1978, + "step": 4670 + }, + { + "epoch": 7.426073131955485, + "grad_norm": 2.934327035483828, + "learning_rate": 4.7152862910660514e-05, + "loss": 0.1479, + "step": 4671 + }, + { + "epoch": 7.4276629570747215, + "grad_norm": 3.9816725066451153, + "learning_rate": 4.715692733703736e-05, + "loss": 0.2408, + "step": 4672 + }, + { + "epoch": 7.4292527821939585, + "grad_norm": 2.3343490397692452, + "learning_rate": 4.7160989341053453e-05, + "loss": 0.1978, + "step": 4673 + }, + { + "epoch": 7.4308426073131955, + "grad_norm": 2.227151952114795, + "learning_rate": 4.716504892133394e-05, + "loss": 0.1706, + "step": 4674 + }, + { + "epoch": 7.4324324324324325, + "grad_norm": 2.682389220129188, + "learning_rate": 4.716910607650483e-05, + "loss": 0.1567, + "step": 4675 + }, + { + "epoch": 7.4340222575516695, + "grad_norm": 3.175203996554223, + "learning_rate": 4.7173160805192896e-05, + "loss": 0.1393, + "step": 4676 + }, + { + "epoch": 7.4356120826709065, + "grad_norm": 2.004516905179918, + "learning_rate": 4.7177213106025765e-05, + "loss": 0.1854, + "step": 4677 + }, + { + "epoch": 7.4372019077901435, + "grad_norm": 3.7459321604305007, + "learning_rate": 4.718126297763189e-05, + "loss": 0.1993, + "step": 4678 + }, + { + "epoch": 7.43879173290938, + "grad_norm": 1.2376547117061463, + "learning_rate": 4.718531041864052e-05, + "loss": 0.1592, + "step": 4679 + }, + { + "epoch": 7.440381558028617, + "grad_norm": 2.280954771931182, + "learning_rate": 4.7189355427681764e-05, + "loss": 0.1661, + "step": 4680 + }, + { + "epoch": 7.441971383147854, + "grad_norm": 1.5612257727041825, + "learning_rate": 4.7193398003386515e-05, + "loss": 0.1231, + "step": 4681 + }, + { + "epoch": 7.443561208267091, + "grad_norm": 4.951074546996653, + "learning_rate": 4.719743814438651e-05, + "loss": 0.1471, + "step": 4682 + }, + { + "epoch": 7.4451510333863276, + "grad_norm": 3.537055110729649, + "learning_rate": 4.720147584931431e-05, + "loss": 0.1275, + "step": 4683 + }, + { + "epoch": 7.4467408585055646, + "grad_norm": 4.894256462561929, + "learning_rate": 4.7205511116803306e-05, + "loss": 0.1753, + "step": 4684 + }, + { + "epoch": 7.4483306836248016, + "grad_norm": 1.8203135128724133, + "learning_rate": 4.720954394548769e-05, + "loss": 0.1668, + "step": 4685 + }, + { + "epoch": 7.4499205087440385, + "grad_norm": 5.084558279767693, + "learning_rate": 4.721357433400251e-05, + "loss": 0.2477, + "step": 4686 + }, + { + "epoch": 7.451510333863275, + "grad_norm": 5.075150722843725, + "learning_rate": 4.721760228098362e-05, + "loss": 0.113, + "step": 4687 + }, + { + "epoch": 7.453100158982512, + "grad_norm": 2.0579198230509945, + "learning_rate": 4.722162778506771e-05, + "loss": 0.1378, + "step": 4688 + }, + { + "epoch": 7.454689984101749, + "grad_norm": 5.912397767805827, + "learning_rate": 4.7225650844892286e-05, + "loss": 0.1791, + "step": 4689 + }, + { + "epoch": 7.456279809220986, + "grad_norm": 5.157151610608219, + "learning_rate": 4.7229671459095686e-05, + "loss": 0.146, + "step": 4690 + }, + { + "epoch": 7.457869634340223, + "grad_norm": 3.535675623894268, + "learning_rate": 4.723368962631708e-05, + "loss": 0.1324, + "step": 4691 + }, + { + "epoch": 7.45945945945946, + "grad_norm": 6.338322565965297, + "learning_rate": 4.723770534519647e-05, + "loss": 0.3028, + "step": 4692 + }, + { + "epoch": 7.461049284578697, + "grad_norm": 2.79837048647265, + "learning_rate": 4.7241718614374675e-05, + "loss": 0.3088, + "step": 4693 + }, + { + "epoch": 7.462639109697934, + "grad_norm": 3.364948468503576, + "learning_rate": 4.7245729432493356e-05, + "loss": 0.1631, + "step": 4694 + }, + { + "epoch": 7.46422893481717, + "grad_norm": 3.3913954708455063, + "learning_rate": 4.7249737798194976e-05, + "loss": 0.1876, + "step": 4695 + }, + { + "epoch": 7.465818759936407, + "grad_norm": 5.8944704318496886, + "learning_rate": 4.725374371012288e-05, + "loss": 0.1803, + "step": 4696 + }, + { + "epoch": 7.467408585055644, + "grad_norm": 3.095669997474016, + "learning_rate": 4.7257747166921187e-05, + "loss": 0.1588, + "step": 4697 + }, + { + "epoch": 7.468998410174881, + "grad_norm": 4.7883787985428405, + "learning_rate": 4.726174816723488e-05, + "loss": 0.1656, + "step": 4698 + }, + { + "epoch": 7.470588235294118, + "grad_norm": 3.965217845987796, + "learning_rate": 4.726574670970976e-05, + "loss": 0.1819, + "step": 4699 + }, + { + "epoch": 7.472178060413355, + "grad_norm": 3.2236672212096815, + "learning_rate": 4.7269742792992476e-05, + "loss": 0.1332, + "step": 4700 + }, + { + "epoch": 7.473767885532592, + "grad_norm": 2.2609844250944664, + "learning_rate": 4.727373641573049e-05, + "loss": 0.1736, + "step": 4701 + }, + { + "epoch": 7.475357710651828, + "grad_norm": 4.769516491322302, + "learning_rate": 4.7277727576572105e-05, + "loss": 0.1199, + "step": 4702 + }, + { + "epoch": 7.476947535771065, + "grad_norm": 3.337869102149578, + "learning_rate": 4.728171627416647e-05, + "loss": 0.1126, + "step": 4703 + }, + { + "epoch": 7.478537360890302, + "grad_norm": 2.1821135727877246, + "learning_rate": 4.728570250716353e-05, + "loss": 0.1385, + "step": 4704 + }, + { + "epoch": 7.480127186009539, + "grad_norm": 3.9755841187792282, + "learning_rate": 4.728968627421412e-05, + "loss": 0.1332, + "step": 4705 + }, + { + "epoch": 7.481717011128776, + "grad_norm": 3.6358439431351597, + "learning_rate": 4.729366757396986e-05, + "loss": 0.1671, + "step": 4706 + }, + { + "epoch": 7.483306836248013, + "grad_norm": 750.0141026126608, + "learning_rate": 4.729764640508322e-05, + "loss": 13.4396, + "step": 4707 + }, + { + "epoch": 7.48489666136725, + "grad_norm": 2.5082252208984603, + "learning_rate": 4.730162276620753e-05, + "loss": 0.1515, + "step": 4708 + }, + { + "epoch": 7.486486486486487, + "grad_norm": 3.849322461922561, + "learning_rate": 4.7305596655996916e-05, + "loss": 0.3028, + "step": 4709 + }, + { + "epoch": 7.488076311605723, + "grad_norm": 3.2936206926161433, + "learning_rate": 4.730956807310637e-05, + "loss": 0.221, + "step": 4710 + }, + { + "epoch": 7.48966613672496, + "grad_norm": 5.155836582657859, + "learning_rate": 4.731353701619171e-05, + "loss": 0.1725, + "step": 4711 + }, + { + "epoch": 7.491255961844197, + "grad_norm": 2.4761484033994847, + "learning_rate": 4.731750348390959e-05, + "loss": 0.1319, + "step": 4712 + }, + { + "epoch": 7.492845786963434, + "grad_norm": 7.997796151686728, + "learning_rate": 4.732146747491751e-05, + "loss": 0.2, + "step": 4713 + }, + { + "epoch": 7.494435612082671, + "grad_norm": 3.039593478647834, + "learning_rate": 4.732542898787379e-05, + "loss": 0.1939, + "step": 4714 + }, + { + "epoch": 7.496025437201908, + "grad_norm": 4.472770549551557, + "learning_rate": 4.732938802143762e-05, + "loss": 0.1672, + "step": 4715 + }, + { + "epoch": 7.497615262321145, + "grad_norm": 3.1513952507067096, + "learning_rate": 4.733334457426899e-05, + "loss": 0.2079, + "step": 4716 + }, + { + "epoch": 7.499205087440382, + "grad_norm": 5.458426153729237, + "learning_rate": 4.733729864502877e-05, + "loss": 0.2682, + "step": 4717 + }, + { + "epoch": 7.500794912559618, + "grad_norm": 4.226231852706508, + "learning_rate": 4.7341250232378634e-05, + "loss": 0.252, + "step": 4718 + }, + { + "epoch": 7.502384737678855, + "grad_norm": 1.7967875331770338, + "learning_rate": 4.734519933498112e-05, + "loss": 0.1878, + "step": 4719 + }, + { + "epoch": 7.503974562798092, + "grad_norm": 2.4682409598013098, + "learning_rate": 4.73491459514996e-05, + "loss": 0.2136, + "step": 4720 + }, + { + "epoch": 7.505564387917329, + "grad_norm": 2.16699606590182, + "learning_rate": 4.735309008059829e-05, + "loss": 0.1877, + "step": 4721 + }, + { + "epoch": 7.507154213036566, + "grad_norm": 1.9773407372854714, + "learning_rate": 4.735703172094223e-05, + "loss": 0.2204, + "step": 4722 + }, + { + "epoch": 7.508744038155803, + "grad_norm": 2.221847340094475, + "learning_rate": 4.736097087119734e-05, + "loss": 0.219, + "step": 4723 + }, + { + "epoch": 7.51033386327504, + "grad_norm": 2.253443471611839, + "learning_rate": 4.7364907530030355e-05, + "loss": 0.1843, + "step": 4724 + }, + { + "epoch": 7.511923688394276, + "grad_norm": 4.240762526070993, + "learning_rate": 4.736884169610884e-05, + "loss": 0.3385, + "step": 4725 + }, + { + "epoch": 7.513513513513513, + "grad_norm": 2.0539090475674966, + "learning_rate": 4.737277336810125e-05, + "loss": 0.2133, + "step": 4726 + }, + { + "epoch": 7.51510333863275, + "grad_norm": 1.4743084092590693, + "learning_rate": 4.737670254467683e-05, + "loss": 0.195, + "step": 4727 + }, + { + "epoch": 7.516693163751987, + "grad_norm": 1.7562589578841121, + "learning_rate": 4.738062922450571e-05, + "loss": 0.1281, + "step": 4728 + }, + { + "epoch": 7.518282988871224, + "grad_norm": 1.750779209530418, + "learning_rate": 4.7384553406258847e-05, + "loss": 0.2146, + "step": 4729 + }, + { + "epoch": 7.519872813990461, + "grad_norm": 1.4592913766492661, + "learning_rate": 4.738847508860804e-05, + "loss": 0.1336, + "step": 4730 + }, + { + "epoch": 7.521462639109698, + "grad_norm": 2.3438049638781915, + "learning_rate": 4.739239427022597e-05, + "loss": 0.1536, + "step": 4731 + }, + { + "epoch": 7.523052464228935, + "grad_norm": 1.5234195825683736, + "learning_rate": 4.73963109497861e-05, + "loss": 0.1792, + "step": 4732 + }, + { + "epoch": 7.524642289348172, + "grad_norm": 2.041928222211489, + "learning_rate": 4.7400225125962794e-05, + "loss": 0.1114, + "step": 4733 + }, + { + "epoch": 7.526232114467408, + "grad_norm": 1.8280261962082247, + "learning_rate": 4.7404136797431254e-05, + "loss": 0.2361, + "step": 4734 + }, + { + "epoch": 7.527821939586645, + "grad_norm": 3.36071105748803, + "learning_rate": 4.74080459628675e-05, + "loss": 0.1426, + "step": 4735 + }, + { + "epoch": 7.529411764705882, + "grad_norm": 28.649114268005768, + "learning_rate": 4.741195262094844e-05, + "loss": 4.3817, + "step": 4736 + }, + { + "epoch": 7.531001589825119, + "grad_norm": 1.601734718346111, + "learning_rate": 4.7415856770351794e-05, + "loss": 0.1504, + "step": 4737 + }, + { + "epoch": 7.532591414944356, + "grad_norm": 1.5445450752056085, + "learning_rate": 4.741975840975617e-05, + "loss": 0.1077, + "step": 4738 + }, + { + "epoch": 7.534181240063593, + "grad_norm": 1.345083505579829, + "learning_rate": 4.742365753784098e-05, + "loss": 0.1775, + "step": 4739 + }, + { + "epoch": 7.53577106518283, + "grad_norm": 1.998266539143038, + "learning_rate": 4.742755415328652e-05, + "loss": 0.1731, + "step": 4740 + }, + { + "epoch": 7.537360890302066, + "grad_norm": 1.7413664435036789, + "learning_rate": 4.7431448254773944e-05, + "loss": 0.1282, + "step": 4741 + }, + { + "epoch": 7.538950715421303, + "grad_norm": 1.783407789000656, + "learning_rate": 4.7435339840985216e-05, + "loss": 0.1558, + "step": 4742 + }, + { + "epoch": 7.54054054054054, + "grad_norm": 51.44892367431455, + "learning_rate": 4.7439228910603185e-05, + "loss": 10.1336, + "step": 4743 + }, + { + "epoch": 7.542130365659777, + "grad_norm": 1.4433244049754699, + "learning_rate": 4.7443115462311546e-05, + "loss": 0.1565, + "step": 4744 + }, + { + "epoch": 7.543720190779014, + "grad_norm": 1.6683684694506802, + "learning_rate": 4.744699949479483e-05, + "loss": 0.1369, + "step": 4745 + }, + { + "epoch": 7.545310015898251, + "grad_norm": 2.0149915466008994, + "learning_rate": 4.745088100673844e-05, + "loss": 0.1406, + "step": 4746 + }, + { + "epoch": 7.546899841017488, + "grad_norm": 1.4581996764056835, + "learning_rate": 4.745475999682863e-05, + "loss": 0.1327, + "step": 4747 + }, + { + "epoch": 7.548489666136725, + "grad_norm": 1.9748463659862572, + "learning_rate": 4.745863646375248e-05, + "loss": 0.1573, + "step": 4748 + }, + { + "epoch": 7.550079491255962, + "grad_norm": 1.30087943561732, + "learning_rate": 4.7462510406197986e-05, + "loss": 0.1129, + "step": 4749 + }, + { + "epoch": 7.5516693163751984, + "grad_norm": 2.3818264865709424, + "learning_rate": 4.7466381822853916e-05, + "loss": 0.2294, + "step": 4750 + }, + { + "epoch": 7.5532591414944354, + "grad_norm": 1.7245447384240924, + "learning_rate": 4.7470250712409964e-05, + "loss": 0.1465, + "step": 4751 + }, + { + "epoch": 7.5548489666136724, + "grad_norm": 1.2101239806431119, + "learning_rate": 4.747411707355664e-05, + "loss": 0.1489, + "step": 4752 + }, + { + "epoch": 7.556438791732909, + "grad_norm": 1.6527050953914617, + "learning_rate": 4.7477980904985316e-05, + "loss": 0.1537, + "step": 4753 + }, + { + "epoch": 7.558028616852146, + "grad_norm": 2.477854866767092, + "learning_rate": 4.748184220538824e-05, + "loss": 0.1136, + "step": 4754 + }, + { + "epoch": 7.559618441971383, + "grad_norm": 1.3041266116483894, + "learning_rate": 4.7485700973458495e-05, + "loss": 0.0911, + "step": 4755 + }, + { + "epoch": 7.56120826709062, + "grad_norm": 2.2002470913846137, + "learning_rate": 4.748955720789002e-05, + "loss": 0.2142, + "step": 4756 + }, + { + "epoch": 7.5627980922098565, + "grad_norm": 3.0736668768238786, + "learning_rate": 4.749341090737763e-05, + "loss": 0.1373, + "step": 4757 + }, + { + "epoch": 7.5643879173290935, + "grad_norm": 1.2933614306714942, + "learning_rate": 4.749726207061699e-05, + "loss": 0.1718, + "step": 4758 + }, + { + "epoch": 7.5659777424483305, + "grad_norm": 2.075661618005865, + "learning_rate": 4.75011106963046e-05, + "loss": 0.1537, + "step": 4759 + }, + { + "epoch": 7.5675675675675675, + "grad_norm": 3.482582044362261, + "learning_rate": 4.750495678313786e-05, + "loss": 0.1651, + "step": 4760 + }, + { + "epoch": 7.5691573926868045, + "grad_norm": 2.879845230783415, + "learning_rate": 4.7508800329814994e-05, + "loss": 0.1753, + "step": 4761 + }, + { + "epoch": 7.5707472178060415, + "grad_norm": 3.1621032045861934, + "learning_rate": 4.7512641335035116e-05, + "loss": 0.1303, + "step": 4762 + }, + { + "epoch": 7.5723370429252785, + "grad_norm": 1.5750307200935876, + "learning_rate": 4.751647979749817e-05, + "loss": 0.15, + "step": 4763 + }, + { + "epoch": 7.573926868044515, + "grad_norm": 3.6782968902022515, + "learning_rate": 4.752031571590499e-05, + "loss": 0.2155, + "step": 4764 + }, + { + "epoch": 7.575516693163752, + "grad_norm": 1.8957613870660304, + "learning_rate": 4.7524149088957245e-05, + "loss": 0.9014, + "step": 4765 + }, + { + "epoch": 7.577106518282989, + "grad_norm": 3.09632919471064, + "learning_rate": 4.752797991535748e-05, + "loss": 0.1138, + "step": 4766 + }, + { + "epoch": 7.578696343402226, + "grad_norm": 2.3797123359675783, + "learning_rate": 4.753180819380911e-05, + "loss": 0.1767, + "step": 4767 + }, + { + "epoch": 7.580286168521463, + "grad_norm": 2.219168271269133, + "learning_rate": 4.753563392301638e-05, + "loss": 0.1215, + "step": 4768 + }, + { + "epoch": 7.5818759936407, + "grad_norm": 4.128501093189743, + "learning_rate": 4.753945710168444e-05, + "loss": 0.1421, + "step": 4769 + }, + { + "epoch": 7.583465818759937, + "grad_norm": 3.2379415637688593, + "learning_rate": 4.754327772851926e-05, + "loss": 0.199, + "step": 4770 + }, + { + "epoch": 7.585055643879174, + "grad_norm": 3.323650024918312, + "learning_rate": 4.754709580222773e-05, + "loss": 0.1795, + "step": 4771 + }, + { + "epoch": 7.586645468998411, + "grad_norm": 2.5169014239700003, + "learning_rate": 4.755091132151753e-05, + "loss": 0.196, + "step": 4772 + }, + { + "epoch": 7.588235294117647, + "grad_norm": 2.279971932912635, + "learning_rate": 4.755472428509727e-05, + "loss": 0.1747, + "step": 4773 + }, + { + "epoch": 7.589825119236884, + "grad_norm": 3.4233366033168884, + "learning_rate": 4.75585346916764e-05, + "loss": 0.1507, + "step": 4774 + }, + { + "epoch": 7.591414944356121, + "grad_norm": 1.9389525539447425, + "learning_rate": 4.756234253996523e-05, + "loss": 0.1491, + "step": 4775 + }, + { + "epoch": 7.593004769475358, + "grad_norm": 2.7711551617651535, + "learning_rate": 4.756614782867493e-05, + "loss": 0.155, + "step": 4776 + }, + { + "epoch": 7.594594594594595, + "grad_norm": 1.4968108682548629, + "learning_rate": 4.7569950556517563e-05, + "loss": 0.0973, + "step": 4777 + }, + { + "epoch": 7.596184419713832, + "grad_norm": 4.844031944015179, + "learning_rate": 4.7573750722206046e-05, + "loss": 0.1682, + "step": 4778 + }, + { + "epoch": 7.597774244833069, + "grad_norm": 2.378610057443689, + "learning_rate": 4.757754832445415e-05, + "loss": 0.0925, + "step": 4779 + }, + { + "epoch": 7.599364069952305, + "grad_norm": 2.1966235385502406, + "learning_rate": 4.7581343361976524e-05, + "loss": 0.2128, + "step": 4780 + }, + { + "epoch": 7.600953895071542, + "grad_norm": 2.8210170727194317, + "learning_rate": 4.7585135833488696e-05, + "loss": 0.1792, + "step": 4781 + }, + { + "epoch": 7.602543720190779, + "grad_norm": 4.554974174590489, + "learning_rate": 4.758892573770703e-05, + "loss": 0.1984, + "step": 4782 + }, + { + "epoch": 7.604133545310016, + "grad_norm": 1.903886722329593, + "learning_rate": 4.759271307334881e-05, + "loss": 0.1721, + "step": 4783 + }, + { + "epoch": 7.605723370429253, + "grad_norm": 4.481724633862183, + "learning_rate": 4.759649783913214e-05, + "loss": 0.1396, + "step": 4784 + }, + { + "epoch": 7.60731319554849, + "grad_norm": 2.478999217334688, + "learning_rate": 4.760028003377602e-05, + "loss": 0.1472, + "step": 4785 + }, + { + "epoch": 7.608903020667727, + "grad_norm": 3.007223060046777, + "learning_rate": 4.7604059656000314e-05, + "loss": 0.1397, + "step": 4786 + }, + { + "epoch": 7.610492845786963, + "grad_norm": 8.730791661322813, + "learning_rate": 4.760783670452575e-05, + "loss": 0.5768, + "step": 4787 + }, + { + "epoch": 7.6120826709062, + "grad_norm": 2.7379108947168147, + "learning_rate": 4.7611611178073946e-05, + "loss": 0.1851, + "step": 4788 + }, + { + "epoch": 7.613672496025437, + "grad_norm": 1.9016436329395812, + "learning_rate": 4.7615383075367364e-05, + "loss": 0.1542, + "step": 4789 + }, + { + "epoch": 7.615262321144674, + "grad_norm": 2.7668087263854746, + "learning_rate": 4.761915239512937e-05, + "loss": 0.1782, + "step": 4790 + }, + { + "epoch": 7.616852146263911, + "grad_norm": 1.6599032105887652, + "learning_rate": 4.7622919136084184e-05, + "loss": 0.1182, + "step": 4791 + }, + { + "epoch": 7.618441971383148, + "grad_norm": 27.751020804952294, + "learning_rate": 4.762668329695688e-05, + "loss": 2.2302, + "step": 4792 + }, + { + "epoch": 7.620031796502385, + "grad_norm": 1.8951048054325008, + "learning_rate": 4.763044487647345e-05, + "loss": 0.1233, + "step": 4793 + }, + { + "epoch": 7.621621621621622, + "grad_norm": 3.0287561561644285, + "learning_rate": 4.7634203873360724e-05, + "loss": 0.1684, + "step": 4794 + }, + { + "epoch": 7.623211446740859, + "grad_norm": 1.5345062778426573, + "learning_rate": 4.7637960286346424e-05, + "loss": 0.1393, + "step": 4795 + }, + { + "epoch": 7.624801271860095, + "grad_norm": 2.9106308824383476, + "learning_rate": 4.7641714114159136e-05, + "loss": 0.1832, + "step": 4796 + }, + { + "epoch": 7.626391096979332, + "grad_norm": 1.996694487050703, + "learning_rate": 4.7645465355528325e-05, + "loss": 0.1294, + "step": 4797 + }, + { + "epoch": 7.627980922098569, + "grad_norm": 1.7492567187586974, + "learning_rate": 4.764921400918432e-05, + "loss": 0.1447, + "step": 4798 + }, + { + "epoch": 7.629570747217806, + "grad_norm": 1.1793220969434797, + "learning_rate": 4.7652960073858356e-05, + "loss": 0.1696, + "step": 4799 + }, + { + "epoch": 7.631160572337043, + "grad_norm": 4.529096893733174, + "learning_rate": 4.765670354828252e-05, + "loss": 0.1801, + "step": 4800 + }, + { + "epoch": 7.63275039745628, + "grad_norm": 1.7026156386878346, + "learning_rate": 4.766044443118978e-05, + "loss": 0.1598, + "step": 4801 + }, + { + "epoch": 7.634340222575517, + "grad_norm": 3.5516136873338877, + "learning_rate": 4.766418272131399e-05, + "loss": 0.1969, + "step": 4802 + }, + { + "epoch": 7.635930047694753, + "grad_norm": 2.2079967799734472, + "learning_rate": 4.766791841738986e-05, + "loss": 0.1486, + "step": 4803 + }, + { + "epoch": 7.63751987281399, + "grad_norm": 2.124930609544282, + "learning_rate": 4.7671651518153e-05, + "loss": 0.1772, + "step": 4804 + }, + { + "epoch": 7.639109697933227, + "grad_norm": 2.8473318060050126, + "learning_rate": 4.767538202233989e-05, + "loss": 0.2068, + "step": 4805 + }, + { + "epoch": 7.640699523052464, + "grad_norm": 2.7634170060588494, + "learning_rate": 4.7679109928687886e-05, + "loss": 0.1475, + "step": 4806 + }, + { + "epoch": 7.642289348171701, + "grad_norm": 2.044527922093527, + "learning_rate": 4.768283523593523e-05, + "loss": 0.1115, + "step": 4807 + }, + { + "epoch": 7.643879173290938, + "grad_norm": 3.950386450273311, + "learning_rate": 4.768655794282105e-05, + "loss": 0.1468, + "step": 4808 + }, + { + "epoch": 7.645468998410175, + "grad_norm": 3.2449097015331616, + "learning_rate": 4.769027804808533e-05, + "loss": 0.1711, + "step": 4809 + }, + { + "epoch": 7.647058823529412, + "grad_norm": 2.0167432008104926, + "learning_rate": 4.769399555046895e-05, + "loss": 0.175, + "step": 4810 + }, + { + "epoch": 7.648648648648649, + "grad_norm": 2.6208862533654074, + "learning_rate": 4.769771044871368e-05, + "loss": 0.158, + "step": 4811 + }, + { + "epoch": 7.650238473767885, + "grad_norm": 2.3912918717986527, + "learning_rate": 4.770142274156215e-05, + "loss": 0.1708, + "step": 4812 + }, + { + "epoch": 7.651828298887122, + "grad_norm": 1.5514775690177256, + "learning_rate": 4.77051324277579e-05, + "loss": 0.1384, + "step": 4813 + }, + { + "epoch": 7.653418124006359, + "grad_norm": 2.814862550330019, + "learning_rate": 4.770883950604531e-05, + "loss": 0.134, + "step": 4814 + }, + { + "epoch": 7.655007949125596, + "grad_norm": 43.67830904971946, + "learning_rate": 4.771254397516969e-05, + "loss": 5.1057, + "step": 4815 + }, + { + "epoch": 7.656597774244833, + "grad_norm": 2.8814425819872307, + "learning_rate": 4.77162458338772e-05, + "loss": 0.1899, + "step": 4816 + }, + { + "epoch": 7.65818759936407, + "grad_norm": 3.366461473686901, + "learning_rate": 4.77199450809149e-05, + "loss": 0.1844, + "step": 4817 + }, + { + "epoch": 7.659777424483307, + "grad_norm": 4.924699834617854, + "learning_rate": 4.7723641715030733e-05, + "loss": 0.1821, + "step": 4818 + }, + { + "epoch": 7.661367249602543, + "grad_norm": 2.349006993854446, + "learning_rate": 4.772733573497352e-05, + "loss": 0.1419, + "step": 4819 + }, + { + "epoch": 7.66295707472178, + "grad_norm": 86.34540492585505, + "learning_rate": 4.773102713949295e-05, + "loss": 13.2542, + "step": 4820 + }, + { + "epoch": 7.664546899841017, + "grad_norm": 4.178754122373659, + "learning_rate": 4.7734715927339636e-05, + "loss": 0.1568, + "step": 4821 + }, + { + "epoch": 7.666136724960254, + "grad_norm": 1.8076730756080879, + "learning_rate": 4.773840209726507e-05, + "loss": 0.1511, + "step": 4822 + }, + { + "epoch": 7.667726550079491, + "grad_norm": 6.010955414298864, + "learning_rate": 4.774208564802158e-05, + "loss": 0.1622, + "step": 4823 + }, + { + "epoch": 7.669316375198728, + "grad_norm": 2.5799361163688626, + "learning_rate": 4.7745766578362445e-05, + "loss": 0.152, + "step": 4824 + }, + { + "epoch": 7.670906200317965, + "grad_norm": 3.2691986827547828, + "learning_rate": 4.77494448870418e-05, + "loss": 0.1753, + "step": 4825 + }, + { + "epoch": 7.672496025437201, + "grad_norm": 4.815672266979673, + "learning_rate": 4.775312057281467e-05, + "loss": 0.1878, + "step": 4826 + }, + { + "epoch": 7.674085850556438, + "grad_norm": 2.9888513525560874, + "learning_rate": 4.7756793634436945e-05, + "loss": 0.1837, + "step": 4827 + }, + { + "epoch": 7.675675675675675, + "grad_norm": 3.0308953550536937, + "learning_rate": 4.7760464070665465e-05, + "loss": 0.1197, + "step": 4828 + }, + { + "epoch": 7.677265500794912, + "grad_norm": 8.587909077285017, + "learning_rate": 4.776413188025789e-05, + "loss": 28.0462, + "step": 4829 + }, + { + "epoch": 7.678855325914149, + "grad_norm": 1.941815489594796, + "learning_rate": 4.776779706197282e-05, + "loss": 0.1404, + "step": 4830 + }, + { + "epoch": 7.680445151033386, + "grad_norm": 8.638761538916617, + "learning_rate": 4.777145961456971e-05, + "loss": 0.1992, + "step": 4831 + }, + { + "epoch": 7.682034976152623, + "grad_norm": 2.1899099388093406, + "learning_rate": 4.777511953680893e-05, + "loss": 0.1636, + "step": 4832 + }, + { + "epoch": 7.68362480127186, + "grad_norm": 3.2966946054890993, + "learning_rate": 4.777877682745171e-05, + "loss": 0.1725, + "step": 4833 + }, + { + "epoch": 7.685214626391097, + "grad_norm": 6.20409913863841, + "learning_rate": 4.778243148526021e-05, + "loss": 0.2071, + "step": 4834 + }, + { + "epoch": 7.6868044515103335, + "grad_norm": 5.307366886487229, + "learning_rate": 4.778608350899745e-05, + "loss": 0.1797, + "step": 4835 + }, + { + "epoch": 7.6883942766295705, + "grad_norm": 2.2138608774455952, + "learning_rate": 4.778973289742736e-05, + "loss": 0.2032, + "step": 4836 + }, + { + "epoch": 7.6899841017488075, + "grad_norm": 6.247543299588143, + "learning_rate": 4.779337964931475e-05, + "loss": 0.2135, + "step": 4837 + }, + { + "epoch": 7.6915739268680445, + "grad_norm": 5.703974704592912, + "learning_rate": 4.779702376342531e-05, + "loss": 0.152, + "step": 4838 + }, + { + "epoch": 7.6931637519872815, + "grad_norm": 1.7663526446162068, + "learning_rate": 4.7800665238525666e-05, + "loss": 0.1344, + "step": 4839 + }, + { + "epoch": 7.6947535771065185, + "grad_norm": 7.217796914777775, + "learning_rate": 4.78043040733833e-05, + "loss": 0.188, + "step": 4840 + }, + { + "epoch": 7.6963434022257555, + "grad_norm": 3.2411011228412674, + "learning_rate": 4.780794026676659e-05, + "loss": 0.1779, + "step": 4841 + }, + { + "epoch": 7.697933227344992, + "grad_norm": 3.705838019085251, + "learning_rate": 4.7811573817444834e-05, + "loss": 0.1848, + "step": 4842 + }, + { + "epoch": 7.699523052464229, + "grad_norm": 4.244345211545069, + "learning_rate": 4.781520472418819e-05, + "loss": 0.1451, + "step": 4843 + }, + { + "epoch": 7.701112877583466, + "grad_norm": 2.494870191204854, + "learning_rate": 4.781883298576773e-05, + "loss": 0.1693, + "step": 4844 + }, + { + "epoch": 7.702702702702703, + "grad_norm": 3.7886826839151198, + "learning_rate": 4.7822458600955426e-05, + "loss": 0.1389, + "step": 4845 + }, + { + "epoch": 7.70429252782194, + "grad_norm": 3.3716713277912156, + "learning_rate": 4.7826081568524144e-05, + "loss": 0.175, + "step": 4846 + }, + { + "epoch": 7.705882352941177, + "grad_norm": 4.6290037961701875, + "learning_rate": 4.782970188724762e-05, + "loss": 0.1545, + "step": 4847 + }, + { + "epoch": 7.707472178060414, + "grad_norm": 5.008468793844193, + "learning_rate": 4.783331955590052e-05, + "loss": 0.1613, + "step": 4848 + }, + { + "epoch": 7.709062003179651, + "grad_norm": 3.2710190445831806, + "learning_rate": 4.7836934573258396e-05, + "loss": 0.1095, + "step": 4849 + }, + { + "epoch": 7.710651828298887, + "grad_norm": 4.187683332824225, + "learning_rate": 4.784054693809769e-05, + "loss": 0.1716, + "step": 4850 + }, + { + "epoch": 7.712241653418124, + "grad_norm": 3.9547069636363887, + "learning_rate": 4.784415664919576e-05, + "loss": 0.1689, + "step": 4851 + }, + { + "epoch": 7.713831478537361, + "grad_norm": 3.024567056434687, + "learning_rate": 4.784776370533084e-05, + "loss": 0.2025, + "step": 4852 + }, + { + "epoch": 7.715421303656598, + "grad_norm": 4.092607250776077, + "learning_rate": 4.7851368105282055e-05, + "loss": 0.1782, + "step": 4853 + }, + { + "epoch": 7.717011128775835, + "grad_norm": 4.795757882785464, + "learning_rate": 4.7854969847829474e-05, + "loss": 0.1679, + "step": 4854 + }, + { + "epoch": 7.718600953895072, + "grad_norm": 4.144649297081837, + "learning_rate": 4.785856893175402e-05, + "loss": 0.1671, + "step": 4855 + }, + { + "epoch": 7.720190779014309, + "grad_norm": 6.301477941112075, + "learning_rate": 4.786216535583754e-05, + "loss": 0.1434, + "step": 4856 + }, + { + "epoch": 7.721780604133546, + "grad_norm": 2.7178251560997504, + "learning_rate": 4.7865759118862785e-05, + "loss": 0.1548, + "step": 4857 + }, + { + "epoch": 7.723370429252782, + "grad_norm": 2.95976233048554, + "learning_rate": 4.786935021961337e-05, + "loss": 0.1649, + "step": 4858 + }, + { + "epoch": 7.724960254372019, + "grad_norm": 7.604426793717746, + "learning_rate": 4.7872938656873865e-05, + "loss": 0.2387, + "step": 4859 + }, + { + "epoch": 7.726550079491256, + "grad_norm": 2.174276802192697, + "learning_rate": 4.78765244294297e-05, + "loss": 0.1113, + "step": 4860 + }, + { + "epoch": 7.728139904610493, + "grad_norm": 3.8805292812049044, + "learning_rate": 4.788010753606722e-05, + "loss": 0.1593, + "step": 4861 + }, + { + "epoch": 7.72972972972973, + "grad_norm": 3.555134815355614, + "learning_rate": 4.788368797557368e-05, + "loss": 0.1184, + "step": 4862 + }, + { + "epoch": 7.731319554848967, + "grad_norm": 7.951382167906444, + "learning_rate": 4.788726574673723e-05, + "loss": 0.1805, + "step": 4863 + }, + { + "epoch": 7.732909379968204, + "grad_norm": 2.5168882791677594, + "learning_rate": 4.789084084834691e-05, + "loss": 0.1687, + "step": 4864 + }, + { + "epoch": 7.73449920508744, + "grad_norm": 4.5787408520337705, + "learning_rate": 4.789441327919269e-05, + "loss": 0.1918, + "step": 4865 + }, + { + "epoch": 7.736089030206677, + "grad_norm": 5.901620815009177, + "learning_rate": 4.789798303806544e-05, + "loss": 0.1853, + "step": 4866 + }, + { + "epoch": 7.737678855325914, + "grad_norm": 4.688257340758661, + "learning_rate": 4.790155012375691e-05, + "loss": 0.1701, + "step": 4867 + }, + { + "epoch": 7.739268680445151, + "grad_norm": 5.670597067576088, + "learning_rate": 4.790511453505977e-05, + "loss": 0.1317, + "step": 4868 + }, + { + "epoch": 7.740858505564388, + "grad_norm": 4.622546343518444, + "learning_rate": 4.790867627076761e-05, + "loss": 0.1734, + "step": 4869 + }, + { + "epoch": 7.742448330683625, + "grad_norm": 5.1573734395383815, + "learning_rate": 4.79122353296749e-05, + "loss": 0.1894, + "step": 4870 + }, + { + "epoch": 7.744038155802862, + "grad_norm": 4.2621720310157585, + "learning_rate": 4.791579171057704e-05, + "loss": 0.1793, + "step": 4871 + }, + { + "epoch": 7.745627980922099, + "grad_norm": 4.435375773390268, + "learning_rate": 4.7919345412270306e-05, + "loss": 0.1696, + "step": 4872 + }, + { + "epoch": 7.747217806041336, + "grad_norm": 2.3945174558290327, + "learning_rate": 4.792289643355191e-05, + "loss": 0.1638, + "step": 4873 + }, + { + "epoch": 7.748807631160572, + "grad_norm": 1.9890454351364455, + "learning_rate": 4.792644477321995e-05, + "loss": 0.1568, + "step": 4874 + }, + { + "epoch": 7.750397456279809, + "grad_norm": 3.1940890099101154, + "learning_rate": 4.792999043007347e-05, + "loss": 0.2603, + "step": 4875 + }, + { + "epoch": 7.751987281399046, + "grad_norm": 17.32519917476074, + "learning_rate": 4.793353340291235e-05, + "loss": 2.2677, + "step": 4876 + }, + { + "epoch": 7.753577106518283, + "grad_norm": 4.527711701871871, + "learning_rate": 4.7937073690537456e-05, + "loss": 0.1758, + "step": 4877 + }, + { + "epoch": 7.75516693163752, + "grad_norm": 3.6039058619977555, + "learning_rate": 4.7940611291750514e-05, + "loss": 0.1502, + "step": 4878 + }, + { + "epoch": 7.756756756756757, + "grad_norm": 4.331219215680899, + "learning_rate": 4.7944146205354186e-05, + "loss": 0.1947, + "step": 4879 + }, + { + "epoch": 7.758346581875994, + "grad_norm": 4.982291651447193, + "learning_rate": 4.7947678430152016e-05, + "loss": 0.1933, + "step": 4880 + }, + { + "epoch": 7.75993640699523, + "grad_norm": 4.167488160064972, + "learning_rate": 4.795120796494849e-05, + "loss": 0.1368, + "step": 4881 + }, + { + "epoch": 7.761526232114467, + "grad_norm": 4.055252257055197, + "learning_rate": 4.7954734808548964e-05, + "loss": 0.149, + "step": 4882 + }, + { + "epoch": 7.763116057233704, + "grad_norm": 8.49608409897939, + "learning_rate": 4.7958258959759754e-05, + "loss": 0.3281, + "step": 4883 + }, + { + "epoch": 7.764705882352941, + "grad_norm": 3.4343683080686853, + "learning_rate": 4.7961780417388045e-05, + "loss": 0.1487, + "step": 4884 + }, + { + "epoch": 7.766295707472178, + "grad_norm": 4.919509848077963, + "learning_rate": 4.796529918024197e-05, + "loss": 0.1434, + "step": 4885 + }, + { + "epoch": 7.767885532591415, + "grad_norm": 5.521508602156541, + "learning_rate": 4.796881524713053e-05, + "loss": 0.2172, + "step": 4886 + }, + { + "epoch": 7.769475357710652, + "grad_norm": 3.550189502190708, + "learning_rate": 4.7972328616863695e-05, + "loss": 0.2002, + "step": 4887 + }, + { + "epoch": 7.771065182829888, + "grad_norm": 4.0170862647156635, + "learning_rate": 4.797583928825228e-05, + "loss": 0.214, + "step": 4888 + }, + { + "epoch": 7.772655007949125, + "grad_norm": 4.737471834537013, + "learning_rate": 4.797934726010809e-05, + "loss": 0.1481, + "step": 4889 + }, + { + "epoch": 7.774244833068362, + "grad_norm": 3.642311542246314, + "learning_rate": 4.798285253124377e-05, + "loss": 0.2271, + "step": 4890 + }, + { + "epoch": 7.775834658187599, + "grad_norm": 2.59105675825928, + "learning_rate": 4.798635510047293e-05, + "loss": 0.1685, + "step": 4891 + }, + { + "epoch": 7.777424483306836, + "grad_norm": 2.197181112600223, + "learning_rate": 4.798985496661007e-05, + "loss": 0.135, + "step": 4892 + }, + { + "epoch": 7.779014308426073, + "grad_norm": 3.48471007541512, + "learning_rate": 4.799335212847062e-05, + "loss": 0.1306, + "step": 4893 + }, + { + "epoch": 7.78060413354531, + "grad_norm": 3.056013578425701, + "learning_rate": 4.799684658487091e-05, + "loss": 0.1186, + "step": 4894 + }, + { + "epoch": 7.782193958664547, + "grad_norm": 3.5535282736313762, + "learning_rate": 4.800033833462819e-05, + "loss": 0.1439, + "step": 4895 + }, + { + "epoch": 7.783783783783784, + "grad_norm": 2.675849154433615, + "learning_rate": 4.800382737656064e-05, + "loss": 0.1791, + "step": 4896 + }, + { + "epoch": 7.78537360890302, + "grad_norm": 4.58423152103168, + "learning_rate": 4.800731370948734e-05, + "loss": 0.1792, + "step": 4897 + }, + { + "epoch": 7.786963434022257, + "grad_norm": 2.0729235708226637, + "learning_rate": 4.8010797332228294e-05, + "loss": 0.1527, + "step": 4898 + }, + { + "epoch": 7.788553259141494, + "grad_norm": 5.468472774126725, + "learning_rate": 4.801427824360441e-05, + "loss": 0.1866, + "step": 4899 + }, + { + "epoch": 7.790143084260731, + "grad_norm": 3.772952143291985, + "learning_rate": 4.801775644243754e-05, + "loss": 0.1975, + "step": 4900 + }, + { + "epoch": 7.791732909379968, + "grad_norm": 4.326817190959473, + "learning_rate": 4.802123192755044e-05, + "loss": 0.1809, + "step": 4901 + }, + { + "epoch": 7.793322734499205, + "grad_norm": 6.195245824248576, + "learning_rate": 4.8024704697766774e-05, + "loss": 0.1545, + "step": 4902 + }, + { + "epoch": 7.794912559618442, + "grad_norm": 3.433346259110007, + "learning_rate": 4.802817475191115e-05, + "loss": 0.1766, + "step": 4903 + }, + { + "epoch": 7.796502384737678, + "grad_norm": 5.628266352613258, + "learning_rate": 4.8031642088809064e-05, + "loss": 0.1576, + "step": 4904 + }, + { + "epoch": 7.798092209856915, + "grad_norm": 3.692341727245951, + "learning_rate": 4.803510670728695e-05, + "loss": 0.1631, + "step": 4905 + }, + { + "epoch": 7.799682034976152, + "grad_norm": 6.794840960498299, + "learning_rate": 4.803856860617217e-05, + "loss": 0.2067, + "step": 4906 + }, + { + "epoch": 7.801271860095389, + "grad_norm": 6.397007367718696, + "learning_rate": 4.8042027784293e-05, + "loss": 0.244, + "step": 4907 + }, + { + "epoch": 7.802861685214626, + "grad_norm": 3.8735856557977844, + "learning_rate": 4.804548424047861e-05, + "loss": 0.1543, + "step": 4908 + }, + { + "epoch": 7.804451510333863, + "grad_norm": 4.531061281199375, + "learning_rate": 4.804893797355914e-05, + "loss": 0.1502, + "step": 4909 + }, + { + "epoch": 7.8060413354531, + "grad_norm": 3.7018281863408946, + "learning_rate": 4.805238898236562e-05, + "loss": 0.199, + "step": 4910 + }, + { + "epoch": 7.807631160572337, + "grad_norm": 3.1156121204912917, + "learning_rate": 4.805583726573e-05, + "loss": 0.1495, + "step": 4911 + }, + { + "epoch": 7.809220985691574, + "grad_norm": 3.303473257271987, + "learning_rate": 4.805928282248516e-05, + "loss": 0.2053, + "step": 4912 + }, + { + "epoch": 7.8108108108108105, + "grad_norm": 3.598543308071502, + "learning_rate": 4.806272565146492e-05, + "loss": 0.1852, + "step": 4913 + }, + { + "epoch": 7.8124006359300475, + "grad_norm": 3.5785539744186483, + "learning_rate": 4.8066165751503984e-05, + "loss": 0.1204, + "step": 4914 + }, + { + "epoch": 7.8139904610492845, + "grad_norm": 3.661590418799151, + "learning_rate": 4.806960312143802e-05, + "loss": 0.1953, + "step": 4915 + }, + { + "epoch": 7.8155802861685215, + "grad_norm": 2.957558397410124, + "learning_rate": 4.80730377601036e-05, + "loss": 0.1853, + "step": 4916 + }, + { + "epoch": 7.8171701112877585, + "grad_norm": 2.4502807742983346, + "learning_rate": 4.807646966633822e-05, + "loss": 0.1266, + "step": 4917 + }, + { + "epoch": 7.8187599364069955, + "grad_norm": 4.311878999091106, + "learning_rate": 4.807989883898031e-05, + "loss": 0.1458, + "step": 4918 + }, + { + "epoch": 7.8203497615262325, + "grad_norm": 4.083731894959133, + "learning_rate": 4.808332527686921e-05, + "loss": 0.1952, + "step": 4919 + }, + { + "epoch": 7.821939586645469, + "grad_norm": 3.2306281618224553, + "learning_rate": 4.80867489788452e-05, + "loss": 0.135, + "step": 4920 + }, + { + "epoch": 7.823529411764706, + "grad_norm": 3.226340466006041, + "learning_rate": 4.809016994374947e-05, + "loss": 0.2138, + "step": 4921 + }, + { + "epoch": 7.825119236883943, + "grad_norm": 20.61238321127757, + "learning_rate": 4.809358817042417e-05, + "loss": 2.9132, + "step": 4922 + }, + { + "epoch": 7.82670906200318, + "grad_norm": 3.8745391467908914, + "learning_rate": 4.809700365771234e-05, + "loss": 0.1781, + "step": 4923 + }, + { + "epoch": 7.828298887122417, + "grad_norm": 3.509846113538995, + "learning_rate": 4.810041640445796e-05, + "loss": 0.1543, + "step": 4924 + }, + { + "epoch": 7.829888712241654, + "grad_norm": 2.3764799870692426, + "learning_rate": 4.810382640950595e-05, + "loss": 0.115, + "step": 4925 + }, + { + "epoch": 7.831478537360891, + "grad_norm": 2.652961818009014, + "learning_rate": 4.8107233671702124e-05, + "loss": 0.1746, + "step": 4926 + }, + { + "epoch": 7.833068362480127, + "grad_norm": 2.686173328715501, + "learning_rate": 4.811063818989327e-05, + "loss": 0.1792, + "step": 4927 + }, + { + "epoch": 7.834658187599364, + "grad_norm": 208.27179330863018, + "learning_rate": 4.811403996292707e-05, + "loss": 14.0542, + "step": 4928 + }, + { + "epoch": 7.836248012718601, + "grad_norm": 3.3088274349735487, + "learning_rate": 4.8117438989652145e-05, + "loss": 0.1676, + "step": 4929 + }, + { + "epoch": 7.837837837837838, + "grad_norm": 4.262856600110085, + "learning_rate": 4.812083526891807e-05, + "loss": 0.1949, + "step": 4930 + }, + { + "epoch": 7.839427662957075, + "grad_norm": 2.1374844179255272, + "learning_rate": 4.8124228799575296e-05, + "loss": 0.124, + "step": 4931 + }, + { + "epoch": 7.841017488076312, + "grad_norm": 2.681152960889788, + "learning_rate": 4.812761958047525e-05, + "loss": 0.1275, + "step": 4932 + }, + { + "epoch": 7.842607313195549, + "grad_norm": 12.863216084517088, + "learning_rate": 4.8131007610470275e-05, + "loss": 0.7009, + "step": 4933 + }, + { + "epoch": 7.844197138314786, + "grad_norm": 3.7485051764350734, + "learning_rate": 4.8134392888413654e-05, + "loss": 0.1792, + "step": 4934 + }, + { + "epoch": 7.845786963434023, + "grad_norm": 3.026990433203727, + "learning_rate": 4.8137775413159575e-05, + "loss": 0.188, + "step": 4935 + }, + { + "epoch": 7.847376788553259, + "grad_norm": 2.1007247296818132, + "learning_rate": 4.8141155183563196e-05, + "loss": 0.1603, + "step": 4936 + }, + { + "epoch": 7.848966613672496, + "grad_norm": 1.999488471969087, + "learning_rate": 4.8144532198480576e-05, + "loss": 0.1537, + "step": 4937 + }, + { + "epoch": 7.850556438791733, + "grad_norm": 3.128559512297237, + "learning_rate": 4.814790645676871e-05, + "loss": 0.1304, + "step": 4938 + }, + { + "epoch": 7.85214626391097, + "grad_norm": 2.0926042102396143, + "learning_rate": 4.8151277957285545e-05, + "loss": 0.183, + "step": 4939 + }, + { + "epoch": 7.853736089030207, + "grad_norm": 1.6195699862714954, + "learning_rate": 4.815464669888995e-05, + "loss": 0.1365, + "step": 4940 + }, + { + "epoch": 7.855325914149444, + "grad_norm": 2.809229748432529, + "learning_rate": 4.815801268044172e-05, + "loss": 0.1787, + "step": 4941 + }, + { + "epoch": 7.856915739268681, + "grad_norm": 2.0247053277342633, + "learning_rate": 4.8161375900801604e-05, + "loss": 0.1713, + "step": 4942 + }, + { + "epoch": 7.858505564387917, + "grad_norm": 2.2090789953658363, + "learning_rate": 4.8164736358831266e-05, + "loss": 0.1527, + "step": 4943 + }, + { + "epoch": 7.860095389507154, + "grad_norm": 1.9611068478568885, + "learning_rate": 4.816809405339331e-05, + "loss": 0.1993, + "step": 4944 + }, + { + "epoch": 7.861685214626391, + "grad_norm": 1.1890423270793156, + "learning_rate": 4.817144898335129e-05, + "loss": 0.1809, + "step": 4945 + }, + { + "epoch": 7.863275039745628, + "grad_norm": 3.5338773883974666, + "learning_rate": 4.817480114756967e-05, + "loss": 0.2079, + "step": 4946 + }, + { + "epoch": 7.864864864864865, + "grad_norm": 1.6960537379975122, + "learning_rate": 4.817815054491387e-05, + "loss": 0.1478, + "step": 4947 + }, + { + "epoch": 7.866454689984102, + "grad_norm": 2.9087269689041975, + "learning_rate": 4.818149717425024e-05, + "loss": 0.163, + "step": 4948 + }, + { + "epoch": 7.868044515103339, + "grad_norm": 1.6404688098672655, + "learning_rate": 4.8184841034446064e-05, + "loss": 0.1332, + "step": 4949 + }, + { + "epoch": 7.869634340222575, + "grad_norm": 2.553962142007059, + "learning_rate": 4.818818212436957e-05, + "loss": 0.2385, + "step": 4950 + }, + { + "epoch": 7.871224165341812, + "grad_norm": 3.0910186405873987, + "learning_rate": 4.819152044288992e-05, + "loss": 0.1582, + "step": 4951 + }, + { + "epoch": 7.872813990461049, + "grad_norm": 36.03967410521832, + "learning_rate": 4.819485598887722e-05, + "loss": 0.9207, + "step": 4952 + }, + { + "epoch": 7.874403815580286, + "grad_norm": 2.0236553660488026, + "learning_rate": 4.819818876120249e-05, + "loss": 0.1926, + "step": 4953 + }, + { + "epoch": 7.875993640699523, + "grad_norm": 4.184349843779772, + "learning_rate": 4.820151875873772e-05, + "loss": 0.222, + "step": 4954 + }, + { + "epoch": 7.87758346581876, + "grad_norm": 2.210387507969358, + "learning_rate": 4.820484598035584e-05, + "loss": 0.1751, + "step": 4955 + }, + { + "epoch": 7.879173290937997, + "grad_norm": 6297.770638160034, + "learning_rate": 4.8208170424930675e-05, + "loss": 10.4534, + "step": 4956 + }, + { + "epoch": 7.880763116057234, + "grad_norm": 3.8524524340204884, + "learning_rate": 4.821149209133705e-05, + "loss": 0.1446, + "step": 4957 + }, + { + "epoch": 7.882352941176471, + "grad_norm": 3.82605314730378, + "learning_rate": 4.821481097845068e-05, + "loss": 0.1254, + "step": 4958 + }, + { + "epoch": 7.883942766295707, + "grad_norm": 2.2741560276795423, + "learning_rate": 4.821812708514824e-05, + "loss": 0.168, + "step": 4959 + }, + { + "epoch": 7.885532591414944, + "grad_norm": 120.86045405974924, + "learning_rate": 4.8221440410307376e-05, + "loss": 1.3924, + "step": 4960 + }, + { + "epoch": 7.887122416534181, + "grad_norm": 3.909459770430315, + "learning_rate": 4.822475095280662e-05, + "loss": 0.1521, + "step": 4961 + }, + { + "epoch": 7.888712241653418, + "grad_norm": 5.3537000396522645, + "learning_rate": 4.8228058711525496e-05, + "loss": 0.2861, + "step": 4962 + }, + { + "epoch": 7.890302066772655, + "grad_norm": 3.868391643518189, + "learning_rate": 4.8231363685344426e-05, + "loss": 0.227, + "step": 4963 + }, + { + "epoch": 7.891891891891892, + "grad_norm": 4.908552289778511, + "learning_rate": 4.82346658731448e-05, + "loss": 0.1837, + "step": 4964 + }, + { + "epoch": 7.893481717011129, + "grad_norm": 3.925555464974056, + "learning_rate": 4.8237965273808956e-05, + "loss": 1.4195, + "step": 4965 + }, + { + "epoch": 7.895071542130365, + "grad_norm": 3.755552242890727, + "learning_rate": 4.824126188622016e-05, + "loss": 0.2346, + "step": 4966 + }, + { + "epoch": 7.896661367249602, + "grad_norm": 5.485537256042443, + "learning_rate": 4.824455570926263e-05, + "loss": 0.2818, + "step": 4967 + }, + { + "epoch": 7.898251192368839, + "grad_norm": 3.5866565661282523, + "learning_rate": 4.824784674182152e-05, + "loss": 0.1539, + "step": 4968 + }, + { + "epoch": 7.899841017488076, + "grad_norm": 2.8630403838349814, + "learning_rate": 4.8251134982782956e-05, + "loss": 0.2061, + "step": 4969 + }, + { + "epoch": 7.901430842607313, + "grad_norm": 2.8642125361057658, + "learning_rate": 4.8254420431033964e-05, + "loss": 0.1734, + "step": 4970 + }, + { + "epoch": 7.90302066772655, + "grad_norm": 1.9940617647341292, + "learning_rate": 4.8257703085462547e-05, + "loss": 0.1343, + "step": 4971 + }, + { + "epoch": 7.904610492845787, + "grad_norm": 2.8690220318202786, + "learning_rate": 4.826098294495764e-05, + "loss": 0.2251, + "step": 4972 + }, + { + "epoch": 7.906200317965024, + "grad_norm": 2.4856491389265605, + "learning_rate": 4.8264260008409135e-05, + "loss": 0.1804, + "step": 4973 + }, + { + "epoch": 7.907790143084261, + "grad_norm": 2.7832817978862643, + "learning_rate": 4.8267534274707873e-05, + "loss": 0.1704, + "step": 4974 + }, + { + "epoch": 7.909379968203497, + "grad_norm": 2.485709787805316, + "learning_rate": 4.8270805742745626e-05, + "loss": 0.1077, + "step": 4975 + }, + { + "epoch": 7.910969793322734, + "grad_norm": 2.041579058703087, + "learning_rate": 4.827407441141511e-05, + "loss": 0.1658, + "step": 4976 + }, + { + "epoch": 7.912559618441971, + "grad_norm": 3.5393131999962026, + "learning_rate": 4.827734027961001e-05, + "loss": 0.1621, + "step": 4977 + }, + { + "epoch": 7.914149443561208, + "grad_norm": 2.4866715331014344, + "learning_rate": 4.828060334622495e-05, + "loss": 0.3023, + "step": 4978 + }, + { + "epoch": 7.915739268680445, + "grad_norm": 1.8636972870552049, + "learning_rate": 4.828386361015549e-05, + "loss": 0.1323, + "step": 4979 + }, + { + "epoch": 7.917329093799682, + "grad_norm": 1.911153724512037, + "learning_rate": 4.828712107029816e-05, + "loss": 0.1595, + "step": 4980 + }, + { + "epoch": 7.918918918918919, + "grad_norm": 1.9949316022347554, + "learning_rate": 4.829037572555042e-05, + "loss": 0.182, + "step": 4981 + }, + { + "epoch": 7.920508744038155, + "grad_norm": 3.007769842892119, + "learning_rate": 4.829362757481069e-05, + "loss": 0.2447, + "step": 4982 + }, + { + "epoch": 7.922098569157392, + "grad_norm": 2.267652205254621, + "learning_rate": 4.829687661697834e-05, + "loss": 0.1728, + "step": 4983 + }, + { + "epoch": 7.923688394276629, + "grad_norm": 3.0160224837107217, + "learning_rate": 4.8300122850953675e-05, + "loss": 0.1868, + "step": 4984 + }, + { + "epoch": 7.925278219395866, + "grad_norm": 3.974766516167264, + "learning_rate": 4.8303366275637975e-05, + "loss": 0.1843, + "step": 4985 + }, + { + "epoch": 7.926868044515103, + "grad_norm": 3.510510741236876, + "learning_rate": 4.830660688993346e-05, + "loss": 0.1805, + "step": 4986 + }, + { + "epoch": 7.92845786963434, + "grad_norm": 3.2329235420290963, + "learning_rate": 4.8309844692743284e-05, + "loss": 0.1473, + "step": 4987 + }, + { + "epoch": 7.930047694753577, + "grad_norm": 1.8636818152672503, + "learning_rate": 4.8313079682971575e-05, + "loss": 0.2082, + "step": 4988 + }, + { + "epoch": 7.9316375198728135, + "grad_norm": 4.150152789953504, + "learning_rate": 4.8316311859523424e-05, + "loss": 0.2378, + "step": 4989 + }, + { + "epoch": 7.9332273449920505, + "grad_norm": 5.461450180775667, + "learning_rate": 4.831954122130483e-05, + "loss": 0.3557, + "step": 4990 + }, + { + "epoch": 7.9348171701112875, + "grad_norm": 3.414994117779668, + "learning_rate": 4.832276776722278e-05, + "loss": 0.1343, + "step": 4991 + }, + { + "epoch": 7.9364069952305245, + "grad_norm": 3.1187476501957017, + "learning_rate": 4.832599149618521e-05, + "loss": 0.165, + "step": 4992 + }, + { + "epoch": 7.9379968203497615, + "grad_norm": 3.520146182749627, + "learning_rate": 4.832921240710099e-05, + "loss": 0.1717, + "step": 4993 + }, + { + "epoch": 7.9395866454689985, + "grad_norm": 2.165721718800238, + "learning_rate": 4.8332430498879984e-05, + "loss": 0.1634, + "step": 4994 + }, + { + "epoch": 7.9411764705882355, + "grad_norm": 2.972952135351359, + "learning_rate": 4.833564577043297e-05, + "loss": 0.121, + "step": 4995 + }, + { + "epoch": 7.9427662957074725, + "grad_norm": 7.257678551190708, + "learning_rate": 4.8338858220671684e-05, + "loss": 0.3683, + "step": 4996 + }, + { + "epoch": 7.9443561208267095, + "grad_norm": 2.633176318003089, + "learning_rate": 4.834206784850885e-05, + "loss": 0.1805, + "step": 4997 + }, + { + "epoch": 7.945945945945946, + "grad_norm": 2.6103477711132403, + "learning_rate": 4.83452746528581e-05, + "loss": 0.2239, + "step": 4998 + }, + { + "epoch": 7.947535771065183, + "grad_norm": 3.9649821498683635, + "learning_rate": 4.834847863263407e-05, + "loss": 0.1448, + "step": 4999 + }, + { + "epoch": 7.94912559618442, + "grad_norm": 2.0525047929742555, + "learning_rate": 4.835167978675231e-05, + "loss": 0.1375, + "step": 5000 + }, + { + "epoch": 7.950715421303657, + "grad_norm": 5.677274030606974, + "learning_rate": 4.835487811412937e-05, + "loss": 0.1743, + "step": 5001 + }, + { + "epoch": 7.952305246422894, + "grad_norm": 2.8843080693596956, + "learning_rate": 4.83580736136827e-05, + "loss": 0.223, + "step": 5002 + }, + { + "epoch": 7.953895071542131, + "grad_norm": 3.515493977270323, + "learning_rate": 4.836126628433077e-05, + "loss": 0.1867, + "step": 5003 + }, + { + "epoch": 7.955484896661368, + "grad_norm": 4.489343337604896, + "learning_rate": 4.836445612499296e-05, + "loss": 0.1714, + "step": 5004 + }, + { + "epoch": 7.957074721780604, + "grad_norm": 3.331081329995802, + "learning_rate": 4.8367643134589624e-05, + "loss": 0.186, + "step": 5005 + }, + { + "epoch": 7.958664546899841, + "grad_norm": 4.881074649041259, + "learning_rate": 4.837082731204207e-05, + "loss": 0.1529, + "step": 5006 + }, + { + "epoch": 7.960254372019078, + "grad_norm": 3.51735837917988, + "learning_rate": 4.8374008656272586e-05, + "loss": 0.1149, + "step": 5007 + }, + { + "epoch": 7.961844197138315, + "grad_norm": 72.72641604663492, + "learning_rate": 4.837718716620439e-05, + "loss": 2.4093, + "step": 5008 + }, + { + "epoch": 7.963434022257552, + "grad_norm": 3.656394431152972, + "learning_rate": 4.8380362840761676e-05, + "loss": 0.1938, + "step": 5009 + }, + { + "epoch": 7.965023847376789, + "grad_norm": 4.53340132389491, + "learning_rate": 4.838353567886959e-05, + "loss": 0.1221, + "step": 5010 + }, + { + "epoch": 7.966613672496026, + "grad_norm": 4.626590611320177, + "learning_rate": 4.838670567945424e-05, + "loss": 0.1367, + "step": 5011 + }, + { + "epoch": 7.968203497615263, + "grad_norm": 21.287611978067964, + "learning_rate": 4.8389872841442705e-05, + "loss": 1.2258, + "step": 5012 + }, + { + "epoch": 7.9697933227345, + "grad_norm": 4.63752469798867, + "learning_rate": 4.8393037163763e-05, + "loss": 0.181, + "step": 5013 + }, + { + "epoch": 7.971383147853736, + "grad_norm": 2.333990262694119, + "learning_rate": 4.8396198645344134e-05, + "loss": 0.1516, + "step": 5014 + }, + { + "epoch": 7.972972972972973, + "grad_norm": 5.377266394369033, + "learning_rate": 4.8399357285116045e-05, + "loss": 0.1836, + "step": 5015 + }, + { + "epoch": 7.97456279809221, + "grad_norm": 4.711071915768296, + "learning_rate": 4.840251308200966e-05, + "loss": 0.1612, + "step": 5016 + }, + { + "epoch": 7.976152623211447, + "grad_norm": 1.4702010273453119, + "learning_rate": 4.840566603495684e-05, + "loss": 0.1361, + "step": 5017 + }, + { + "epoch": 7.977742448330684, + "grad_norm": 4.3272317999364525, + "learning_rate": 4.840881614289045e-05, + "loss": 0.182, + "step": 5018 + }, + { + "epoch": 7.979332273449921, + "grad_norm": 2.8566255497366932, + "learning_rate": 4.841196340474427e-05, + "loss": 0.175, + "step": 5019 + }, + { + "epoch": 7.980922098569158, + "grad_norm": 2.5530906168616774, + "learning_rate": 4.841510781945306e-05, + "loss": 0.1966, + "step": 5020 + }, + { + "epoch": 7.982511923688394, + "grad_norm": 2.574887823804332, + "learning_rate": 4.841824938595258e-05, + "loss": 0.1303, + "step": 5021 + }, + { + "epoch": 7.984101748807631, + "grad_norm": 2.7344968921442794, + "learning_rate": 4.84213881031795e-05, + "loss": 0.1467, + "step": 5022 + }, + { + "epoch": 7.985691573926868, + "grad_norm": 1.7139668977011346, + "learning_rate": 4.8424523970071476e-05, + "loss": 0.1373, + "step": 5023 + }, + { + "epoch": 7.987281399046105, + "grad_norm": 2.1863213701401603, + "learning_rate": 4.842765698556715e-05, + "loss": 0.1708, + "step": 5024 + }, + { + "epoch": 7.988871224165342, + "grad_norm": 2.9409486271313074, + "learning_rate": 4.8430787148606084e-05, + "loss": 0.1747, + "step": 5025 + }, + { + "epoch": 7.990461049284579, + "grad_norm": 2.7856634254795534, + "learning_rate": 4.843391445812886e-05, + "loss": 0.2679, + "step": 5026 + }, + { + "epoch": 7.992050874403816, + "grad_norm": 2.356756428038399, + "learning_rate": 4.8437038913076975e-05, + "loss": 0.1242, + "step": 5027 + }, + { + "epoch": 7.993640699523052, + "grad_norm": 1.6939474389965583, + "learning_rate": 4.844016051239292e-05, + "loss": 0.123, + "step": 5028 + }, + { + "epoch": 7.995230524642289, + "grad_norm": 1.1415276109176946, + "learning_rate": 4.8443279255020146e-05, + "loss": 0.1174, + "step": 5029 + }, + { + "epoch": 7.996820349761526, + "grad_norm": 2.7278707722831483, + "learning_rate": 4.844639513990309e-05, + "loss": 0.1782, + "step": 5030 + }, + { + "epoch": 7.998410174880763, + "grad_norm": 1.8911915656045215, + "learning_rate": 4.84495081659871e-05, + "loss": 0.1361, + "step": 5031 + }, + { + "epoch": 8.0, + "grad_norm": 0.925024960554119, + "learning_rate": 4.845261833221856e-05, + "loss": 0.1079, + "step": 5032 + }, + { + "epoch": 8.001589825119236, + "grad_norm": 1.989107764401011, + "learning_rate": 4.8455725637544785e-05, + "loss": 0.1675, + "step": 5033 + }, + { + "epoch": 8.003179650238474, + "grad_norm": 2.10204168035289, + "learning_rate": 4.8458830080914055e-05, + "loss": 0.1427, + "step": 5034 + }, + { + "epoch": 8.00476947535771, + "grad_norm": 2.4918120409684743, + "learning_rate": 4.846193166127564e-05, + "loss": 0.104, + "step": 5035 + }, + { + "epoch": 8.006359300476948, + "grad_norm": 2.5552538175072783, + "learning_rate": 4.846503037757976e-05, + "loss": 0.2282, + "step": 5036 + }, + { + "epoch": 8.007949125596184, + "grad_norm": 17.708679381229274, + "learning_rate": 4.846812622877762e-05, + "loss": 0.6405, + "step": 5037 + }, + { + "epoch": 8.009538950715422, + "grad_norm": 29.95816071195867, + "learning_rate": 4.8471219213821375e-05, + "loss": 0.9014, + "step": 5038 + }, + { + "epoch": 8.011128775834658, + "grad_norm": 4.010814833868078, + "learning_rate": 4.847430933166417e-05, + "loss": 0.1559, + "step": 5039 + }, + { + "epoch": 8.012718600953894, + "grad_norm": 1.3113955660256025, + "learning_rate": 4.8477396581260104e-05, + "loss": 0.159, + "step": 5040 + }, + { + "epoch": 8.014308426073132, + "grad_norm": 3.3830668105294075, + "learning_rate": 4.848048096156426e-05, + "loss": 0.2008, + "step": 5041 + }, + { + "epoch": 8.015898251192368, + "grad_norm": 1.6304903489434344, + "learning_rate": 4.848356247153269e-05, + "loss": 0.0997, + "step": 5042 + }, + { + "epoch": 8.017488076311606, + "grad_norm": 2.053729717927404, + "learning_rate": 4.848664111012241e-05, + "loss": 0.1181, + "step": 5043 + }, + { + "epoch": 8.019077901430842, + "grad_norm": 2.339705707559315, + "learning_rate": 4.848971687629142e-05, + "loss": 0.2172, + "step": 5044 + }, + { + "epoch": 8.02066772655008, + "grad_norm": 1.9049209585492022, + "learning_rate": 4.849278976899867e-05, + "loss": 0.1698, + "step": 5045 + }, + { + "epoch": 8.022257551669316, + "grad_norm": 1.7479877192775553, + "learning_rate": 4.849585978720411e-05, + "loss": 0.1744, + "step": 5046 + }, + { + "epoch": 8.023847376788554, + "grad_norm": 2.601232710297744, + "learning_rate": 4.8498926929868645e-05, + "loss": 0.1153, + "step": 5047 + }, + { + "epoch": 8.02543720190779, + "grad_norm": 1.6748542772064732, + "learning_rate": 4.850199119595415e-05, + "loss": 0.1239, + "step": 5048 + }, + { + "epoch": 8.027027027027026, + "grad_norm": 1.9852019692223268, + "learning_rate": 4.850505258442351e-05, + "loss": 0.2041, + "step": 5049 + }, + { + "epoch": 8.028616852146264, + "grad_norm": 1.5200058012211717, + "learning_rate": 4.8508111094240514e-05, + "loss": 0.1664, + "step": 5050 + }, + { + "epoch": 8.0302066772655, + "grad_norm": 7.8813923493468945, + "learning_rate": 4.851116672437e-05, + "loss": 1.1759, + "step": 5051 + }, + { + "epoch": 8.031796502384738, + "grad_norm": 2.149300783597332, + "learning_rate": 4.851421947377773e-05, + "loss": 0.1144, + "step": 5052 + }, + { + "epoch": 8.033386327503974, + "grad_norm": 2.631064849511695, + "learning_rate": 4.8517269341430476e-05, + "loss": 0.1398, + "step": 5053 + }, + { + "epoch": 8.034976152623212, + "grad_norm": 2.0362179453743634, + "learning_rate": 4.852031632629596e-05, + "loss": 0.2771, + "step": 5054 + }, + { + "epoch": 8.036565977742448, + "grad_norm": 2.659735742624196, + "learning_rate": 4.8523360427342875e-05, + "loss": 0.1298, + "step": 5055 + }, + { + "epoch": 8.038155802861684, + "grad_norm": 3.3699110046016423, + "learning_rate": 4.8526401643540925e-05, + "loss": 0.1455, + "step": 5056 + }, + { + "epoch": 8.039745627980922, + "grad_norm": 2.4363078973284846, + "learning_rate": 4.852943997386075e-05, + "loss": 0.1363, + "step": 5057 + }, + { + "epoch": 8.041335453100158, + "grad_norm": 2.357255205382854, + "learning_rate": 4.8532475417274e-05, + "loss": 0.1403, + "step": 5058 + }, + { + "epoch": 8.042925278219396, + "grad_norm": 3.5152844141740114, + "learning_rate": 4.853550797275328e-05, + "loss": 0.1566, + "step": 5059 + }, + { + "epoch": 8.044515103338632, + "grad_norm": 2.920398335448487, + "learning_rate": 4.8538537639272175e-05, + "loss": 0.1136, + "step": 5060 + }, + { + "epoch": 8.04610492845787, + "grad_norm": 2.4211379317054504, + "learning_rate": 4.854156441580526e-05, + "loss": 0.2112, + "step": 5061 + }, + { + "epoch": 8.047694753577106, + "grad_norm": 1.9405182915941486, + "learning_rate": 4.854458830132808e-05, + "loss": 0.1861, + "step": 5062 + }, + { + "epoch": 8.049284578696344, + "grad_norm": 2.484882014963822, + "learning_rate": 4.854760929481715e-05, + "loss": 0.175, + "step": 5063 + }, + { + "epoch": 8.05087440381558, + "grad_norm": 2.4016260627204793, + "learning_rate": 4.855062739524999e-05, + "loss": 0.154, + "step": 5064 + }, + { + "epoch": 8.052464228934817, + "grad_norm": 1.6714071169990865, + "learning_rate": 4.855364260160507e-05, + "loss": 0.1855, + "step": 5065 + }, + { + "epoch": 8.054054054054054, + "grad_norm": 3.3494337759288264, + "learning_rate": 4.855665491286185e-05, + "loss": 0.1848, + "step": 5066 + }, + { + "epoch": 8.05564387917329, + "grad_norm": 3.2428780862555735, + "learning_rate": 4.8559664328000787e-05, + "loss": 0.1534, + "step": 5067 + }, + { + "epoch": 8.057233704292528, + "grad_norm": 3.2648436834465295, + "learning_rate": 4.856267084600328e-05, + "loss": 0.1613, + "step": 5068 + }, + { + "epoch": 8.058823529411764, + "grad_norm": 1.399458306156896, + "learning_rate": 4.856567446585176e-05, + "loss": 0.1258, + "step": 5069 + }, + { + "epoch": 8.060413354531002, + "grad_norm": 1.8608149007419768, + "learning_rate": 4.8568675186529584e-05, + "loss": 0.1123, + "step": 5070 + }, + { + "epoch": 8.062003179650238, + "grad_norm": 5.061552608209933, + "learning_rate": 4.8571673007021125e-05, + "loss": 0.1536, + "step": 5071 + }, + { + "epoch": 8.063593004769475, + "grad_norm": 1.6434893161404394, + "learning_rate": 4.857466792631173e-05, + "loss": 0.1755, + "step": 5072 + }, + { + "epoch": 8.065182829888712, + "grad_norm": 2.256192376170943, + "learning_rate": 4.857765994338774e-05, + "loss": 0.2697, + "step": 5073 + }, + { + "epoch": 8.066772655007949, + "grad_norm": 3.682355376107431, + "learning_rate": 4.858064905723645e-05, + "loss": 0.1762, + "step": 5074 + }, + { + "epoch": 8.068362480127186, + "grad_norm": 3.487010063237753, + "learning_rate": 4.858363526684615e-05, + "loss": 0.1418, + "step": 5075 + }, + { + "epoch": 8.069952305246423, + "grad_norm": 2.2733395496727504, + "learning_rate": 4.858661857120613e-05, + "loss": 0.1462, + "step": 5076 + }, + { + "epoch": 8.07154213036566, + "grad_norm": 1.5253731033270141, + "learning_rate": 4.858959896930665e-05, + "loss": 0.1606, + "step": 5077 + }, + { + "epoch": 8.073131955484897, + "grad_norm": 2.573573349019112, + "learning_rate": 4.859257646013893e-05, + "loss": 0.1668, + "step": 5078 + }, + { + "epoch": 8.074721780604133, + "grad_norm": 2.3639744649447834, + "learning_rate": 4.859555104269522e-05, + "loss": 0.1701, + "step": 5079 + }, + { + "epoch": 8.07631160572337, + "grad_norm": 3.46895466841614, + "learning_rate": 4.859852271596873e-05, + "loss": 0.179, + "step": 5080 + }, + { + "epoch": 8.077901430842607, + "grad_norm": 1.7324412639155373, + "learning_rate": 4.860149147895365e-05, + "loss": 0.1831, + "step": 5081 + }, + { + "epoch": 8.079491255961845, + "grad_norm": 4.054462334402116, + "learning_rate": 4.8604457330645173e-05, + "loss": 0.172, + "step": 5082 + }, + { + "epoch": 8.08108108108108, + "grad_norm": 1.8345069868168538, + "learning_rate": 4.860742027003944e-05, + "loss": 0.1516, + "step": 5083 + }, + { + "epoch": 8.082670906200319, + "grad_norm": 2.9366653892170955, + "learning_rate": 4.861038029613362e-05, + "loss": 0.1255, + "step": 5084 + }, + { + "epoch": 8.084260731319555, + "grad_norm": 2.1227697436929787, + "learning_rate": 4.8613337407925855e-05, + "loss": 0.1981, + "step": 5085 + }, + { + "epoch": 8.085850556438793, + "grad_norm": 1.6663289171036149, + "learning_rate": 4.861629160441527e-05, + "loss": 0.1369, + "step": 5086 + }, + { + "epoch": 8.087440381558029, + "grad_norm": 1.7742186171482957, + "learning_rate": 4.8619242884601954e-05, + "loss": 0.1729, + "step": 5087 + }, + { + "epoch": 8.089030206677265, + "grad_norm": 2.505975295467478, + "learning_rate": 4.862219124748703e-05, + "loss": 0.1565, + "step": 5088 + }, + { + "epoch": 8.090620031796503, + "grad_norm": 1.4969390085601693, + "learning_rate": 4.8625136692072574e-05, + "loss": 0.1329, + "step": 5089 + }, + { + "epoch": 8.092209856915739, + "grad_norm": 1.9226946206321054, + "learning_rate": 4.8628079217361663e-05, + "loss": 0.1535, + "step": 5090 + }, + { + "epoch": 8.093799682034977, + "grad_norm": 1.7962061657998751, + "learning_rate": 4.863101882235837e-05, + "loss": 0.1729, + "step": 5091 + }, + { + "epoch": 8.095389507154213, + "grad_norm": 2.714155443758785, + "learning_rate": 4.863395550606772e-05, + "loss": 0.1423, + "step": 5092 + }, + { + "epoch": 8.09697933227345, + "grad_norm": 3.220946056173669, + "learning_rate": 4.863688926749577e-05, + "loss": 0.1578, + "step": 5093 + }, + { + "epoch": 8.098569157392687, + "grad_norm": 75.36177943614244, + "learning_rate": 4.8639820105649535e-05, + "loss": 10.5632, + "step": 5094 + }, + { + "epoch": 8.100158982511923, + "grad_norm": 2.0920316126414784, + "learning_rate": 4.864274801953705e-05, + "loss": 0.1298, + "step": 5095 + }, + { + "epoch": 8.10174880763116, + "grad_norm": 2.349893150327733, + "learning_rate": 4.864567300816731e-05, + "loss": 0.1386, + "step": 5096 + }, + { + "epoch": 8.103338632750397, + "grad_norm": 1.6148199090849726, + "learning_rate": 4.8648595070550316e-05, + "loss": 0.1654, + "step": 5097 + }, + { + "epoch": 8.104928457869635, + "grad_norm": 1.1855275938211247, + "learning_rate": 4.865151420569705e-05, + "loss": 0.1748, + "step": 5098 + }, + { + "epoch": 8.106518282988871, + "grad_norm": 3.757814379494921, + "learning_rate": 4.86544304126195e-05, + "loss": 0.2652, + "step": 5099 + }, + { + "epoch": 8.108108108108109, + "grad_norm": 2.0810210084667466, + "learning_rate": 4.865734369033062e-05, + "loss": 0.1435, + "step": 5100 + }, + { + "epoch": 8.109697933227345, + "grad_norm": 2.8327125865112275, + "learning_rate": 4.866025403784439e-05, + "loss": 0.0961, + "step": 5101 + }, + { + "epoch": 8.111287758346581, + "grad_norm": 2.186951042405222, + "learning_rate": 4.866316145417575e-05, + "loss": 0.0871, + "step": 5102 + }, + { + "epoch": 8.112877583465819, + "grad_norm": 3.074450709300555, + "learning_rate": 4.866606593834065e-05, + "loss": 0.1334, + "step": 5103 + }, + { + "epoch": 8.114467408585055, + "grad_norm": 2.5700805469993955, + "learning_rate": 4.8668967489356034e-05, + "loss": 0.1737, + "step": 5104 + }, + { + "epoch": 8.116057233704293, + "grad_norm": 1.6084743916483626, + "learning_rate": 4.867186610623981e-05, + "loss": 0.1386, + "step": 5105 + }, + { + "epoch": 8.117647058823529, + "grad_norm": 3.3816783243757107, + "learning_rate": 4.867476178801093e-05, + "loss": 0.1834, + "step": 5106 + }, + { + "epoch": 8.119236883942767, + "grad_norm": 2.253987751977176, + "learning_rate": 4.867765453368929e-05, + "loss": 0.2127, + "step": 5107 + }, + { + "epoch": 8.120826709062003, + "grad_norm": 2.025088584386359, + "learning_rate": 4.86805443422958e-05, + "loss": 0.1535, + "step": 5108 + }, + { + "epoch": 8.12241653418124, + "grad_norm": 2.175575740413713, + "learning_rate": 4.868343121285238e-05, + "loss": 0.1717, + "step": 5109 + }, + { + "epoch": 8.124006359300477, + "grad_norm": 3.068379900181183, + "learning_rate": 4.868631514438191e-05, + "loss": 0.1306, + "step": 5110 + }, + { + "epoch": 8.125596184419713, + "grad_norm": 2.248948629379114, + "learning_rate": 4.8689196135908304e-05, + "loss": 0.2042, + "step": 5111 + }, + { + "epoch": 8.127186009538951, + "grad_norm": 2.0092609175527305, + "learning_rate": 4.869207418645643e-05, + "loss": 0.1786, + "step": 5112 + }, + { + "epoch": 8.128775834658187, + "grad_norm": 3.407706667274365, + "learning_rate": 4.869494929505219e-05, + "loss": 0.1982, + "step": 5113 + }, + { + "epoch": 8.130365659777425, + "grad_norm": 3.0147510131993767, + "learning_rate": 4.869782146072246e-05, + "loss": 0.1854, + "step": 5114 + }, + { + "epoch": 8.131955484896661, + "grad_norm": 1.5695947214518804, + "learning_rate": 4.87006906824951e-05, + "loss": 0.145, + "step": 5115 + }, + { + "epoch": 8.133545310015899, + "grad_norm": 4.833169820705534, + "learning_rate": 4.8703556959399e-05, + "loss": 0.2007, + "step": 5116 + }, + { + "epoch": 8.135135135135135, + "grad_norm": 3.9600644260696094, + "learning_rate": 4.8706420290464016e-05, + "loss": 0.1664, + "step": 5117 + }, + { + "epoch": 8.136724960254371, + "grad_norm": 3.9825442762113346, + "learning_rate": 4.870928067472103e-05, + "loss": 0.2354, + "step": 5118 + }, + { + "epoch": 8.138314785373609, + "grad_norm": 2.6415039032099052, + "learning_rate": 4.8712138111201895e-05, + "loss": 0.1143, + "step": 5119 + }, + { + "epoch": 8.139904610492845, + "grad_norm": 2.407983251857994, + "learning_rate": 4.871499259893947e-05, + "loss": 0.185, + "step": 5120 + }, + { + "epoch": 8.141494435612083, + "grad_norm": 3.023349539440283, + "learning_rate": 4.871784413696762e-05, + "loss": 0.1623, + "step": 5121 + }, + { + "epoch": 8.14308426073132, + "grad_norm": 3.3805192455978337, + "learning_rate": 4.872069272432121e-05, + "loss": 0.1775, + "step": 5122 + }, + { + "epoch": 8.144674085850557, + "grad_norm": 2.6870568389920764, + "learning_rate": 4.8723538360036077e-05, + "loss": 0.1519, + "step": 5123 + }, + { + "epoch": 8.146263910969793, + "grad_norm": 2.9914578830004386, + "learning_rate": 4.872638104314909e-05, + "loss": 0.1667, + "step": 5124 + }, + { + "epoch": 8.147853736089031, + "grad_norm": 1.6159279862744265, + "learning_rate": 4.87292207726981e-05, + "loss": 0.1182, + "step": 5125 + }, + { + "epoch": 8.149443561208267, + "grad_norm": 2.2214822011658906, + "learning_rate": 4.873205754772196e-05, + "loss": 0.1308, + "step": 5126 + }, + { + "epoch": 8.151033386327503, + "grad_norm": 3.5764076108523035, + "learning_rate": 4.8734891367260525e-05, + "loss": 0.2098, + "step": 5127 + }, + { + "epoch": 8.152623211446741, + "grad_norm": 6.02222529069119, + "learning_rate": 4.8737722230354655e-05, + "loss": 0.2569, + "step": 5128 + }, + { + "epoch": 8.154213036565977, + "grad_norm": 7.065109609755068, + "learning_rate": 4.87405501360462e-05, + "loss": 0.1658, + "step": 5129 + }, + { + "epoch": 8.155802861685215, + "grad_norm": 39.89551919826817, + "learning_rate": 4.874337508337801e-05, + "loss": 3.2823, + "step": 5130 + }, + { + "epoch": 8.157392686804451, + "grad_norm": 7.973781235316152, + "learning_rate": 4.874619707139396e-05, + "loss": 0.1853, + "step": 5131 + }, + { + "epoch": 8.15898251192369, + "grad_norm": 6.329582590176935, + "learning_rate": 4.87490160991389e-05, + "loss": 0.1553, + "step": 5132 + }, + { + "epoch": 8.160572337042925, + "grad_norm": 6.858028108699794, + "learning_rate": 4.875183216565868e-05, + "loss": 0.1701, + "step": 5133 + }, + { + "epoch": 8.162162162162161, + "grad_norm": 5.758975850370503, + "learning_rate": 4.875464527000018e-05, + "loss": 0.1555, + "step": 5134 + }, + { + "epoch": 8.1637519872814, + "grad_norm": 5.741301440435139, + "learning_rate": 4.8757455411211266e-05, + "loss": 0.2389, + "step": 5135 + }, + { + "epoch": 8.165341812400635, + "grad_norm": 7.332041417984684, + "learning_rate": 4.876026258834079e-05, + "loss": 0.169, + "step": 5136 + }, + { + "epoch": 8.166931637519873, + "grad_norm": 4.068308988177857, + "learning_rate": 4.8763066800438635e-05, + "loss": 0.1618, + "step": 5137 + }, + { + "epoch": 8.16852146263911, + "grad_norm": 5.986395387747771, + "learning_rate": 4.876586804655568e-05, + "loss": 0.8942, + "step": 5138 + }, + { + "epoch": 8.170111287758347, + "grad_norm": 6.346919712646774, + "learning_rate": 4.8768666325743806e-05, + "loss": 0.1728, + "step": 5139 + }, + { + "epoch": 8.171701112877583, + "grad_norm": 5.21306762001224, + "learning_rate": 4.877146163705589e-05, + "loss": 0.1367, + "step": 5140 + }, + { + "epoch": 8.17329093799682, + "grad_norm": 19.44706709532741, + "learning_rate": 4.877425397954583e-05, + "loss": 2.1659, + "step": 5141 + }, + { + "epoch": 8.174880763116057, + "grad_norm": 4.163740128320601, + "learning_rate": 4.8777043352268495e-05, + "loss": 0.1362, + "step": 5142 + }, + { + "epoch": 8.176470588235293, + "grad_norm": 5.3487394858397375, + "learning_rate": 4.877982975427981e-05, + "loss": 0.1857, + "step": 5143 + }, + { + "epoch": 8.178060413354531, + "grad_norm": 8.437431986420682, + "learning_rate": 4.878261318463667e-05, + "loss": 0.1618, + "step": 5144 + }, + { + "epoch": 8.179650238473767, + "grad_norm": 4.069851437525765, + "learning_rate": 4.878539364239697e-05, + "loss": 0.1722, + "step": 5145 + }, + { + "epoch": 8.181240063593005, + "grad_norm": 3.259277439054276, + "learning_rate": 4.878817112661966e-05, + "loss": 0.1655, + "step": 5146 + }, + { + "epoch": 8.182829888712241, + "grad_norm": 4.124615043932462, + "learning_rate": 4.879094563636463e-05, + "loss": 0.1451, + "step": 5147 + }, + { + "epoch": 8.18441971383148, + "grad_norm": 7.654023684998529, + "learning_rate": 4.879371717069282e-05, + "loss": 0.162, + "step": 5148 + }, + { + "epoch": 8.186009538950715, + "grad_norm": 3.6814200714536116, + "learning_rate": 4.879648572866616e-05, + "loss": 0.1129, + "step": 5149 + }, + { + "epoch": 8.187599364069952, + "grad_norm": 2.6310844814604986, + "learning_rate": 4.879925130934761e-05, + "loss": 0.182, + "step": 5150 + }, + { + "epoch": 8.18918918918919, + "grad_norm": 4.224656188466838, + "learning_rate": 4.880201391180111e-05, + "loss": 0.1939, + "step": 5151 + }, + { + "epoch": 8.190779014308426, + "grad_norm": 8.199392219149974, + "learning_rate": 4.880477353509162e-05, + "loss": 0.2188, + "step": 5152 + }, + { + "epoch": 8.192368839427663, + "grad_norm": 3.037337703526505, + "learning_rate": 4.8807530178285106e-05, + "loss": 0.1805, + "step": 5153 + }, + { + "epoch": 8.1939586645469, + "grad_norm": 4.958204366187704, + "learning_rate": 4.881028384044855e-05, + "loss": 0.2067, + "step": 5154 + }, + { + "epoch": 8.195548489666137, + "grad_norm": 2.94269733282517, + "learning_rate": 4.8813034520649924e-05, + "loss": 0.2347, + "step": 5155 + }, + { + "epoch": 8.197138314785374, + "grad_norm": 4.692347114989652, + "learning_rate": 4.881578221795823e-05, + "loss": 0.2012, + "step": 5156 + }, + { + "epoch": 8.19872813990461, + "grad_norm": 8.101816048744546, + "learning_rate": 4.8818526931443485e-05, + "loss": 0.191, + "step": 5157 + }, + { + "epoch": 8.200317965023848, + "grad_norm": 5.770573230813346, + "learning_rate": 4.882126866017668e-05, + "loss": 0.1651, + "step": 5158 + }, + { + "epoch": 8.201907790143084, + "grad_norm": 3.7366074780925067, + "learning_rate": 4.8824007403229856e-05, + "loss": 0.1512, + "step": 5159 + }, + { + "epoch": 8.203497615262322, + "grad_norm": 6.734892775608437, + "learning_rate": 4.882674315967604e-05, + "loss": 0.1976, + "step": 5160 + }, + { + "epoch": 8.205087440381558, + "grad_norm": 7.637269689687009, + "learning_rate": 4.882947592858927e-05, + "loss": 0.304, + "step": 5161 + }, + { + "epoch": 8.206677265500796, + "grad_norm": 97.90059977319034, + "learning_rate": 4.8832205709044616e-05, + "loss": 4.642, + "step": 5162 + }, + { + "epoch": 8.208267090620032, + "grad_norm": 7.6042165185828985, + "learning_rate": 4.8834932500118145e-05, + "loss": 1.398, + "step": 5163 + }, + { + "epoch": 8.20985691573927, + "grad_norm": 3.558600878836077, + "learning_rate": 4.8837656300886934e-05, + "loss": 0.1384, + "step": 5164 + }, + { + "epoch": 8.211446740858506, + "grad_norm": 2.722214540350977, + "learning_rate": 4.884037711042907e-05, + "loss": 0.1643, + "step": 5165 + }, + { + "epoch": 8.213036565977742, + "grad_norm": 4.840202520019953, + "learning_rate": 4.884309492782367e-05, + "loss": 0.1969, + "step": 5166 + }, + { + "epoch": 8.21462639109698, + "grad_norm": 6.370580431527253, + "learning_rate": 4.884580975215084e-05, + "loss": 0.207, + "step": 5167 + }, + { + "epoch": 8.216216216216216, + "grad_norm": 5.000963481905595, + "learning_rate": 4.884852158249171e-05, + "loss": 0.1368, + "step": 5168 + }, + { + "epoch": 8.217806041335454, + "grad_norm": 2.4877380733947105, + "learning_rate": 4.8851230417928434e-05, + "loss": 0.163, + "step": 5169 + }, + { + "epoch": 8.21939586645469, + "grad_norm": 2.669421997818057, + "learning_rate": 4.8853936257544164e-05, + "loss": 0.1617, + "step": 5170 + }, + { + "epoch": 8.220985691573928, + "grad_norm": 11.155057582548519, + "learning_rate": 4.885663910042306e-05, + "loss": 0.1643, + "step": 5171 + }, + { + "epoch": 8.222575516693164, + "grad_norm": 5.970463845638016, + "learning_rate": 4.885933894565032e-05, + "loss": 0.1395, + "step": 5172 + }, + { + "epoch": 8.2241653418124, + "grad_norm": 5.394771150309761, + "learning_rate": 4.886203579231215e-05, + "loss": 0.1382, + "step": 5173 + }, + { + "epoch": 8.225755166931638, + "grad_norm": 4.188564426935115, + "learning_rate": 4.886472963949575e-05, + "loss": 0.2, + "step": 5174 + }, + { + "epoch": 8.227344992050874, + "grad_norm": 8.931463146857507, + "learning_rate": 4.8867420486289354e-05, + "loss": 0.1801, + "step": 5175 + }, + { + "epoch": 8.228934817170112, + "grad_norm": 8.833761808555167, + "learning_rate": 4.887010833178222e-05, + "loss": 0.1681, + "step": 5176 + }, + { + "epoch": 8.230524642289348, + "grad_norm": 3.9647504007996597, + "learning_rate": 4.88727931750646e-05, + "loss": 0.1298, + "step": 5177 + }, + { + "epoch": 8.232114467408586, + "grad_norm": 6.044575551671389, + "learning_rate": 4.8875475015227757e-05, + "loss": 0.2581, + "step": 5178 + }, + { + "epoch": 8.233704292527822, + "grad_norm": 6.829881060934325, + "learning_rate": 4.887815385136402e-05, + "loss": 0.1676, + "step": 5179 + }, + { + "epoch": 8.235294117647058, + "grad_norm": 6.391502807997331, + "learning_rate": 4.888082968256666e-05, + "loss": 0.1981, + "step": 5180 + }, + { + "epoch": 8.236883942766296, + "grad_norm": 5.3806315264486635, + "learning_rate": 4.888350250793004e-05, + "loss": 0.1741, + "step": 5181 + }, + { + "epoch": 8.238473767885532, + "grad_norm": 4.317727754970113, + "learning_rate": 4.8886172326549487e-05, + "loss": 0.1752, + "step": 5182 + }, + { + "epoch": 8.24006359300477, + "grad_norm": 8.512217022928274, + "learning_rate": 4.8888839137521374e-05, + "loss": 0.2635, + "step": 5183 + }, + { + "epoch": 8.241653418124006, + "grad_norm": 10.718361276680985, + "learning_rate": 4.8891502939943066e-05, + "loss": 0.258, + "step": 5184 + }, + { + "epoch": 8.243243243243244, + "grad_norm": 4.736729915587201, + "learning_rate": 4.889416373291298e-05, + "loss": 0.2422, + "step": 5185 + }, + { + "epoch": 8.24483306836248, + "grad_norm": 6.0482217978997435, + "learning_rate": 4.889682151553052e-05, + "loss": 0.1838, + "step": 5186 + }, + { + "epoch": 8.246422893481718, + "grad_norm": 4.962444063929784, + "learning_rate": 4.889947628689613e-05, + "loss": 0.1829, + "step": 5187 + }, + { + "epoch": 8.248012718600954, + "grad_norm": 8.520293487623997, + "learning_rate": 4.8902128046111265e-05, + "loss": 0.2805, + "step": 5188 + }, + { + "epoch": 8.24960254372019, + "grad_norm": 3.8317577871902797, + "learning_rate": 4.890477679227841e-05, + "loss": 0.1693, + "step": 5189 + }, + { + "epoch": 8.251192368839428, + "grad_norm": 4.882498688998951, + "learning_rate": 4.8907422524501035e-05, + "loss": 0.2435, + "step": 5190 + }, + { + "epoch": 8.252782193958664, + "grad_norm": 3.945022711690706, + "learning_rate": 4.891006524188368e-05, + "loss": 0.1708, + "step": 5191 + }, + { + "epoch": 8.254372019077902, + "grad_norm": 16.06430289263403, + "learning_rate": 4.8912704943531875e-05, + "loss": 1.4076, + "step": 5192 + }, + { + "epoch": 8.255961844197138, + "grad_norm": 5.873966633531453, + "learning_rate": 4.891534162855217e-05, + "loss": 0.1456, + "step": 5193 + }, + { + "epoch": 8.257551669316376, + "grad_norm": 6.866517041196331, + "learning_rate": 4.8917975296052143e-05, + "loss": 0.1769, + "step": 5194 + }, + { + "epoch": 8.259141494435612, + "grad_norm": 6.650230467671032, + "learning_rate": 4.89206059451404e-05, + "loss": 0.1805, + "step": 5195 + }, + { + "epoch": 8.260731319554848, + "grad_norm": 3.5341219459840283, + "learning_rate": 4.892323357492656e-05, + "loss": 0.1161, + "step": 5196 + }, + { + "epoch": 8.262321144674086, + "grad_norm": 14.013213368948511, + "learning_rate": 4.892585818452126e-05, + "loss": 0.5632, + "step": 5197 + }, + { + "epoch": 8.263910969793322, + "grad_norm": 6.1331426128261555, + "learning_rate": 4.892847977303617e-05, + "loss": 0.1288, + "step": 5198 + }, + { + "epoch": 8.26550079491256, + "grad_norm": 3.9631569790210857, + "learning_rate": 4.893109833958397e-05, + "loss": 0.1559, + "step": 5199 + }, + { + "epoch": 8.267090620031796, + "grad_norm": 5.164803340192376, + "learning_rate": 4.893371388327838e-05, + "loss": 0.1723, + "step": 5200 + }, + { + "epoch": 8.268680445151034, + "grad_norm": 3.1909787563243364, + "learning_rate": 4.893632640323412e-05, + "loss": 0.1792, + "step": 5201 + }, + { + "epoch": 8.27027027027027, + "grad_norm": 4.00174147756515, + "learning_rate": 4.893893589856696e-05, + "loss": 0.155, + "step": 5202 + }, + { + "epoch": 8.271860095389506, + "grad_norm": 5.260586429382655, + "learning_rate": 4.8941542368393684e-05, + "loss": 0.1551, + "step": 5203 + }, + { + "epoch": 8.273449920508744, + "grad_norm": 3.494809418478414, + "learning_rate": 4.894414581183208e-05, + "loss": 0.2013, + "step": 5204 + }, + { + "epoch": 8.27503974562798, + "grad_norm": 5.838286594044415, + "learning_rate": 4.8946746228000984e-05, + "loss": 0.1372, + "step": 5205 + }, + { + "epoch": 8.276629570747218, + "grad_norm": 5.196317897579684, + "learning_rate": 4.894934361602025e-05, + "loss": 0.1405, + "step": 5206 + }, + { + "epoch": 8.278219395866454, + "grad_norm": 5.856442941594216, + "learning_rate": 4.895193797501076e-05, + "loss": 0.1788, + "step": 5207 + }, + { + "epoch": 8.279809220985692, + "grad_norm": 5.5449694105474565, + "learning_rate": 4.895452930409441e-05, + "loss": 0.2566, + "step": 5208 + }, + { + "epoch": 8.281399046104928, + "grad_norm": 87.86077550014348, + "learning_rate": 4.895711760239414e-05, + "loss": 4.4801, + "step": 5209 + }, + { + "epoch": 8.282988871224166, + "grad_norm": 9.590915178282932, + "learning_rate": 4.895970286903388e-05, + "loss": 0.1611, + "step": 5210 + }, + { + "epoch": 8.284578696343402, + "grad_norm": 3.128102702413848, + "learning_rate": 4.896228510313864e-05, + "loss": 0.1846, + "step": 5211 + }, + { + "epoch": 8.286168521462638, + "grad_norm": 6.567415604785429, + "learning_rate": 4.896486430383441e-05, + "loss": 0.1966, + "step": 5212 + }, + { + "epoch": 8.287758346581876, + "grad_norm": 7.721039401642896, + "learning_rate": 4.896744047024823e-05, + "loss": 0.2279, + "step": 5213 + }, + { + "epoch": 8.289348171701112, + "grad_norm": 2.948921880715489, + "learning_rate": 4.897001360150816e-05, + "loss": 0.2146, + "step": 5214 + }, + { + "epoch": 8.29093799682035, + "grad_norm": 5.659402234850209, + "learning_rate": 4.897258369674329e-05, + "loss": 0.27, + "step": 5215 + }, + { + "epoch": 8.292527821939586, + "grad_norm": 4.559841348103803, + "learning_rate": 4.897515075508373e-05, + "loss": 0.1553, + "step": 5216 + }, + { + "epoch": 8.294117647058824, + "grad_norm": 6.706544222161147, + "learning_rate": 4.897771477566063e-05, + "loss": 0.1886, + "step": 5217 + }, + { + "epoch": 8.29570747217806, + "grad_norm": 4.548358254777338, + "learning_rate": 4.898027575760616e-05, + "loss": 0.1829, + "step": 5218 + }, + { + "epoch": 8.297297297297296, + "grad_norm": 5.1247578637065825, + "learning_rate": 4.898283370005352e-05, + "loss": 0.2366, + "step": 5219 + }, + { + "epoch": 8.298887122416534, + "grad_norm": 4.571321321419958, + "learning_rate": 4.898538860213694e-05, + "loss": 0.1594, + "step": 5220 + }, + { + "epoch": 8.30047694753577, + "grad_norm": 4.537687256467799, + "learning_rate": 4.8987940462991673e-05, + "loss": 0.8997, + "step": 5221 + }, + { + "epoch": 8.302066772655008, + "grad_norm": 2.6697843199493025, + "learning_rate": 4.899048928175401e-05, + "loss": 0.1543, + "step": 5222 + }, + { + "epoch": 8.303656597774244, + "grad_norm": 2.756016651302308, + "learning_rate": 4.8993035057561275e-05, + "loss": 0.1856, + "step": 5223 + }, + { + "epoch": 8.305246422893482, + "grad_norm": 3.253289504201642, + "learning_rate": 4.899557778955181e-05, + "loss": 0.1276, + "step": 5224 + }, + { + "epoch": 8.306836248012718, + "grad_norm": 3.4433672875085204, + "learning_rate": 4.899811747686498e-05, + "loss": 0.1167, + "step": 5225 + }, + { + "epoch": 8.308426073131955, + "grad_norm": 2.824304003167212, + "learning_rate": 4.9000654118641216e-05, + "loss": 0.1651, + "step": 5226 + }, + { + "epoch": 8.310015898251192, + "grad_norm": 3.9514099250939707, + "learning_rate": 4.900318771402194e-05, + "loss": 0.1611, + "step": 5227 + }, + { + "epoch": 8.311605723370429, + "grad_norm": 4.190345561846332, + "learning_rate": 4.900571826214962e-05, + "loss": 0.1512, + "step": 5228 + }, + { + "epoch": 8.313195548489666, + "grad_norm": 2.9013314500611087, + "learning_rate": 4.9008245762167774e-05, + "loss": 0.1627, + "step": 5229 + }, + { + "epoch": 8.314785373608903, + "grad_norm": 3.351260112020737, + "learning_rate": 4.901077021322092e-05, + "loss": 0.1923, + "step": 5230 + }, + { + "epoch": 8.31637519872814, + "grad_norm": 3.9189466201487284, + "learning_rate": 4.901329161445462e-05, + "loss": 0.1853, + "step": 5231 + }, + { + "epoch": 8.317965023847377, + "grad_norm": 3.0202256536078487, + "learning_rate": 4.901580996501549e-05, + "loss": 0.1247, + "step": 5232 + }, + { + "epoch": 8.319554848966614, + "grad_norm": 2.579485755274775, + "learning_rate": 4.901832526405114e-05, + "loss": 0.1665, + "step": 5233 + }, + { + "epoch": 8.32114467408585, + "grad_norm": 2.9397876030323453, + "learning_rate": 4.902083751071024e-05, + "loss": 0.1811, + "step": 5234 + }, + { + "epoch": 8.322734499205087, + "grad_norm": 4.4254710250530875, + "learning_rate": 4.902334670414249e-05, + "loss": 0.1398, + "step": 5235 + }, + { + "epoch": 8.324324324324325, + "grad_norm": 3.1932599757742643, + "learning_rate": 4.902585284349861e-05, + "loss": 0.1582, + "step": 5236 + }, + { + "epoch": 8.32591414944356, + "grad_norm": 4.634639633651532, + "learning_rate": 4.9028355927930364e-05, + "loss": 0.1292, + "step": 5237 + }, + { + "epoch": 8.327503974562799, + "grad_norm": 3.533237632194829, + "learning_rate": 4.9030855956590556e-05, + "loss": 0.1915, + "step": 5238 + }, + { + "epoch": 8.329093799682035, + "grad_norm": 2.7767241728853045, + "learning_rate": 4.903335292863301e-05, + "loss": 0.1131, + "step": 5239 + }, + { + "epoch": 8.330683624801273, + "grad_norm": 4.802771803266855, + "learning_rate": 4.90358468432126e-05, + "loss": 0.1367, + "step": 5240 + }, + { + "epoch": 8.332273449920509, + "grad_norm": 5.098031567717378, + "learning_rate": 4.9038337699485204e-05, + "loss": 0.1937, + "step": 5241 + }, + { + "epoch": 8.333863275039745, + "grad_norm": 4.737081570876735, + "learning_rate": 4.904082549660779e-05, + "loss": 0.2119, + "step": 5242 + }, + { + "epoch": 8.335453100158983, + "grad_norm": 2.9478429925511667, + "learning_rate": 4.90433102337383e-05, + "loss": 0.196, + "step": 5243 + }, + { + "epoch": 8.337042925278219, + "grad_norm": 3.8561220831821137, + "learning_rate": 4.904579191003576e-05, + "loss": 0.2372, + "step": 5244 + }, + { + "epoch": 8.338632750397457, + "grad_norm": 3.0099693909976035, + "learning_rate": 4.90482705246602e-05, + "loss": 0.151, + "step": 5245 + }, + { + "epoch": 8.340222575516693, + "grad_norm": 2.193109020057152, + "learning_rate": 4.90507460767727e-05, + "loss": 0.1978, + "step": 5246 + }, + { + "epoch": 8.34181240063593, + "grad_norm": 6.39258534503877, + "learning_rate": 4.905321856553539e-05, + "loss": 0.1537, + "step": 5247 + }, + { + "epoch": 8.343402225755167, + "grad_norm": 2.379677022393169, + "learning_rate": 4.90556879901114e-05, + "loss": 0.2022, + "step": 5248 + }, + { + "epoch": 8.344992050874405, + "grad_norm": 2.79748801443883, + "learning_rate": 4.905815434966493e-05, + "loss": 0.1312, + "step": 5249 + }, + { + "epoch": 8.34658187599364, + "grad_norm": 5.441808451721164, + "learning_rate": 4.906061764336121e-05, + "loss": 0.2791, + "step": 5250 + }, + { + "epoch": 8.348171701112877, + "grad_norm": 5.305065736541831, + "learning_rate": 4.90630778703665e-05, + "loss": 0.1901, + "step": 5251 + }, + { + "epoch": 8.349761526232115, + "grad_norm": 3.6061518577156493, + "learning_rate": 4.906553502984811e-05, + "loss": 0.1887, + "step": 5252 + }, + { + "epoch": 8.35135135135135, + "grad_norm": 2.960120012606837, + "learning_rate": 4.9067989120974365e-05, + "loss": 0.1636, + "step": 5253 + }, + { + "epoch": 8.352941176470589, + "grad_norm": 106.46024947762956, + "learning_rate": 4.907044014291465e-05, + "loss": 10.2831, + "step": 5254 + }, + { + "epoch": 8.354531001589825, + "grad_norm": 2.3653963338019817, + "learning_rate": 4.9072888094839395e-05, + "loss": 0.1272, + "step": 5255 + }, + { + "epoch": 8.356120826709063, + "grad_norm": 2.358956290349129, + "learning_rate": 4.9075332975920044e-05, + "loss": 0.1568, + "step": 5256 + }, + { + "epoch": 8.357710651828299, + "grad_norm": 1.9740540950788474, + "learning_rate": 4.9077774785329086e-05, + "loss": 0.1723, + "step": 5257 + }, + { + "epoch": 8.359300476947535, + "grad_norm": 2.5672380672896358, + "learning_rate": 4.908021352224008e-05, + "loss": 0.1723, + "step": 5258 + }, + { + "epoch": 8.360890302066773, + "grad_norm": 4.492768352970576, + "learning_rate": 4.908264918582759e-05, + "loss": 0.2534, + "step": 5259 + }, + { + "epoch": 8.362480127186009, + "grad_norm": 2.822903171506547, + "learning_rate": 4.9085081775267224e-05, + "loss": 0.2096, + "step": 5260 + }, + { + "epoch": 8.364069952305247, + "grad_norm": 22.37692774602424, + "learning_rate": 4.908751128973565e-05, + "loss": 1.7843, + "step": 5261 + }, + { + "epoch": 8.365659777424483, + "grad_norm": 2.0605519122002973, + "learning_rate": 4.908993772841055e-05, + "loss": 0.1181, + "step": 5262 + }, + { + "epoch": 8.36724960254372, + "grad_norm": 2.4204997321029498, + "learning_rate": 4.9092361090470686e-05, + "loss": 0.1931, + "step": 5263 + }, + { + "epoch": 8.368839427662957, + "grad_norm": 2.8684335333211433, + "learning_rate": 4.9094781375095826e-05, + "loss": 0.1759, + "step": 5264 + }, + { + "epoch": 8.370429252782195, + "grad_norm": 1.8659848105796324, + "learning_rate": 4.909719858146679e-05, + "loss": 0.1252, + "step": 5265 + }, + { + "epoch": 8.372019077901431, + "grad_norm": 5.849631051963039, + "learning_rate": 4.9099612708765436e-05, + "loss": 0.1978, + "step": 5266 + }, + { + "epoch": 8.373608903020667, + "grad_norm": 3.9283184986611266, + "learning_rate": 4.910202375617468e-05, + "loss": 0.1411, + "step": 5267 + }, + { + "epoch": 8.375198728139905, + "grad_norm": 3.497363741466084, + "learning_rate": 4.9104431722878465e-05, + "loss": 0.1743, + "step": 5268 + }, + { + "epoch": 8.376788553259141, + "grad_norm": 1.9346962150505627, + "learning_rate": 4.910683660806178e-05, + "loss": 0.1209, + "step": 5269 + }, + { + "epoch": 8.378378378378379, + "grad_norm": 3.7233080710675805, + "learning_rate": 4.9109238410910656e-05, + "loss": 0.226, + "step": 5270 + }, + { + "epoch": 8.379968203497615, + "grad_norm": 3.449231595526045, + "learning_rate": 4.911163713061217e-05, + "loss": 0.1833, + "step": 5271 + }, + { + "epoch": 8.381558028616853, + "grad_norm": 2.4162674074122474, + "learning_rate": 4.911403276635446e-05, + "loss": 0.188, + "step": 5272 + }, + { + "epoch": 8.383147853736089, + "grad_norm": 2.731341131133504, + "learning_rate": 4.911642531732667e-05, + "loss": 0.1493, + "step": 5273 + }, + { + "epoch": 8.384737678855325, + "grad_norm": 1.9543608791707343, + "learning_rate": 4.9118814782719e-05, + "loss": 0.1102, + "step": 5274 + }, + { + "epoch": 8.386327503974563, + "grad_norm": 3.0925080207048676, + "learning_rate": 4.9121201161722736e-05, + "loss": 0.1456, + "step": 5275 + }, + { + "epoch": 8.3879173290938, + "grad_norm": 2.8240128829409863, + "learning_rate": 4.9123584453530146e-05, + "loss": 0.2029, + "step": 5276 + }, + { + "epoch": 8.389507154213037, + "grad_norm": 2.616092093590299, + "learning_rate": 4.912596465733458e-05, + "loss": 0.1793, + "step": 5277 + }, + { + "epoch": 8.391096979332273, + "grad_norm": 3.059673638547035, + "learning_rate": 4.912834177233043e-05, + "loss": 0.2162, + "step": 5278 + }, + { + "epoch": 8.392686804451511, + "grad_norm": 3.3904397680155536, + "learning_rate": 4.913071579771313e-05, + "loss": 0.1711, + "step": 5279 + }, + { + "epoch": 8.394276629570747, + "grad_norm": 3.024704874453519, + "learning_rate": 4.913308673267914e-05, + "loss": 0.1548, + "step": 5280 + }, + { + "epoch": 8.395866454689983, + "grad_norm": 14.650792106555212, + "learning_rate": 4.913545457642601e-05, + "loss": 0.6741, + "step": 5281 + }, + { + "epoch": 8.397456279809221, + "grad_norm": 3.6219158360054813, + "learning_rate": 4.9137819328152295e-05, + "loss": 0.2102, + "step": 5282 + }, + { + "epoch": 8.399046104928457, + "grad_norm": 2.671013271501224, + "learning_rate": 4.914018098705762e-05, + "loss": 0.1762, + "step": 5283 + }, + { + "epoch": 8.400635930047695, + "grad_norm": 1.8485907469218965, + "learning_rate": 4.914253955234264e-05, + "loss": 0.1846, + "step": 5284 + }, + { + "epoch": 8.402225755166931, + "grad_norm": 3.5859817072984863, + "learning_rate": 4.914489502320907e-05, + "loss": 0.2096, + "step": 5285 + }, + { + "epoch": 8.40381558028617, + "grad_norm": 2.610210003210112, + "learning_rate": 4.9147247398859674e-05, + "loss": 0.1728, + "step": 5286 + }, + { + "epoch": 8.405405405405405, + "grad_norm": 2.8423993764542432, + "learning_rate": 4.914959667849825e-05, + "loss": 0.1531, + "step": 5287 + }, + { + "epoch": 8.406995230524643, + "grad_norm": 3.704948637808435, + "learning_rate": 4.915194286132966e-05, + "loss": 0.1463, + "step": 5288 + }, + { + "epoch": 8.40858505564388, + "grad_norm": 2.430679407619619, + "learning_rate": 4.9154285946559797e-05, + "loss": 0.1291, + "step": 5289 + }, + { + "epoch": 8.410174880763115, + "grad_norm": 2.22283240924879, + "learning_rate": 4.915662593339561e-05, + "loss": 0.1409, + "step": 5290 + }, + { + "epoch": 8.411764705882353, + "grad_norm": 1.9433183171946475, + "learning_rate": 4.915896282104511e-05, + "loss": 0.1428, + "step": 5291 + }, + { + "epoch": 8.41335453100159, + "grad_norm": 2.81413126835575, + "learning_rate": 4.916129660871734e-05, + "loss": 0.1233, + "step": 5292 + }, + { + "epoch": 8.414944356120827, + "grad_norm": 2.978531311579743, + "learning_rate": 4.9163627295622405e-05, + "loss": 0.1682, + "step": 5293 + }, + { + "epoch": 8.416534181240063, + "grad_norm": 2.1283200649221063, + "learning_rate": 4.916595488097143e-05, + "loss": 0.1523, + "step": 5294 + }, + { + "epoch": 8.418124006359301, + "grad_norm": 2.1782933843273535, + "learning_rate": 4.916827936397663e-05, + "loss": 0.2392, + "step": 5295 + }, + { + "epoch": 8.419713831478537, + "grad_norm": 3.2287932142068714, + "learning_rate": 4.917060074385125e-05, + "loss": 0.0957, + "step": 5296 + }, + { + "epoch": 8.421303656597773, + "grad_norm": 1.8983814782553385, + "learning_rate": 4.9172919019809577e-05, + "loss": 0.1129, + "step": 5297 + }, + { + "epoch": 8.422893481717011, + "grad_norm": 2.043028107481882, + "learning_rate": 4.917523419106696e-05, + "loss": 0.1512, + "step": 5298 + }, + { + "epoch": 8.424483306836247, + "grad_norm": 1.6292407051523936, + "learning_rate": 4.917754625683982e-05, + "loss": 0.1166, + "step": 5299 + }, + { + "epoch": 8.426073131955485, + "grad_norm": 3.2009921697792265, + "learning_rate": 4.9179855216345574e-05, + "loss": 0.1532, + "step": 5300 + }, + { + "epoch": 8.427662957074721, + "grad_norm": 2.5046091260967382, + "learning_rate": 4.918216106880274e-05, + "loss": 0.162, + "step": 5301 + }, + { + "epoch": 8.42925278219396, + "grad_norm": 1.4477122186233438, + "learning_rate": 4.9184463813430874e-05, + "loss": 0.1294, + "step": 5302 + }, + { + "epoch": 8.430842607313195, + "grad_norm": 2.0808443001759307, + "learning_rate": 4.918676344945057e-05, + "loss": 0.1387, + "step": 5303 + }, + { + "epoch": 8.432432432432432, + "grad_norm": 3.4722817572399736, + "learning_rate": 4.918905997608349e-05, + "loss": 0.1169, + "step": 5304 + }, + { + "epoch": 8.43402225755167, + "grad_norm": 1.7834210535188175, + "learning_rate": 4.919135339255235e-05, + "loss": 0.1514, + "step": 5305 + }, + { + "epoch": 8.435612082670906, + "grad_norm": 3.549258419295536, + "learning_rate": 4.9193643698080896e-05, + "loss": 0.1257, + "step": 5306 + }, + { + "epoch": 8.437201907790143, + "grad_norm": 2.135335086993168, + "learning_rate": 4.919593089189395e-05, + "loss": 0.1251, + "step": 5307 + }, + { + "epoch": 8.43879173290938, + "grad_norm": 2.7324989244674476, + "learning_rate": 4.919821497321738e-05, + "loss": 0.1355, + "step": 5308 + }, + { + "epoch": 8.440381558028617, + "grad_norm": 3.108249698821315, + "learning_rate": 4.920049594127811e-05, + "loss": 0.0847, + "step": 5309 + }, + { + "epoch": 8.441971383147854, + "grad_norm": 3.8324659117139523, + "learning_rate": 4.9202773795304105e-05, + "loss": 0.1442, + "step": 5310 + }, + { + "epoch": 8.443561208267091, + "grad_norm": 2.9114297254820447, + "learning_rate": 4.92050485345244e-05, + "loss": 0.1605, + "step": 5311 + }, + { + "epoch": 8.445151033386328, + "grad_norm": 2.2575344297921354, + "learning_rate": 4.9207320158169085e-05, + "loss": 0.1834, + "step": 5312 + }, + { + "epoch": 8.446740858505564, + "grad_norm": 4.46271290209314, + "learning_rate": 4.920958866546929e-05, + "loss": 0.1266, + "step": 5313 + }, + { + "epoch": 8.448330683624802, + "grad_norm": 2.632949746560885, + "learning_rate": 4.921185405565721e-05, + "loss": 0.1703, + "step": 5314 + }, + { + "epoch": 8.449920508744038, + "grad_norm": 1.5292550860867795, + "learning_rate": 4.92141163279661e-05, + "loss": 0.1575, + "step": 5315 + }, + { + "epoch": 8.451510333863276, + "grad_norm": 3.6924596501061937, + "learning_rate": 4.9216375481630235e-05, + "loss": 0.1751, + "step": 5316 + }, + { + "epoch": 8.453100158982512, + "grad_norm": 2.8556549031307843, + "learning_rate": 4.921863151588501e-05, + "loss": 0.197, + "step": 5317 + }, + { + "epoch": 8.45468998410175, + "grad_norm": 3.4569292758391654, + "learning_rate": 4.922088442996681e-05, + "loss": 0.1694, + "step": 5318 + }, + { + "epoch": 8.456279809220986, + "grad_norm": 4.75826638250189, + "learning_rate": 4.922313422311312e-05, + "loss": 0.1698, + "step": 5319 + }, + { + "epoch": 8.457869634340222, + "grad_norm": 3.0096040373583457, + "learning_rate": 4.922538089456246e-05, + "loss": 0.1436, + "step": 5320 + }, + { + "epoch": 8.45945945945946, + "grad_norm": 4.06178144854372, + "learning_rate": 4.922762444355443e-05, + "loss": 0.2772, + "step": 5321 + }, + { + "epoch": 8.461049284578696, + "grad_norm": 5.60862366425306, + "learning_rate": 4.922986486932964e-05, + "loss": 0.1543, + "step": 5322 + }, + { + "epoch": 8.462639109697934, + "grad_norm": 3.9315973099604724, + "learning_rate": 4.923210217112981e-05, + "loss": 0.1645, + "step": 5323 + }, + { + "epoch": 8.46422893481717, + "grad_norm": 5.064737565771541, + "learning_rate": 4.923433634819769e-05, + "loss": 0.1903, + "step": 5324 + }, + { + "epoch": 8.465818759936408, + "grad_norm": 2.4867557842249366, + "learning_rate": 4.9236567399777086e-05, + "loss": 0.1671, + "step": 5325 + }, + { + "epoch": 8.467408585055644, + "grad_norm": 3.2372233415935203, + "learning_rate": 4.923879532511287e-05, + "loss": 0.1632, + "step": 5326 + }, + { + "epoch": 8.46899841017488, + "grad_norm": 3.1878006677521937, + "learning_rate": 4.924102012345097e-05, + "loss": 0.1411, + "step": 5327 + }, + { + "epoch": 8.470588235294118, + "grad_norm": 1.7662965155768184, + "learning_rate": 4.924324179403838e-05, + "loss": 0.1179, + "step": 5328 + }, + { + "epoch": 8.472178060413354, + "grad_norm": 3.3278216161135963, + "learning_rate": 4.9245460336123136e-05, + "loss": 0.123, + "step": 5329 + }, + { + "epoch": 8.473767885532592, + "grad_norm": 4.8320227480986295, + "learning_rate": 4.924767574895434e-05, + "loss": 0.2014, + "step": 5330 + }, + { + "epoch": 8.475357710651828, + "grad_norm": 3.1957688492147396, + "learning_rate": 4.9249888031782165e-05, + "loss": 0.1424, + "step": 5331 + }, + { + "epoch": 8.476947535771066, + "grad_norm": 2.170324698454584, + "learning_rate": 4.925209718385782e-05, + "loss": 0.134, + "step": 5332 + }, + { + "epoch": 8.478537360890302, + "grad_norm": 4.772899770568751, + "learning_rate": 4.9254303204433606e-05, + "loss": 0.1616, + "step": 5333 + }, + { + "epoch": 8.48012718600954, + "grad_norm": 2.107926589327192, + "learning_rate": 4.925650609276284e-05, + "loss": 0.1931, + "step": 5334 + }, + { + "epoch": 8.481717011128776, + "grad_norm": 5.4083502728477715, + "learning_rate": 4.925870584809995e-05, + "loss": 0.1252, + "step": 5335 + }, + { + "epoch": 8.483306836248012, + "grad_norm": 2.563024889302822, + "learning_rate": 4.926090246970038e-05, + "loss": 0.1586, + "step": 5336 + }, + { + "epoch": 8.48489666136725, + "grad_norm": 3.0801614223906215, + "learning_rate": 4.926309595682066e-05, + "loss": 0.1363, + "step": 5337 + }, + { + "epoch": 8.486486486486486, + "grad_norm": 3.5371272781998018, + "learning_rate": 4.9265286308718375e-05, + "loss": 0.249, + "step": 5338 + }, + { + "epoch": 8.488076311605724, + "grad_norm": 3.4570654302383876, + "learning_rate": 4.926747352465217e-05, + "loss": 0.1191, + "step": 5339 + }, + { + "epoch": 8.48966613672496, + "grad_norm": 3.8388216274033065, + "learning_rate": 4.926965760388175e-05, + "loss": 0.1553, + "step": 5340 + }, + { + "epoch": 8.491255961844198, + "grad_norm": 2.372808241279813, + "learning_rate": 4.9271838545667876e-05, + "loss": 0.1434, + "step": 5341 + }, + { + "epoch": 8.492845786963434, + "grad_norm": 1.8989263663178342, + "learning_rate": 4.9274016349272396e-05, + "loss": 0.1775, + "step": 5342 + }, + { + "epoch": 8.49443561208267, + "grad_norm": 3.247566368898396, + "learning_rate": 4.927619101395818e-05, + "loss": 0.1867, + "step": 5343 + }, + { + "epoch": 8.496025437201908, + "grad_norm": 3.1951971134236548, + "learning_rate": 4.92783625389892e-05, + "loss": 0.1341, + "step": 5344 + }, + { + "epoch": 8.497615262321144, + "grad_norm": 2.4354308287264854, + "learning_rate": 4.928053092363047e-05, + "loss": 0.1759, + "step": 5345 + }, + { + "epoch": 8.499205087440382, + "grad_norm": 4.419238405218485, + "learning_rate": 4.928269616714807e-05, + "loss": 0.1728, + "step": 5346 + }, + { + "epoch": 8.500794912559618, + "grad_norm": 3.7601825096158237, + "learning_rate": 4.9284858268809136e-05, + "loss": 0.1405, + "step": 5347 + }, + { + "epoch": 8.502384737678856, + "grad_norm": 3.311300250270439, + "learning_rate": 4.9287017227881886e-05, + "loss": 0.1432, + "step": 5348 + }, + { + "epoch": 8.503974562798092, + "grad_norm": 1.6282049387074289, + "learning_rate": 4.928917304363558e-05, + "loss": 0.1522, + "step": 5349 + }, + { + "epoch": 8.505564387917328, + "grad_norm": 1.4853614179252492, + "learning_rate": 4.929132571534057e-05, + "loss": 0.126, + "step": 5350 + }, + { + "epoch": 8.507154213036566, + "grad_norm": 3.136041163491659, + "learning_rate": 4.9293475242268225e-05, + "loss": 0.1503, + "step": 5351 + }, + { + "epoch": 8.508744038155802, + "grad_norm": 1.6306641668795792, + "learning_rate": 4.9295621623691034e-05, + "loss": 0.1331, + "step": 5352 + }, + { + "epoch": 8.51033386327504, + "grad_norm": 2.4338045323218376, + "learning_rate": 4.9297764858882514e-05, + "loss": 0.1244, + "step": 5353 + }, + { + "epoch": 8.511923688394276, + "grad_norm": 3.102727274921048, + "learning_rate": 4.9299904947117266e-05, + "loss": 0.1602, + "step": 5354 + }, + { + "epoch": 8.513513513513514, + "grad_norm": 1.6165132152331934, + "learning_rate": 4.9302041887670934e-05, + "loss": 0.2094, + "step": 5355 + }, + { + "epoch": 8.51510333863275, + "grad_norm": 4.329370257804166, + "learning_rate": 4.930417567982025e-05, + "loss": 0.2149, + "step": 5356 + }, + { + "epoch": 8.516693163751988, + "grad_norm": 1.778164170316947, + "learning_rate": 4.9306306322843e-05, + "loss": 0.1159, + "step": 5357 + }, + { + "epoch": 8.518282988871224, + "grad_norm": 1.516082035650747, + "learning_rate": 4.930843381601804e-05, + "loss": 0.1533, + "step": 5358 + }, + { + "epoch": 8.51987281399046, + "grad_norm": 3.4319479040109218, + "learning_rate": 4.931055815862528e-05, + "loss": 0.1578, + "step": 5359 + }, + { + "epoch": 8.521462639109698, + "grad_norm": 3.0463594935694016, + "learning_rate": 4.931267934994573e-05, + "loss": 0.1202, + "step": 5360 + }, + { + "epoch": 8.523052464228934, + "grad_norm": 5.256050008214594, + "learning_rate": 4.931479738926143e-05, + "loss": 0.1299, + "step": 5361 + }, + { + "epoch": 8.524642289348172, + "grad_norm": 4.2834923668719105, + "learning_rate": 4.931691227585549e-05, + "loss": 0.2076, + "step": 5362 + }, + { + "epoch": 8.526232114467408, + "grad_norm": 7.108533742635488, + "learning_rate": 4.931902400901212e-05, + "loss": 0.1595, + "step": 5363 + }, + { + "epoch": 8.527821939586646, + "grad_norm": 4.41908977854237, + "learning_rate": 4.932113258801655e-05, + "loss": 0.1426, + "step": 5364 + }, + { + "epoch": 8.529411764705882, + "grad_norm": 4.699906809807721, + "learning_rate": 4.9323238012155126e-05, + "loss": 0.229, + "step": 5365 + }, + { + "epoch": 8.53100158982512, + "grad_norm": 5.885687841813832, + "learning_rate": 4.9325340280715226e-05, + "loss": 0.2433, + "step": 5366 + }, + { + "epoch": 8.532591414944356, + "grad_norm": 2.6055410330467166, + "learning_rate": 4.93274393929853e-05, + "loss": 0.1362, + "step": 5367 + }, + { + "epoch": 8.534181240063592, + "grad_norm": 5.986865662664183, + "learning_rate": 4.932953534825489e-05, + "loss": 0.1654, + "step": 5368 + }, + { + "epoch": 8.53577106518283, + "grad_norm": 2.49943389831294, + "learning_rate": 4.9331628145814584e-05, + "loss": 0.1833, + "step": 5369 + }, + { + "epoch": 8.537360890302066, + "grad_norm": 3.197348559398834, + "learning_rate": 4.9333717784956056e-05, + "loss": 0.158, + "step": 5370 + }, + { + "epoch": 8.538950715421304, + "grad_norm": 3.802283353641469, + "learning_rate": 4.933580426497202e-05, + "loss": 0.1982, + "step": 5371 + }, + { + "epoch": 8.54054054054054, + "grad_norm": 4.478837507858739, + "learning_rate": 4.933788758515629e-05, + "loss": 0.1532, + "step": 5372 + }, + { + "epoch": 8.542130365659778, + "grad_norm": 3.4600801385739004, + "learning_rate": 4.9339967744803736e-05, + "loss": 0.1335, + "step": 5373 + }, + { + "epoch": 8.543720190779014, + "grad_norm": 3.281763449744702, + "learning_rate": 4.93420447432103e-05, + "loss": 0.1904, + "step": 5374 + }, + { + "epoch": 8.54531001589825, + "grad_norm": 3.3804908270241145, + "learning_rate": 4.934411857967299e-05, + "loss": 0.1527, + "step": 5375 + }, + { + "epoch": 8.546899841017488, + "grad_norm": 3.5102358313016526, + "learning_rate": 4.9346189253489885e-05, + "loss": 0.2371, + "step": 5376 + }, + { + "epoch": 8.548489666136724, + "grad_norm": 2.8051459592225276, + "learning_rate": 4.9348256763960146e-05, + "loss": 0.1509, + "step": 5377 + }, + { + "epoch": 8.550079491255962, + "grad_norm": 2.3954601618164895, + "learning_rate": 4.935032111038399e-05, + "loss": 0.192, + "step": 5378 + }, + { + "epoch": 8.551669316375198, + "grad_norm": 3.537551720490227, + "learning_rate": 4.9352382292062716e-05, + "loss": 0.1905, + "step": 5379 + }, + { + "epoch": 8.553259141494436, + "grad_norm": 2.7329384958045218, + "learning_rate": 4.9354440308298674e-05, + "loss": 0.177, + "step": 5380 + }, + { + "epoch": 8.554848966613672, + "grad_norm": 2.734468682161299, + "learning_rate": 4.935649515839531e-05, + "loss": 0.1263, + "step": 5381 + }, + { + "epoch": 8.556438791732909, + "grad_norm": 2.6681682091860965, + "learning_rate": 4.9358546841657145e-05, + "loss": 0.2348, + "step": 5382 + }, + { + "epoch": 8.558028616852146, + "grad_norm": 2.5217726125561604, + "learning_rate": 4.936059535738973e-05, + "loss": 0.1502, + "step": 5383 + }, + { + "epoch": 8.559618441971383, + "grad_norm": 3.6494322035699645, + "learning_rate": 4.9362640704899745e-05, + "loss": 0.1303, + "step": 5384 + }, + { + "epoch": 8.56120826709062, + "grad_norm": 1.6539480199151637, + "learning_rate": 4.936468288349489e-05, + "loss": 0.1424, + "step": 5385 + }, + { + "epoch": 8.562798092209857, + "grad_norm": 3.1993332730859563, + "learning_rate": 4.9366721892483977e-05, + "loss": 0.3289, + "step": 5386 + }, + { + "epoch": 8.564387917329094, + "grad_norm": 2.570409973559718, + "learning_rate": 4.936875773117687e-05, + "loss": 0.1679, + "step": 5387 + }, + { + "epoch": 8.56597774244833, + "grad_norm": 2.3152773956564157, + "learning_rate": 4.9370790398884516e-05, + "loss": 0.1601, + "step": 5388 + }, + { + "epoch": 8.567567567567568, + "grad_norm": 2.5771613542912806, + "learning_rate": 4.937281989491892e-05, + "loss": 0.181, + "step": 5389 + }, + { + "epoch": 8.569157392686805, + "grad_norm": 7.237714807085615, + "learning_rate": 4.9374846218593176e-05, + "loss": 1.567, + "step": 5390 + }, + { + "epoch": 8.57074721780604, + "grad_norm": 1.7606775407458635, + "learning_rate": 4.937686936922145e-05, + "loss": 0.1653, + "step": 5391 + }, + { + "epoch": 8.572337042925279, + "grad_norm": 1.7126827827193933, + "learning_rate": 4.937888934611898e-05, + "loss": 0.1818, + "step": 5392 + }, + { + "epoch": 8.573926868044515, + "grad_norm": 3.268731511640149, + "learning_rate": 4.9380906148602074e-05, + "loss": 0.2183, + "step": 5393 + }, + { + "epoch": 8.575516693163753, + "grad_norm": 2.54086252678503, + "learning_rate": 4.938291977598811e-05, + "loss": 0.1719, + "step": 5394 + }, + { + "epoch": 8.577106518282989, + "grad_norm": 1.9363334553375875, + "learning_rate": 4.938493022759556e-05, + "loss": 0.1576, + "step": 5395 + }, + { + "epoch": 8.578696343402227, + "grad_norm": 3.5559670509670283, + "learning_rate": 4.938693750274395e-05, + "loss": 0.1122, + "step": 5396 + }, + { + "epoch": 8.580286168521463, + "grad_norm": 2.199565049027433, + "learning_rate": 4.9388941600753906e-05, + "loss": 0.158, + "step": 5397 + }, + { + "epoch": 8.581875993640699, + "grad_norm": 3.274174558580387, + "learning_rate": 4.939094252094709e-05, + "loss": 0.1429, + "step": 5398 + }, + { + "epoch": 8.583465818759937, + "grad_norm": 3.6001994584842247, + "learning_rate": 4.939294026264628e-05, + "loss": 0.175, + "step": 5399 + }, + { + "epoch": 8.585055643879173, + "grad_norm": 3.727261711661235, + "learning_rate": 4.9394934825175306e-05, + "loss": 0.3545, + "step": 5400 + } + ], + "logging_steps": 1, + "max_steps": 6000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1145460372226048.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}