diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14582 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2077, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00048155059290916753, + "grad_norm": 1.6082209348678589, + "learning_rate": 0.0, + "loss": 2.2043, + "step": 1 + }, + { + "epoch": 0.0009631011858183351, + "grad_norm": 2.065955400466919, + "learning_rate": 3.1746031746031746e-06, + "loss": 3.2716, + "step": 2 + }, + { + "epoch": 0.0014446517787275025, + "grad_norm": 1.0382134914398193, + "learning_rate": 6.349206349206349e-06, + "loss": 1.0446, + "step": 3 + }, + { + "epoch": 0.0019262023716366701, + "grad_norm": 1.901087760925293, + "learning_rate": 9.523809523809523e-06, + "loss": 2.1973, + "step": 4 + }, + { + "epoch": 0.0024077529645458377, + "grad_norm": 1.3971447944641113, + "learning_rate": 1.2698412698412699e-05, + "loss": 2.1493, + "step": 5 + }, + { + "epoch": 0.002889303557455005, + "grad_norm": 1.7060186862945557, + "learning_rate": 1.5873015873015872e-05, + "loss": 2.0483, + "step": 6 + }, + { + "epoch": 0.0033708541503641725, + "grad_norm": 1.3689404726028442, + "learning_rate": 1.9047619047619046e-05, + "loss": 2.4116, + "step": 7 + }, + { + "epoch": 0.0038524047432733403, + "grad_norm": 2.2580316066741943, + "learning_rate": 2.2222222222222223e-05, + "loss": 3.0569, + "step": 8 + }, + { + "epoch": 0.004333955336182508, + "grad_norm": 1.0360066890716553, + "learning_rate": 2.5396825396825397e-05, + "loss": 2.0509, + "step": 9 + }, + { + "epoch": 0.004815505929091675, + "grad_norm": 1.3235849142074585, + "learning_rate": 2.857142857142857e-05, + "loss": 1.878, + "step": 10 + }, + { + "epoch": 0.005297056522000842, + "grad_norm": 2.439751625061035, + "learning_rate": 3.1746031746031745e-05, + "loss": 2.1217, + "step": 11 + }, + { + "epoch": 0.00577860711491001, + "grad_norm": 2.174870252609253, + "learning_rate": 3.492063492063492e-05, + "loss": 1.7963, + "step": 12 + }, + { + "epoch": 0.006260157707819178, + "grad_norm": 2.6036598682403564, + "learning_rate": 3.809523809523809e-05, + "loss": 2.4504, + "step": 13 + }, + { + "epoch": 0.006741708300728345, + "grad_norm": 2.6642045974731445, + "learning_rate": 4.126984126984127e-05, + "loss": 1.3983, + "step": 14 + }, + { + "epoch": 0.007223258893637513, + "grad_norm": 1.496778964996338, + "learning_rate": 4.4444444444444447e-05, + "loss": 1.4108, + "step": 15 + }, + { + "epoch": 0.0077048094865466805, + "grad_norm": 1.440912127494812, + "learning_rate": 4.761904761904762e-05, + "loss": 2.019, + "step": 16 + }, + { + "epoch": 0.008186360079455848, + "grad_norm": 2.5316059589385986, + "learning_rate": 5.0793650793650794e-05, + "loss": 1.3808, + "step": 17 + }, + { + "epoch": 0.008667910672365015, + "grad_norm": 2.173659563064575, + "learning_rate": 5.396825396825397e-05, + "loss": 1.3812, + "step": 18 + }, + { + "epoch": 0.009149461265274182, + "grad_norm": 1.1414191722869873, + "learning_rate": 5.714285714285714e-05, + "loss": 1.4883, + "step": 19 + }, + { + "epoch": 0.00963101185818335, + "grad_norm": 3.4201762676239014, + "learning_rate": 6.0317460317460316e-05, + "loss": 1.7685, + "step": 20 + }, + { + "epoch": 0.010112562451092518, + "grad_norm": 2.068449020385742, + "learning_rate": 6.349206349206349e-05, + "loss": 1.1874, + "step": 21 + }, + { + "epoch": 0.010594113044001685, + "grad_norm": 2.3861048221588135, + "learning_rate": 6.666666666666667e-05, + "loss": 1.352, + "step": 22 + }, + { + "epoch": 0.011075663636910853, + "grad_norm": 1.0080865621566772, + "learning_rate": 6.984126984126984e-05, + "loss": 0.687, + "step": 23 + }, + { + "epoch": 0.01155721422982002, + "grad_norm": 1.1371558904647827, + "learning_rate": 7.301587301587302e-05, + "loss": 0.5372, + "step": 24 + }, + { + "epoch": 0.012038764822729187, + "grad_norm": 1.938717007637024, + "learning_rate": 7.619047619047618e-05, + "loss": 1.1799, + "step": 25 + }, + { + "epoch": 0.012520315415638356, + "grad_norm": 1.2184033393859863, + "learning_rate": 7.936507936507937e-05, + "loss": 0.5629, + "step": 26 + }, + { + "epoch": 0.013001866008547523, + "grad_norm": 1.136045217514038, + "learning_rate": 8.253968253968255e-05, + "loss": 0.6372, + "step": 27 + }, + { + "epoch": 0.01348341660145669, + "grad_norm": 1.970751404762268, + "learning_rate": 8.571428571428571e-05, + "loss": 0.6222, + "step": 28 + }, + { + "epoch": 0.013964967194365858, + "grad_norm": 2.355557441711426, + "learning_rate": 8.888888888888889e-05, + "loss": 1.2012, + "step": 29 + }, + { + "epoch": 0.014446517787275025, + "grad_norm": 1.6930681467056274, + "learning_rate": 9.206349206349206e-05, + "loss": 0.9074, + "step": 30 + }, + { + "epoch": 0.014928068380184192, + "grad_norm": 1.6450234651565552, + "learning_rate": 9.523809523809524e-05, + "loss": 0.7782, + "step": 31 + }, + { + "epoch": 0.015409618973093361, + "grad_norm": 1.2797229290008545, + "learning_rate": 9.841269841269841e-05, + "loss": 0.9376, + "step": 32 + }, + { + "epoch": 0.01589116956600253, + "grad_norm": 1.571645736694336, + "learning_rate": 0.00010158730158730159, + "loss": 0.6965, + "step": 33 + }, + { + "epoch": 0.016372720158911697, + "grad_norm": 2.3224563598632812, + "learning_rate": 0.00010476190476190477, + "loss": 0.8556, + "step": 34 + }, + { + "epoch": 0.016854270751820864, + "grad_norm": 1.4120062589645386, + "learning_rate": 0.00010793650793650794, + "loss": 0.5677, + "step": 35 + }, + { + "epoch": 0.01733582134473003, + "grad_norm": 0.676503598690033, + "learning_rate": 0.00011111111111111112, + "loss": 0.9639, + "step": 36 + }, + { + "epoch": 0.017817371937639197, + "grad_norm": 2.5169730186462402, + "learning_rate": 0.00011428571428571428, + "loss": 0.6157, + "step": 37 + }, + { + "epoch": 0.018298922530548364, + "grad_norm": 1.063428282737732, + "learning_rate": 0.00011746031746031746, + "loss": 0.7783, + "step": 38 + }, + { + "epoch": 0.018780473123457535, + "grad_norm": 1.2619324922561646, + "learning_rate": 0.00012063492063492063, + "loss": 0.4081, + "step": 39 + }, + { + "epoch": 0.0192620237163667, + "grad_norm": 1.517112135887146, + "learning_rate": 0.0001238095238095238, + "loss": 1.2443, + "step": 40 + }, + { + "epoch": 0.01974357430927587, + "grad_norm": 1.5194569826126099, + "learning_rate": 0.00012698412698412698, + "loss": 0.7078, + "step": 41 + }, + { + "epoch": 0.020225124902185036, + "grad_norm": 1.1256518363952637, + "learning_rate": 0.00013015873015873017, + "loss": 0.4796, + "step": 42 + }, + { + "epoch": 0.020706675495094203, + "grad_norm": 1.4986801147460938, + "learning_rate": 0.00013333333333333334, + "loss": 0.7092, + "step": 43 + }, + { + "epoch": 0.02118822608800337, + "grad_norm": 1.3655855655670166, + "learning_rate": 0.0001365079365079365, + "loss": 0.5856, + "step": 44 + }, + { + "epoch": 0.02166977668091254, + "grad_norm": 2.3696224689483643, + "learning_rate": 0.00013968253968253967, + "loss": 0.7981, + "step": 45 + }, + { + "epoch": 0.022151327273821707, + "grad_norm": 1.1910518407821655, + "learning_rate": 0.00014285714285714287, + "loss": 0.7553, + "step": 46 + }, + { + "epoch": 0.022632877866730874, + "grad_norm": 3.5885837078094482, + "learning_rate": 0.00014603174603174603, + "loss": 0.5839, + "step": 47 + }, + { + "epoch": 0.02311442845964004, + "grad_norm": 6.353276252746582, + "learning_rate": 0.00014920634920634923, + "loss": 0.6652, + "step": 48 + }, + { + "epoch": 0.023595979052549208, + "grad_norm": 13.263524055480957, + "learning_rate": 0.00015238095238095237, + "loss": 1.7776, + "step": 49 + }, + { + "epoch": 0.024077529645458375, + "grad_norm": 5.20112419128418, + "learning_rate": 0.00015555555555555556, + "loss": 1.7248, + "step": 50 + }, + { + "epoch": 0.024559080238367545, + "grad_norm": 38.3751335144043, + "learning_rate": 0.00015873015873015873, + "loss": 1.6096, + "step": 51 + }, + { + "epoch": 0.025040630831276712, + "grad_norm": 3.156832218170166, + "learning_rate": 0.00016190476190476192, + "loss": 1.4219, + "step": 52 + }, + { + "epoch": 0.02552218142418588, + "grad_norm": 1.7823532819747925, + "learning_rate": 0.0001650793650793651, + "loss": 0.5728, + "step": 53 + }, + { + "epoch": 0.026003732017095046, + "grad_norm": 0.5301952958106995, + "learning_rate": 0.00016825396825396826, + "loss": 0.3504, + "step": 54 + }, + { + "epoch": 0.026485282610004213, + "grad_norm": 1.6530343294143677, + "learning_rate": 0.00017142857142857143, + "loss": 1.1747, + "step": 55 + }, + { + "epoch": 0.02696683320291338, + "grad_norm": 1.4897655248641968, + "learning_rate": 0.00017460317460317462, + "loss": 0.4258, + "step": 56 + }, + { + "epoch": 0.02744838379582255, + "grad_norm": 1.5002144575119019, + "learning_rate": 0.00017777777777777779, + "loss": 0.5882, + "step": 57 + }, + { + "epoch": 0.027929934388731717, + "grad_norm": 1.3163996934890747, + "learning_rate": 0.00018095238095238095, + "loss": 0.3548, + "step": 58 + }, + { + "epoch": 0.028411484981640884, + "grad_norm": 0.955341100692749, + "learning_rate": 0.00018412698412698412, + "loss": 0.6858, + "step": 59 + }, + { + "epoch": 0.02889303557455005, + "grad_norm": 1.5476465225219727, + "learning_rate": 0.00018730158730158731, + "loss": 0.427, + "step": 60 + }, + { + "epoch": 0.029374586167459218, + "grad_norm": 0.5300901532173157, + "learning_rate": 0.00019047619047619048, + "loss": 0.4842, + "step": 61 + }, + { + "epoch": 0.029856136760368385, + "grad_norm": 1.4688984155654907, + "learning_rate": 0.00019365079365079365, + "loss": 0.5488, + "step": 62 + }, + { + "epoch": 0.030337687353277555, + "grad_norm": 3.4266722202301025, + "learning_rate": 0.00019682539682539682, + "loss": 0.5898, + "step": 63 + }, + { + "epoch": 0.030819237946186722, + "grad_norm": 1.9127459526062012, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 64 + }, + { + "epoch": 0.03130078853909589, + "grad_norm": 2.3638687133789062, + "learning_rate": 0.00019999987833918285, + "loss": 0.7013, + "step": 65 + }, + { + "epoch": 0.03178233913200506, + "grad_norm": 1.9451192617416382, + "learning_rate": 0.00019999951335702735, + "loss": 0.8187, + "step": 66 + }, + { + "epoch": 0.03226388972491422, + "grad_norm": 2.2886431217193604, + "learning_rate": 0.00019999890505442158, + "loss": 0.575, + "step": 67 + }, + { + "epoch": 0.03274544031782339, + "grad_norm": 2.9639065265655518, + "learning_rate": 0.0001999980534328457, + "loss": 0.8134, + "step": 68 + }, + { + "epoch": 0.03322699091073256, + "grad_norm": 2.0369532108306885, + "learning_rate": 0.0001999969584943719, + "loss": 0.7999, + "step": 69 + }, + { + "epoch": 0.03370854150364173, + "grad_norm": 1.9339641332626343, + "learning_rate": 0.00019999562024166438, + "loss": 0.5923, + "step": 70 + }, + { + "epoch": 0.03419009209655089, + "grad_norm": 2.312232494354248, + "learning_rate": 0.0001999940386779794, + "loss": 0.4547, + "step": 71 + }, + { + "epoch": 0.03467164268946006, + "grad_norm": 0.7201380729675293, + "learning_rate": 0.00019999221380716527, + "loss": 0.4716, + "step": 72 + }, + { + "epoch": 0.03515319328236923, + "grad_norm": 1.7096892595291138, + "learning_rate": 0.00019999014563366226, + "loss": 0.6265, + "step": 73 + }, + { + "epoch": 0.035634743875278395, + "grad_norm": 1.983033299446106, + "learning_rate": 0.00019998783416250268, + "loss": 0.7009, + "step": 74 + }, + { + "epoch": 0.036116294468187565, + "grad_norm": 1.0916883945465088, + "learning_rate": 0.0001999852793993109, + "loss": 0.8363, + "step": 75 + }, + { + "epoch": 0.03659784506109673, + "grad_norm": 0.7289116978645325, + "learning_rate": 0.00019998248135030315, + "loss": 0.597, + "step": 76 + }, + { + "epoch": 0.0370793956540059, + "grad_norm": 0.4685361981391907, + "learning_rate": 0.00019997944002228774, + "loss": 0.2757, + "step": 77 + }, + { + "epoch": 0.03756094624691507, + "grad_norm": 4.782323837280273, + "learning_rate": 0.00019997615542266482, + "loss": 0.7378, + "step": 78 + }, + { + "epoch": 0.03804249683982423, + "grad_norm": 2.023463249206543, + "learning_rate": 0.00019997262755942655, + "loss": 0.3977, + "step": 79 + }, + { + "epoch": 0.0385240474327334, + "grad_norm": 1.189867615699768, + "learning_rate": 0.000199968856441157, + "loss": 0.4471, + "step": 80 + }, + { + "epoch": 0.03900559802564257, + "grad_norm": 1.475141167640686, + "learning_rate": 0.0001999648420770321, + "loss": 0.5356, + "step": 81 + }, + { + "epoch": 0.03948714861855174, + "grad_norm": 3.8738622665405273, + "learning_rate": 0.0001999605844768197, + "loss": 0.7529, + "step": 82 + }, + { + "epoch": 0.0399686992114609, + "grad_norm": 0.7455668449401855, + "learning_rate": 0.00019995608365087946, + "loss": 0.773, + "step": 83 + }, + { + "epoch": 0.04045024980437007, + "grad_norm": 1.712019443511963, + "learning_rate": 0.0001999513396101628, + "loss": 0.5016, + "step": 84 + }, + { + "epoch": 0.04093180039727924, + "grad_norm": 0.6334553956985474, + "learning_rate": 0.00019994635236621306, + "loss": 0.2501, + "step": 85 + }, + { + "epoch": 0.041413350990188405, + "grad_norm": 1.0831007957458496, + "learning_rate": 0.00019994112193116528, + "loss": 0.7896, + "step": 86 + }, + { + "epoch": 0.041894901583097575, + "grad_norm": 1.1738227605819702, + "learning_rate": 0.00019993564831774618, + "loss": 0.7807, + "step": 87 + }, + { + "epoch": 0.04237645217600674, + "grad_norm": 2.181807279586792, + "learning_rate": 0.00019992993153927432, + "loss": 0.6244, + "step": 88 + }, + { + "epoch": 0.04285800276891591, + "grad_norm": 1.2576450109481812, + "learning_rate": 0.00019992397160965982, + "loss": 0.3363, + "step": 89 + }, + { + "epoch": 0.04333955336182508, + "grad_norm": 1.9345167875289917, + "learning_rate": 0.0001999177685434045, + "loss": 0.8769, + "step": 90 + }, + { + "epoch": 0.04382110395473424, + "grad_norm": 0.617825448513031, + "learning_rate": 0.00019991132235560176, + "loss": 0.8467, + "step": 91 + }, + { + "epoch": 0.044302654547643414, + "grad_norm": 0.9621971845626831, + "learning_rate": 0.00019990463306193652, + "loss": 0.5924, + "step": 92 + }, + { + "epoch": 0.04478420514055258, + "grad_norm": 1.4077861309051514, + "learning_rate": 0.00019989770067868533, + "loss": 0.6414, + "step": 93 + }, + { + "epoch": 0.04526575573346175, + "grad_norm": 2.031898260116577, + "learning_rate": 0.00019989052522271622, + "loss": 0.8294, + "step": 94 + }, + { + "epoch": 0.04574730632637091, + "grad_norm": 1.1067110300064087, + "learning_rate": 0.00019988310671148848, + "loss": 0.4979, + "step": 95 + }, + { + "epoch": 0.04622885691928008, + "grad_norm": 0.7193974852561951, + "learning_rate": 0.00019987544516305311, + "loss": 0.6319, + "step": 96 + }, + { + "epoch": 0.04671040751218925, + "grad_norm": 1.1549735069274902, + "learning_rate": 0.00019986754059605222, + "loss": 0.6422, + "step": 97 + }, + { + "epoch": 0.047191958105098415, + "grad_norm": 1.2274930477142334, + "learning_rate": 0.00019985939302971938, + "loss": 0.5289, + "step": 98 + }, + { + "epoch": 0.047673508698007586, + "grad_norm": 1.460633635520935, + "learning_rate": 0.00019985100248387933, + "loss": 0.7973, + "step": 99 + }, + { + "epoch": 0.04815505929091675, + "grad_norm": 0.9921919703483582, + "learning_rate": 0.00019984236897894816, + "loss": 0.3635, + "step": 100 + }, + { + "epoch": 0.04863660988382592, + "grad_norm": 1.82761812210083, + "learning_rate": 0.000199833492535933, + "loss": 0.7808, + "step": 101 + }, + { + "epoch": 0.04911816047673509, + "grad_norm": 1.0854943990707397, + "learning_rate": 0.00019982437317643217, + "loss": 0.9382, + "step": 102 + }, + { + "epoch": 0.04959971106964425, + "grad_norm": 1.0419723987579346, + "learning_rate": 0.00019981501092263503, + "loss": 0.8863, + "step": 103 + }, + { + "epoch": 0.050081261662553424, + "grad_norm": 1.2058320045471191, + "learning_rate": 0.00019980540579732196, + "loss": 0.5667, + "step": 104 + }, + { + "epoch": 0.05056281225546259, + "grad_norm": 0.5756928324699402, + "learning_rate": 0.00019979555782386434, + "loss": 0.4174, + "step": 105 + }, + { + "epoch": 0.05104436284837176, + "grad_norm": 0.6957888007164001, + "learning_rate": 0.00019978546702622443, + "loss": 0.781, + "step": 106 + }, + { + "epoch": 0.05152591344128093, + "grad_norm": 0.8713038563728333, + "learning_rate": 0.00019977513342895532, + "loss": 0.303, + "step": 107 + }, + { + "epoch": 0.05200746403419009, + "grad_norm": 1.7869960069656372, + "learning_rate": 0.00019976455705720083, + "loss": 0.7074, + "step": 108 + }, + { + "epoch": 0.05248901462709926, + "grad_norm": 1.197080373764038, + "learning_rate": 0.0001997537379366956, + "loss": 0.9249, + "step": 109 + }, + { + "epoch": 0.052970565220008425, + "grad_norm": 0.929421603679657, + "learning_rate": 0.00019974267609376494, + "loss": 1.0453, + "step": 110 + }, + { + "epoch": 0.053452115812917596, + "grad_norm": 1.4272781610488892, + "learning_rate": 0.00019973137155532462, + "loss": 0.3844, + "step": 111 + }, + { + "epoch": 0.05393366640582676, + "grad_norm": 0.9156218767166138, + "learning_rate": 0.00019971982434888107, + "loss": 0.6791, + "step": 112 + }, + { + "epoch": 0.05441521699873593, + "grad_norm": 0.9151055216789246, + "learning_rate": 0.00019970803450253114, + "loss": 0.7201, + "step": 113 + }, + { + "epoch": 0.0548967675916451, + "grad_norm": 1.2295348644256592, + "learning_rate": 0.0001996960020449621, + "loss": 0.6156, + "step": 114 + }, + { + "epoch": 0.055378318184554264, + "grad_norm": 0.9207524061203003, + "learning_rate": 0.00019968372700545145, + "loss": 0.6367, + "step": 115 + }, + { + "epoch": 0.055859868777463434, + "grad_norm": 2.107301712036133, + "learning_rate": 0.00019967120941386709, + "loss": 0.5326, + "step": 116 + }, + { + "epoch": 0.0563414193703726, + "grad_norm": 1.0059994459152222, + "learning_rate": 0.000199658449300667, + "loss": 0.2453, + "step": 117 + }, + { + "epoch": 0.05682296996328177, + "grad_norm": 1.5769124031066895, + "learning_rate": 0.00019964544669689928, + "loss": 1.2558, + "step": 118 + }, + { + "epoch": 0.05730452055619094, + "grad_norm": 1.3476357460021973, + "learning_rate": 0.00019963220163420214, + "loss": 0.8043, + "step": 119 + }, + { + "epoch": 0.0577860711491001, + "grad_norm": 0.693128228187561, + "learning_rate": 0.0001996187141448036, + "loss": 0.3331, + "step": 120 + }, + { + "epoch": 0.05826762174200927, + "grad_norm": 0.6747751235961914, + "learning_rate": 0.0001996049842615217, + "loss": 0.2073, + "step": 121 + }, + { + "epoch": 0.058749172334918436, + "grad_norm": 1.0784077644348145, + "learning_rate": 0.0001995910120177642, + "loss": 0.6549, + "step": 122 + }, + { + "epoch": 0.059230722927827606, + "grad_norm": 1.5669617652893066, + "learning_rate": 0.00019957679744752859, + "loss": 0.6954, + "step": 123 + }, + { + "epoch": 0.05971227352073677, + "grad_norm": 1.202558159828186, + "learning_rate": 0.00019956234058540195, + "loss": 0.4939, + "step": 124 + }, + { + "epoch": 0.06019382411364594, + "grad_norm": 2.229548692703247, + "learning_rate": 0.00019954764146656105, + "loss": 0.7145, + "step": 125 + }, + { + "epoch": 0.06067537470655511, + "grad_norm": 0.6061960458755493, + "learning_rate": 0.00019953270012677195, + "loss": 0.2894, + "step": 126 + }, + { + "epoch": 0.061156925299464274, + "grad_norm": 1.0359543561935425, + "learning_rate": 0.00019951751660239015, + "loss": 0.4198, + "step": 127 + }, + { + "epoch": 0.061638475892373444, + "grad_norm": 1.0313760042190552, + "learning_rate": 0.00019950209093036052, + "loss": 0.5328, + "step": 128 + }, + { + "epoch": 0.06212002648528261, + "grad_norm": 0.5311338901519775, + "learning_rate": 0.000199486423148217, + "loss": 0.3115, + "step": 129 + }, + { + "epoch": 0.06260157707819178, + "grad_norm": 1.4769734144210815, + "learning_rate": 0.00019947051329408276, + "loss": 0.4816, + "step": 130 + }, + { + "epoch": 0.06308312767110094, + "grad_norm": 0.6150282025337219, + "learning_rate": 0.00019945436140666981, + "loss": 0.8294, + "step": 131 + }, + { + "epoch": 0.06356467826401012, + "grad_norm": 1.398701786994934, + "learning_rate": 0.0001994379675252793, + "loss": 0.7204, + "step": 132 + }, + { + "epoch": 0.06404622885691928, + "grad_norm": 1.5035239458084106, + "learning_rate": 0.00019942133168980103, + "loss": 1.2134, + "step": 133 + }, + { + "epoch": 0.06452777944982845, + "grad_norm": 2.2912328243255615, + "learning_rate": 0.00019940445394071355, + "loss": 0.7721, + "step": 134 + }, + { + "epoch": 0.06500933004273761, + "grad_norm": 0.7829728126525879, + "learning_rate": 0.0001993873343190842, + "loss": 0.4178, + "step": 135 + }, + { + "epoch": 0.06549088063564679, + "grad_norm": 0.8886811137199402, + "learning_rate": 0.00019936997286656855, + "loss": 0.7168, + "step": 136 + }, + { + "epoch": 0.06597243122855595, + "grad_norm": 1.1705811023712158, + "learning_rate": 0.00019935236962541092, + "loss": 0.7957, + "step": 137 + }, + { + "epoch": 0.06645398182146511, + "grad_norm": 1.3834999799728394, + "learning_rate": 0.00019933452463844376, + "loss": 0.7548, + "step": 138 + }, + { + "epoch": 0.06693553241437429, + "grad_norm": 1.5906621217727661, + "learning_rate": 0.00019931643794908772, + "loss": 0.3922, + "step": 139 + }, + { + "epoch": 0.06741708300728345, + "grad_norm": 1.0307190418243408, + "learning_rate": 0.00019929810960135172, + "loss": 0.7885, + "step": 140 + }, + { + "epoch": 0.06789863360019262, + "grad_norm": 1.4867689609527588, + "learning_rate": 0.00019927953963983254, + "loss": 0.6464, + "step": 141 + }, + { + "epoch": 0.06838018419310178, + "grad_norm": 0.768412709236145, + "learning_rate": 0.00019926072810971492, + "loss": 0.5696, + "step": 142 + }, + { + "epoch": 0.06886173478601096, + "grad_norm": 0.8257545232772827, + "learning_rate": 0.00019924167505677137, + "loss": 0.9002, + "step": 143 + }, + { + "epoch": 0.06934328537892012, + "grad_norm": 1.1213666200637817, + "learning_rate": 0.00019922238052736215, + "loss": 0.9769, + "step": 144 + }, + { + "epoch": 0.06982483597182929, + "grad_norm": 1.8654189109802246, + "learning_rate": 0.00019920284456843498, + "loss": 0.9095, + "step": 145 + }, + { + "epoch": 0.07030638656473846, + "grad_norm": 2.3394224643707275, + "learning_rate": 0.00019918306722752505, + "loss": 0.5928, + "step": 146 + }, + { + "epoch": 0.07078793715764763, + "grad_norm": 2.084287405014038, + "learning_rate": 0.00019916304855275497, + "loss": 0.9175, + "step": 147 + }, + { + "epoch": 0.07126948775055679, + "grad_norm": 0.9744839668273926, + "learning_rate": 0.00019914278859283445, + "loss": 0.4434, + "step": 148 + }, + { + "epoch": 0.07175103834346597, + "grad_norm": 1.0905654430389404, + "learning_rate": 0.0001991222873970604, + "loss": 0.2876, + "step": 149 + }, + { + "epoch": 0.07223258893637513, + "grad_norm": 1.0302903652191162, + "learning_rate": 0.00019910154501531663, + "loss": 0.2347, + "step": 150 + }, + { + "epoch": 0.0727141395292843, + "grad_norm": 0.5432575941085815, + "learning_rate": 0.0001990805614980739, + "loss": 0.3851, + "step": 151 + }, + { + "epoch": 0.07319569012219346, + "grad_norm": 1.2816241979599, + "learning_rate": 0.00019905933689638955, + "loss": 0.7427, + "step": 152 + }, + { + "epoch": 0.07367724071510263, + "grad_norm": 0.5936122536659241, + "learning_rate": 0.00019903787126190772, + "loss": 0.3737, + "step": 153 + }, + { + "epoch": 0.0741587913080118, + "grad_norm": 0.7271316051483154, + "learning_rate": 0.00019901616464685888, + "loss": 0.5096, + "step": 154 + }, + { + "epoch": 0.07464034190092096, + "grad_norm": 1.7161284685134888, + "learning_rate": 0.00019899421710405996, + "loss": 0.6315, + "step": 155 + }, + { + "epoch": 0.07512189249383014, + "grad_norm": 2.101757764816284, + "learning_rate": 0.00019897202868691407, + "loss": 0.3086, + "step": 156 + }, + { + "epoch": 0.0756034430867393, + "grad_norm": 0.38321051001548767, + "learning_rate": 0.00019894959944941038, + "loss": 0.3113, + "step": 157 + }, + { + "epoch": 0.07608499367964847, + "grad_norm": 1.4041216373443604, + "learning_rate": 0.0001989269294461242, + "loss": 0.5961, + "step": 158 + }, + { + "epoch": 0.07656654427255763, + "grad_norm": 1.1330289840698242, + "learning_rate": 0.0001989040187322164, + "loss": 0.7356, + "step": 159 + }, + { + "epoch": 0.0770480948654668, + "grad_norm": 0.9022401571273804, + "learning_rate": 0.00019888086736343384, + "loss": 0.7259, + "step": 160 + }, + { + "epoch": 0.07752964545837597, + "grad_norm": 1.270098328590393, + "learning_rate": 0.0001988574753961087, + "loss": 0.3357, + "step": 161 + }, + { + "epoch": 0.07801119605128513, + "grad_norm": 1.9385939836502075, + "learning_rate": 0.00019883384288715874, + "loss": 0.7557, + "step": 162 + }, + { + "epoch": 0.07849274664419431, + "grad_norm": 0.8720075488090515, + "learning_rate": 0.000198809969894087, + "loss": 0.6153, + "step": 163 + }, + { + "epoch": 0.07897429723710347, + "grad_norm": 0.9843493103981018, + "learning_rate": 0.0001987858564749816, + "loss": 0.9762, + "step": 164 + }, + { + "epoch": 0.07945584783001264, + "grad_norm": 1.0602184534072876, + "learning_rate": 0.00019876150268851572, + "loss": 0.4329, + "step": 165 + }, + { + "epoch": 0.0799373984229218, + "grad_norm": 1.434399962425232, + "learning_rate": 0.00019873690859394737, + "loss": 0.5577, + "step": 166 + }, + { + "epoch": 0.08041894901583098, + "grad_norm": 1.0275940895080566, + "learning_rate": 0.0001987120742511193, + "loss": 0.5025, + "step": 167 + }, + { + "epoch": 0.08090049960874014, + "grad_norm": 0.7427517771720886, + "learning_rate": 0.0001986869997204589, + "loss": 0.1304, + "step": 168 + }, + { + "epoch": 0.0813820502016493, + "grad_norm": 0.7359098196029663, + "learning_rate": 0.00019866168506297788, + "loss": 0.6375, + "step": 169 + }, + { + "epoch": 0.08186360079455848, + "grad_norm": 0.8443692922592163, + "learning_rate": 0.00019863613034027224, + "loss": 0.2801, + "step": 170 + }, + { + "epoch": 0.08234515138746765, + "grad_norm": 1.6165578365325928, + "learning_rate": 0.00019861033561452223, + "loss": 0.733, + "step": 171 + }, + { + "epoch": 0.08282670198037681, + "grad_norm": 1.2908574342727661, + "learning_rate": 0.00019858430094849195, + "loss": 0.6496, + "step": 172 + }, + { + "epoch": 0.08330825257328599, + "grad_norm": 0.9643251299858093, + "learning_rate": 0.0001985580264055294, + "loss": 0.2752, + "step": 173 + }, + { + "epoch": 0.08378980316619515, + "grad_norm": 1.3496614694595337, + "learning_rate": 0.00019853151204956616, + "loss": 0.455, + "step": 174 + }, + { + "epoch": 0.08427135375910431, + "grad_norm": 1.5207899808883667, + "learning_rate": 0.00019850475794511749, + "loss": 0.4059, + "step": 175 + }, + { + "epoch": 0.08475290435201348, + "grad_norm": 0.9615983963012695, + "learning_rate": 0.00019847776415728185, + "loss": 0.362, + "step": 176 + }, + { + "epoch": 0.08523445494492266, + "grad_norm": 1.003831148147583, + "learning_rate": 0.000198450530751741, + "loss": 0.9694, + "step": 177 + }, + { + "epoch": 0.08571600553783182, + "grad_norm": 1.0050816535949707, + "learning_rate": 0.00019842305779475968, + "loss": 0.7572, + "step": 178 + }, + { + "epoch": 0.08619755613074098, + "grad_norm": 1.1458672285079956, + "learning_rate": 0.00019839534535318558, + "loss": 0.7746, + "step": 179 + }, + { + "epoch": 0.08667910672365016, + "grad_norm": 1.8988335132598877, + "learning_rate": 0.00019836739349444899, + "loss": 0.3546, + "step": 180 + }, + { + "epoch": 0.08716065731655932, + "grad_norm": 1.2725721597671509, + "learning_rate": 0.00019833920228656292, + "loss": 0.8146, + "step": 181 + }, + { + "epoch": 0.08764220790946849, + "grad_norm": 0.7741675972938538, + "learning_rate": 0.0001983107717981226, + "loss": 0.4781, + "step": 182 + }, + { + "epoch": 0.08812375850237765, + "grad_norm": 0.8907474279403687, + "learning_rate": 0.00019828210209830562, + "loss": 0.4843, + "step": 183 + }, + { + "epoch": 0.08860530909528683, + "grad_norm": 0.5008806586265564, + "learning_rate": 0.00019825319325687154, + "loss": 0.5877, + "step": 184 + }, + { + "epoch": 0.08908685968819599, + "grad_norm": 1.6996382474899292, + "learning_rate": 0.00019822404534416182, + "loss": 0.683, + "step": 185 + }, + { + "epoch": 0.08956841028110515, + "grad_norm": 1.2283899784088135, + "learning_rate": 0.00019819465843109963, + "loss": 0.6434, + "step": 186 + }, + { + "epoch": 0.09004996087401433, + "grad_norm": 0.6681622266769409, + "learning_rate": 0.00019816503258918969, + "loss": 0.1654, + "step": 187 + }, + { + "epoch": 0.0905315114669235, + "grad_norm": 0.8348255157470703, + "learning_rate": 0.00019813516789051808, + "loss": 0.6038, + "step": 188 + }, + { + "epoch": 0.09101306205983266, + "grad_norm": 1.049277424812317, + "learning_rate": 0.0001981050644077521, + "loss": 0.7738, + "step": 189 + }, + { + "epoch": 0.09149461265274182, + "grad_norm": 0.7587938904762268, + "learning_rate": 0.00019807472221414002, + "loss": 0.4791, + "step": 190 + }, + { + "epoch": 0.091976163245651, + "grad_norm": 0.8252549767494202, + "learning_rate": 0.00019804414138351094, + "loss": 0.6476, + "step": 191 + }, + { + "epoch": 0.09245771383856016, + "grad_norm": 2.6498475074768066, + "learning_rate": 0.00019801332199027467, + "loss": 0.9018, + "step": 192 + }, + { + "epoch": 0.09293926443146933, + "grad_norm": 1.3425184488296509, + "learning_rate": 0.00019798226410942146, + "loss": 0.6266, + "step": 193 + }, + { + "epoch": 0.0934208150243785, + "grad_norm": 1.1814745664596558, + "learning_rate": 0.00019795096781652182, + "loss": 0.8622, + "step": 194 + }, + { + "epoch": 0.09390236561728767, + "grad_norm": 0.6430162787437439, + "learning_rate": 0.00019791943318772643, + "loss": 0.7153, + "step": 195 + }, + { + "epoch": 0.09438391621019683, + "grad_norm": 0.6195939183235168, + "learning_rate": 0.00019788766029976587, + "loss": 0.5441, + "step": 196 + }, + { + "epoch": 0.09486546680310601, + "grad_norm": 1.0443035364151, + "learning_rate": 0.0001978556492299504, + "loss": 0.6088, + "step": 197 + }, + { + "epoch": 0.09534701739601517, + "grad_norm": 0.6970458626747131, + "learning_rate": 0.00019782340005616996, + "loss": 0.4531, + "step": 198 + }, + { + "epoch": 0.09582856798892433, + "grad_norm": 0.8405738472938538, + "learning_rate": 0.0001977909128568937, + "loss": 0.3627, + "step": 199 + }, + { + "epoch": 0.0963101185818335, + "grad_norm": 1.174099326133728, + "learning_rate": 0.00019775818771117, + "loss": 0.5563, + "step": 200 + }, + { + "epoch": 0.09679166917474268, + "grad_norm": 1.4284452199935913, + "learning_rate": 0.00019772522469862626, + "loss": 0.258, + "step": 201 + }, + { + "epoch": 0.09727321976765184, + "grad_norm": 2.1673169136047363, + "learning_rate": 0.00019769202389946863, + "loss": 0.4829, + "step": 202 + }, + { + "epoch": 0.097754770360561, + "grad_norm": 1.8256748914718628, + "learning_rate": 0.0001976585853944818, + "loss": 0.774, + "step": 203 + }, + { + "epoch": 0.09823632095347018, + "grad_norm": 0.7601093649864197, + "learning_rate": 0.0001976249092650289, + "loss": 0.1981, + "step": 204 + }, + { + "epoch": 0.09871787154637934, + "grad_norm": 0.2299424111843109, + "learning_rate": 0.00019759099559305124, + "loss": 0.198, + "step": 205 + }, + { + "epoch": 0.0991994221392885, + "grad_norm": 1.2616881132125854, + "learning_rate": 0.00019755684446106812, + "loss": 0.4902, + "step": 206 + }, + { + "epoch": 0.09968097273219767, + "grad_norm": 1.1655137538909912, + "learning_rate": 0.00019752245595217662, + "loss": 1.0278, + "step": 207 + }, + { + "epoch": 0.10016252332510685, + "grad_norm": 1.1448935270309448, + "learning_rate": 0.00019748783015005144, + "loss": 0.5074, + "step": 208 + }, + { + "epoch": 0.10064407391801601, + "grad_norm": 1.745911955833435, + "learning_rate": 0.00019745296713894465, + "loss": 0.2373, + "step": 209 + }, + { + "epoch": 0.10112562451092517, + "grad_norm": 0.8816413283348083, + "learning_rate": 0.00019741786700368548, + "loss": 0.2905, + "step": 210 + }, + { + "epoch": 0.10160717510383435, + "grad_norm": 0.8495388627052307, + "learning_rate": 0.00019738252982968017, + "loss": 0.5207, + "step": 211 + }, + { + "epoch": 0.10208872569674352, + "grad_norm": 0.4556237757205963, + "learning_rate": 0.00019734695570291168, + "loss": 0.3283, + "step": 212 + }, + { + "epoch": 0.10257027628965268, + "grad_norm": 2.6945221424102783, + "learning_rate": 0.00019731114470993962, + "loss": 0.6865, + "step": 213 + }, + { + "epoch": 0.10305182688256186, + "grad_norm": 1.1560450792312622, + "learning_rate": 0.0001972750969378998, + "loss": 0.6455, + "step": 214 + }, + { + "epoch": 0.10353337747547102, + "grad_norm": 0.6902031302452087, + "learning_rate": 0.00019723881247450434, + "loss": 0.3412, + "step": 215 + }, + { + "epoch": 0.10401492806838018, + "grad_norm": 0.6760352849960327, + "learning_rate": 0.0001972022914080411, + "loss": 0.8517, + "step": 216 + }, + { + "epoch": 0.10449647866128935, + "grad_norm": 1.7814079523086548, + "learning_rate": 0.00019716553382737379, + "loss": 0.3828, + "step": 217 + }, + { + "epoch": 0.10497802925419852, + "grad_norm": 1.5083913803100586, + "learning_rate": 0.00019712853982194152, + "loss": 0.7554, + "step": 218 + }, + { + "epoch": 0.10545957984710769, + "grad_norm": 1.2533457279205322, + "learning_rate": 0.00019709130948175876, + "loss": 0.7443, + "step": 219 + }, + { + "epoch": 0.10594113044001685, + "grad_norm": 0.36257970333099365, + "learning_rate": 0.0001970538428974149, + "loss": 0.282, + "step": 220 + }, + { + "epoch": 0.10642268103292603, + "grad_norm": 0.778958261013031, + "learning_rate": 0.00019701614016007436, + "loss": 0.7246, + "step": 221 + }, + { + "epoch": 0.10690423162583519, + "grad_norm": 0.696329653263092, + "learning_rate": 0.00019697820136147597, + "loss": 0.3454, + "step": 222 + }, + { + "epoch": 0.10738578221874436, + "grad_norm": 0.7349855303764343, + "learning_rate": 0.00019694002659393305, + "loss": 0.5118, + "step": 223 + }, + { + "epoch": 0.10786733281165352, + "grad_norm": 1.055173397064209, + "learning_rate": 0.0001969016159503331, + "loss": 0.7026, + "step": 224 + }, + { + "epoch": 0.1083488834045627, + "grad_norm": 0.6218364834785461, + "learning_rate": 0.00019686296952413747, + "loss": 0.3425, + "step": 225 + }, + { + "epoch": 0.10883043399747186, + "grad_norm": 1.393835425376892, + "learning_rate": 0.0001968240874093813, + "loss": 0.4741, + "step": 226 + }, + { + "epoch": 0.10931198459038102, + "grad_norm": 1.045833706855774, + "learning_rate": 0.00019678496970067325, + "loss": 0.4047, + "step": 227 + }, + { + "epoch": 0.1097935351832902, + "grad_norm": 0.523831844329834, + "learning_rate": 0.0001967456164931951, + "loss": 0.1359, + "step": 228 + }, + { + "epoch": 0.11027508577619936, + "grad_norm": 1.5808912515640259, + "learning_rate": 0.0001967060278827017, + "loss": 0.5815, + "step": 229 + }, + { + "epoch": 0.11075663636910853, + "grad_norm": 2.3675012588500977, + "learning_rate": 0.00019666620396552076, + "loss": 0.5516, + "step": 230 + }, + { + "epoch": 0.11123818696201769, + "grad_norm": 1.9838608503341675, + "learning_rate": 0.00019662614483855246, + "loss": 1.0456, + "step": 231 + }, + { + "epoch": 0.11171973755492687, + "grad_norm": 1.4750068187713623, + "learning_rate": 0.00019658585059926934, + "loss": 0.2397, + "step": 232 + }, + { + "epoch": 0.11220128814783603, + "grad_norm": 1.1223431825637817, + "learning_rate": 0.00019654532134571594, + "loss": 0.4388, + "step": 233 + }, + { + "epoch": 0.1126828387407452, + "grad_norm": 0.9283247590065002, + "learning_rate": 0.00019650455717650878, + "loss": 0.4922, + "step": 234 + }, + { + "epoch": 0.11316438933365437, + "grad_norm": 1.2402900457382202, + "learning_rate": 0.00019646355819083589, + "loss": 0.3975, + "step": 235 + }, + { + "epoch": 0.11364593992656354, + "grad_norm": 1.8718070983886719, + "learning_rate": 0.0001964223244884566, + "loss": 0.4683, + "step": 236 + }, + { + "epoch": 0.1141274905194727, + "grad_norm": 3.2220866680145264, + "learning_rate": 0.00019638085616970153, + "loss": 0.5209, + "step": 237 + }, + { + "epoch": 0.11460904111238188, + "grad_norm": 0.9346310496330261, + "learning_rate": 0.00019633915333547202, + "loss": 0.449, + "step": 238 + }, + { + "epoch": 0.11509059170529104, + "grad_norm": 2.3757474422454834, + "learning_rate": 0.00019629721608724004, + "loss": 0.7904, + "step": 239 + }, + { + "epoch": 0.1155721422982002, + "grad_norm": 1.2270057201385498, + "learning_rate": 0.0001962550445270481, + "loss": 0.5315, + "step": 240 + }, + { + "epoch": 0.11605369289110937, + "grad_norm": 1.180861234664917, + "learning_rate": 0.00019621263875750864, + "loss": 0.6519, + "step": 241 + }, + { + "epoch": 0.11653524348401854, + "grad_norm": 1.6197255849838257, + "learning_rate": 0.00019616999888180406, + "loss": 0.4915, + "step": 242 + }, + { + "epoch": 0.11701679407692771, + "grad_norm": 1.404569149017334, + "learning_rate": 0.0001961271250036865, + "loss": 0.8747, + "step": 243 + }, + { + "epoch": 0.11749834466983687, + "grad_norm": 1.1616580486297607, + "learning_rate": 0.0001960840172274773, + "loss": 0.3718, + "step": 244 + }, + { + "epoch": 0.11797989526274605, + "grad_norm": 0.897540271282196, + "learning_rate": 0.00019604067565806704, + "loss": 0.4625, + "step": 245 + }, + { + "epoch": 0.11846144585565521, + "grad_norm": 0.8197163939476013, + "learning_rate": 0.00019599710040091512, + "loss": 0.6373, + "step": 246 + }, + { + "epoch": 0.11894299644856438, + "grad_norm": 1.3133628368377686, + "learning_rate": 0.00019595329156204955, + "loss": 0.407, + "step": 247 + }, + { + "epoch": 0.11942454704147354, + "grad_norm": 1.0022404193878174, + "learning_rate": 0.00019590924924806676, + "loss": 0.2518, + "step": 248 + }, + { + "epoch": 0.11990609763438272, + "grad_norm": 0.5850613117218018, + "learning_rate": 0.0001958649735661312, + "loss": 0.7003, + "step": 249 + }, + { + "epoch": 0.12038764822729188, + "grad_norm": 0.6422004103660583, + "learning_rate": 0.00019582046462397515, + "loss": 0.4396, + "step": 250 + }, + { + "epoch": 0.12086919882020104, + "grad_norm": 0.7090483903884888, + "learning_rate": 0.00019577572252989854, + "loss": 0.4338, + "step": 251 + }, + { + "epoch": 0.12135074941311022, + "grad_norm": 1.4265540838241577, + "learning_rate": 0.00019573074739276858, + "loss": 0.7626, + "step": 252 + }, + { + "epoch": 0.12183230000601938, + "grad_norm": 0.5266678929328918, + "learning_rate": 0.00019568553932201947, + "loss": 0.2188, + "step": 253 + }, + { + "epoch": 0.12231385059892855, + "grad_norm": 2.281163215637207, + "learning_rate": 0.00019564009842765225, + "loss": 0.7731, + "step": 254 + }, + { + "epoch": 0.12279540119183771, + "grad_norm": 0.9748148322105408, + "learning_rate": 0.00019559442482023444, + "loss": 0.6383, + "step": 255 + }, + { + "epoch": 0.12327695178474689, + "grad_norm": 1.420639991760254, + "learning_rate": 0.0001955485186108998, + "loss": 0.5566, + "step": 256 + }, + { + "epoch": 0.12375850237765605, + "grad_norm": 0.7916795015335083, + "learning_rate": 0.00019550237991134805, + "loss": 0.4454, + "step": 257 + }, + { + "epoch": 0.12424005297056522, + "grad_norm": 3.603853225708008, + "learning_rate": 0.00019545600883384467, + "loss": 0.9742, + "step": 258 + }, + { + "epoch": 0.12472160356347439, + "grad_norm": 0.5654540061950684, + "learning_rate": 0.0001954094054912205, + "loss": 0.3091, + "step": 259 + }, + { + "epoch": 0.12520315415638356, + "grad_norm": 0.768267810344696, + "learning_rate": 0.00019536256999687157, + "loss": 0.2655, + "step": 260 + }, + { + "epoch": 0.12568470474929272, + "grad_norm": 0.9615038633346558, + "learning_rate": 0.00019531550246475876, + "loss": 0.3559, + "step": 261 + }, + { + "epoch": 0.12616625534220188, + "grad_norm": 1.5995615720748901, + "learning_rate": 0.00019526820300940756, + "loss": 0.8617, + "step": 262 + }, + { + "epoch": 0.12664780593511105, + "grad_norm": 0.7149393558502197, + "learning_rate": 0.00019522067174590778, + "loss": 0.4823, + "step": 263 + }, + { + "epoch": 0.12712935652802024, + "grad_norm": 0.7033841013908386, + "learning_rate": 0.00019517290878991324, + "loss": 0.4102, + "step": 264 + }, + { + "epoch": 0.1276109071209294, + "grad_norm": 0.5579375624656677, + "learning_rate": 0.0001951249142576416, + "loss": 0.9872, + "step": 265 + }, + { + "epoch": 0.12809245771383856, + "grad_norm": 0.7575063109397888, + "learning_rate": 0.00019507668826587387, + "loss": 0.6555, + "step": 266 + }, + { + "epoch": 0.12857400830674773, + "grad_norm": 1.1746582984924316, + "learning_rate": 0.0001950282309319544, + "loss": 0.9092, + "step": 267 + }, + { + "epoch": 0.1290555588996569, + "grad_norm": 1.4212498664855957, + "learning_rate": 0.0001949795423737903, + "loss": 0.7884, + "step": 268 + }, + { + "epoch": 0.12953710949256605, + "grad_norm": 1.8139575719833374, + "learning_rate": 0.00019493062270985144, + "loss": 0.6075, + "step": 269 + }, + { + "epoch": 0.13001866008547522, + "grad_norm": 0.8346084356307983, + "learning_rate": 0.00019488147205916985, + "loss": 0.4508, + "step": 270 + }, + { + "epoch": 0.1305002106783844, + "grad_norm": 0.9755773544311523, + "learning_rate": 0.00019483209054133976, + "loss": 0.4172, + "step": 271 + }, + { + "epoch": 0.13098176127129357, + "grad_norm": 5.871254920959473, + "learning_rate": 0.00019478247827651708, + "loss": 0.5855, + "step": 272 + }, + { + "epoch": 0.13146331186420274, + "grad_norm": 1.8749113082885742, + "learning_rate": 0.00019473263538541914, + "loss": 0.3368, + "step": 273 + }, + { + "epoch": 0.1319448624571119, + "grad_norm": 1.1247987747192383, + "learning_rate": 0.00019468256198932455, + "loss": 0.3617, + "step": 274 + }, + { + "epoch": 0.13242641305002106, + "grad_norm": 1.684675931930542, + "learning_rate": 0.00019463225821007268, + "loss": 0.9457, + "step": 275 + }, + { + "epoch": 0.13290796364293023, + "grad_norm": 0.854993999004364, + "learning_rate": 0.00019458172417006347, + "loss": 0.5827, + "step": 276 + }, + { + "epoch": 0.1333895142358394, + "grad_norm": 0.7931753396987915, + "learning_rate": 0.00019453095999225726, + "loss": 0.327, + "step": 277 + }, + { + "epoch": 0.13387106482874858, + "grad_norm": 0.6387701630592346, + "learning_rate": 0.0001944799658001742, + "loss": 0.3639, + "step": 278 + }, + { + "epoch": 0.13435261542165775, + "grad_norm": 0.8416104316711426, + "learning_rate": 0.00019442874171789418, + "loss": 0.485, + "step": 279 + }, + { + "epoch": 0.1348341660145669, + "grad_norm": 0.7823607921600342, + "learning_rate": 0.00019437728787005657, + "loss": 1.0629, + "step": 280 + }, + { + "epoch": 0.13531571660747607, + "grad_norm": 1.0796170234680176, + "learning_rate": 0.00019432560438185963, + "loss": 0.4245, + "step": 281 + }, + { + "epoch": 0.13579726720038524, + "grad_norm": 0.9896343946456909, + "learning_rate": 0.00019427369137906046, + "loss": 0.6039, + "step": 282 + }, + { + "epoch": 0.1362788177932944, + "grad_norm": 1.1208000183105469, + "learning_rate": 0.00019422154898797472, + "loss": 0.5287, + "step": 283 + }, + { + "epoch": 0.13676036838620356, + "grad_norm": 1.0062915086746216, + "learning_rate": 0.00019416917733547603, + "loss": 0.9135, + "step": 284 + }, + { + "epoch": 0.13724191897911275, + "grad_norm": 0.8195498585700989, + "learning_rate": 0.00019411657654899597, + "loss": 0.415, + "step": 285 + }, + { + "epoch": 0.13772346957202192, + "grad_norm": 1.4673748016357422, + "learning_rate": 0.0001940637467565237, + "loss": 0.5725, + "step": 286 + }, + { + "epoch": 0.13820502016493108, + "grad_norm": 0.8479679226875305, + "learning_rate": 0.00019401068808660546, + "loss": 0.5889, + "step": 287 + }, + { + "epoch": 0.13868657075784024, + "grad_norm": 0.5894740223884583, + "learning_rate": 0.0001939574006683445, + "loss": 0.5575, + "step": 288 + }, + { + "epoch": 0.1391681213507494, + "grad_norm": 0.7258508801460266, + "learning_rate": 0.00019390388463140065, + "loss": 0.2575, + "step": 289 + }, + { + "epoch": 0.13964967194365857, + "grad_norm": 1.139673113822937, + "learning_rate": 0.00019385014010598998, + "loss": 0.5806, + "step": 290 + }, + { + "epoch": 0.14013122253656773, + "grad_norm": 1.11739182472229, + "learning_rate": 0.00019379616722288456, + "loss": 0.4772, + "step": 291 + }, + { + "epoch": 0.14061277312947693, + "grad_norm": 0.9252748489379883, + "learning_rate": 0.0001937419661134121, + "loss": 0.667, + "step": 292 + }, + { + "epoch": 0.1410943237223861, + "grad_norm": 1.4080513715744019, + "learning_rate": 0.0001936875369094556, + "loss": 0.3355, + "step": 293 + }, + { + "epoch": 0.14157587431529525, + "grad_norm": 1.2333333492279053, + "learning_rate": 0.0001936328797434531, + "loss": 0.4218, + "step": 294 + }, + { + "epoch": 0.14205742490820442, + "grad_norm": 0.7450026869773865, + "learning_rate": 0.00019357799474839735, + "loss": 0.7525, + "step": 295 + }, + { + "epoch": 0.14253897550111358, + "grad_norm": 0.968317449092865, + "learning_rate": 0.00019352288205783536, + "loss": 0.3696, + "step": 296 + }, + { + "epoch": 0.14302052609402274, + "grad_norm": 0.6425029635429382, + "learning_rate": 0.00019346754180586825, + "loss": 0.2851, + "step": 297 + }, + { + "epoch": 0.14350207668693193, + "grad_norm": 0.7946616411209106, + "learning_rate": 0.00019341197412715082, + "loss": 0.5195, + "step": 298 + }, + { + "epoch": 0.1439836272798411, + "grad_norm": 0.5409590005874634, + "learning_rate": 0.00019335617915689128, + "loss": 0.2594, + "step": 299 + }, + { + "epoch": 0.14446517787275026, + "grad_norm": 0.7136370539665222, + "learning_rate": 0.00019330015703085082, + "loss": 0.31, + "step": 300 + }, + { + "epoch": 0.14494672846565942, + "grad_norm": 1.1085537672042847, + "learning_rate": 0.00019324390788534343, + "loss": 0.3294, + "step": 301 + }, + { + "epoch": 0.1454282790585686, + "grad_norm": 1.6439099311828613, + "learning_rate": 0.00019318743185723546, + "loss": 0.5998, + "step": 302 + }, + { + "epoch": 0.14590982965147775, + "grad_norm": 1.0844823122024536, + "learning_rate": 0.00019313072908394525, + "loss": 0.7925, + "step": 303 + }, + { + "epoch": 0.14639138024438691, + "grad_norm": 1.0109717845916748, + "learning_rate": 0.00019307379970344294, + "loss": 0.4869, + "step": 304 + }, + { + "epoch": 0.1468729308372961, + "grad_norm": 0.5586534142494202, + "learning_rate": 0.00019301664385425004, + "loss": 0.4611, + "step": 305 + }, + { + "epoch": 0.14735448143020527, + "grad_norm": 0.6117091178894043, + "learning_rate": 0.0001929592616754391, + "loss": 0.3099, + "step": 306 + }, + { + "epoch": 0.14783603202311443, + "grad_norm": 0.5635220408439636, + "learning_rate": 0.00019290165330663336, + "loss": 0.7553, + "step": 307 + }, + { + "epoch": 0.1483175826160236, + "grad_norm": 1.1754608154296875, + "learning_rate": 0.00019284381888800647, + "loss": 0.5363, + "step": 308 + }, + { + "epoch": 0.14879913320893276, + "grad_norm": 0.942220151424408, + "learning_rate": 0.00019278575856028206, + "loss": 0.6659, + "step": 309 + }, + { + "epoch": 0.14928068380184192, + "grad_norm": 0.9025714993476868, + "learning_rate": 0.00019272747246473345, + "loss": 0.2004, + "step": 310 + }, + { + "epoch": 0.1497622343947511, + "grad_norm": 1.1444401741027832, + "learning_rate": 0.00019266896074318334, + "loss": 0.6592, + "step": 311 + }, + { + "epoch": 0.15024378498766028, + "grad_norm": 1.4161713123321533, + "learning_rate": 0.00019261022353800344, + "loss": 0.3902, + "step": 312 + }, + { + "epoch": 0.15072533558056944, + "grad_norm": 1.1736912727355957, + "learning_rate": 0.00019255126099211402, + "loss": 0.5631, + "step": 313 + }, + { + "epoch": 0.1512068861734786, + "grad_norm": 0.8065115213394165, + "learning_rate": 0.00019249207324898376, + "loss": 0.6542, + "step": 314 + }, + { + "epoch": 0.15168843676638777, + "grad_norm": 0.8951350450515747, + "learning_rate": 0.0001924326604526292, + "loss": 0.4188, + "step": 315 + }, + { + "epoch": 0.15216998735929693, + "grad_norm": 0.7340465188026428, + "learning_rate": 0.00019237302274761458, + "loss": 0.3593, + "step": 316 + }, + { + "epoch": 0.1526515379522061, + "grad_norm": 1.4768790006637573, + "learning_rate": 0.0001923131602790513, + "loss": 0.6562, + "step": 317 + }, + { + "epoch": 0.15313308854511526, + "grad_norm": 0.6574330925941467, + "learning_rate": 0.00019225307319259768, + "loss": 0.651, + "step": 318 + }, + { + "epoch": 0.15361463913802445, + "grad_norm": 1.21963369846344, + "learning_rate": 0.00019219276163445862, + "loss": 0.5836, + "step": 319 + }, + { + "epoch": 0.1540961897309336, + "grad_norm": 0.8839986324310303, + "learning_rate": 0.00019213222575138522, + "loss": 0.7771, + "step": 320 + }, + { + "epoch": 0.15457774032384278, + "grad_norm": 1.0054893493652344, + "learning_rate": 0.00019207146569067435, + "loss": 0.3249, + "step": 321 + }, + { + "epoch": 0.15505929091675194, + "grad_norm": 1.431882381439209, + "learning_rate": 0.00019201048160016838, + "loss": 0.8559, + "step": 322 + }, + { + "epoch": 0.1555408415096611, + "grad_norm": 0.5387030243873596, + "learning_rate": 0.00019194927362825478, + "loss": 0.5607, + "step": 323 + }, + { + "epoch": 0.15602239210257027, + "grad_norm": 0.6884758472442627, + "learning_rate": 0.00019188784192386587, + "loss": 0.4673, + "step": 324 + }, + { + "epoch": 0.15650394269547943, + "grad_norm": 0.5484024286270142, + "learning_rate": 0.00019182618663647817, + "loss": 0.2409, + "step": 325 + }, + { + "epoch": 0.15698549328838862, + "grad_norm": 1.0389750003814697, + "learning_rate": 0.0001917643079161124, + "loss": 0.767, + "step": 326 + }, + { + "epoch": 0.15746704388129779, + "grad_norm": 1.2948729991912842, + "learning_rate": 0.00019170220591333283, + "loss": 0.3611, + "step": 327 + }, + { + "epoch": 0.15794859447420695, + "grad_norm": 0.956086277961731, + "learning_rate": 0.00019163988077924713, + "loss": 0.7923, + "step": 328 + }, + { + "epoch": 0.1584301450671161, + "grad_norm": 0.5217251181602478, + "learning_rate": 0.00019157733266550575, + "loss": 0.2193, + "step": 329 + }, + { + "epoch": 0.15891169566002528, + "grad_norm": 0.7246752977371216, + "learning_rate": 0.00019151456172430183, + "loss": 0.457, + "step": 330 + }, + { + "epoch": 0.15939324625293444, + "grad_norm": 1.4637582302093506, + "learning_rate": 0.0001914515681083707, + "loss": 0.7846, + "step": 331 + }, + { + "epoch": 0.1598747968458436, + "grad_norm": 0.9213384389877319, + "learning_rate": 0.00019138835197098937, + "loss": 0.5576, + "step": 332 + }, + { + "epoch": 0.1603563474387528, + "grad_norm": 0.7289940118789673, + "learning_rate": 0.00019132491346597643, + "loss": 0.7084, + "step": 333 + }, + { + "epoch": 0.16083789803166196, + "grad_norm": 0.7258747220039368, + "learning_rate": 0.00019126125274769145, + "loss": 0.2721, + "step": 334 + }, + { + "epoch": 0.16131944862457112, + "grad_norm": 0.6859827041625977, + "learning_rate": 0.00019119736997103476, + "loss": 0.3417, + "step": 335 + }, + { + "epoch": 0.16180099921748028, + "grad_norm": 0.7890269756317139, + "learning_rate": 0.000191133265291447, + "loss": 0.3954, + "step": 336 + }, + { + "epoch": 0.16228254981038945, + "grad_norm": 0.6682295203208923, + "learning_rate": 0.00019106893886490864, + "loss": 0.5069, + "step": 337 + }, + { + "epoch": 0.1627641004032986, + "grad_norm": 0.1675347089767456, + "learning_rate": 0.00019100439084793989, + "loss": 0.2698, + "step": 338 + }, + { + "epoch": 0.1632456509962078, + "grad_norm": 0.9134905934333801, + "learning_rate": 0.00019093962139759998, + "loss": 0.754, + "step": 339 + }, + { + "epoch": 0.16372720158911697, + "grad_norm": 0.7806122303009033, + "learning_rate": 0.000190874630671487, + "loss": 0.5684, + "step": 340 + }, + { + "epoch": 0.16420875218202613, + "grad_norm": 0.8654987812042236, + "learning_rate": 0.00019080941882773745, + "loss": 0.79, + "step": 341 + }, + { + "epoch": 0.1646903027749353, + "grad_norm": 1.0775362253189087, + "learning_rate": 0.00019074398602502584, + "loss": 0.4995, + "step": 342 + }, + { + "epoch": 0.16517185336784446, + "grad_norm": 1.9703220129013062, + "learning_rate": 0.00019067833242256442, + "loss": 0.8332, + "step": 343 + }, + { + "epoch": 0.16565340396075362, + "grad_norm": 1.599793791770935, + "learning_rate": 0.0001906124581801025, + "loss": 0.3727, + "step": 344 + }, + { + "epoch": 0.16613495455366278, + "grad_norm": 0.5155896544456482, + "learning_rate": 0.0001905463634579264, + "loss": 0.3954, + "step": 345 + }, + { + "epoch": 0.16661650514657197, + "grad_norm": 2.192990303039551, + "learning_rate": 0.00019048004841685888, + "loss": 0.7787, + "step": 346 + }, + { + "epoch": 0.16709805573948114, + "grad_norm": 1.794849157333374, + "learning_rate": 0.00019041351321825883, + "loss": 0.6315, + "step": 347 + }, + { + "epoch": 0.1675796063323903, + "grad_norm": 0.8186489939689636, + "learning_rate": 0.00019034675802402068, + "loss": 0.8297, + "step": 348 + }, + { + "epoch": 0.16806115692529947, + "grad_norm": 0.7664559483528137, + "learning_rate": 0.00019027978299657436, + "loss": 0.3406, + "step": 349 + }, + { + "epoch": 0.16854270751820863, + "grad_norm": 0.8643622398376465, + "learning_rate": 0.00019021258829888456, + "loss": 0.4621, + "step": 350 + }, + { + "epoch": 0.1690242581111178, + "grad_norm": 0.482008159160614, + "learning_rate": 0.00019014517409445052, + "loss": 0.152, + "step": 351 + }, + { + "epoch": 0.16950580870402696, + "grad_norm": 0.8008543252944946, + "learning_rate": 0.00019007754054730554, + "loss": 0.5645, + "step": 352 + }, + { + "epoch": 0.16998735929693615, + "grad_norm": 0.8540553450584412, + "learning_rate": 0.00019000968782201675, + "loss": 0.3154, + "step": 353 + }, + { + "epoch": 0.1704689098898453, + "grad_norm": 0.9066978096961975, + "learning_rate": 0.00018994161608368448, + "loss": 0.9975, + "step": 354 + }, + { + "epoch": 0.17095046048275447, + "grad_norm": 0.5035725235939026, + "learning_rate": 0.00018987332549794196, + "loss": 0.5239, + "step": 355 + }, + { + "epoch": 0.17143201107566364, + "grad_norm": 0.3529674708843231, + "learning_rate": 0.00018980481623095502, + "loss": 0.2252, + "step": 356 + }, + { + "epoch": 0.1719135616685728, + "grad_norm": 1.7740199565887451, + "learning_rate": 0.00018973608844942148, + "loss": 0.6024, + "step": 357 + }, + { + "epoch": 0.17239511226148196, + "grad_norm": 0.6623877286911011, + "learning_rate": 0.00018966714232057094, + "loss": 0.7023, + "step": 358 + }, + { + "epoch": 0.17287666285439113, + "grad_norm": 0.22552219033241272, + "learning_rate": 0.00018959797801216418, + "loss": 0.0767, + "step": 359 + }, + { + "epoch": 0.17335821344730032, + "grad_norm": 0.6623592972755432, + "learning_rate": 0.000189528595692493, + "loss": 0.3355, + "step": 360 + }, + { + "epoch": 0.17383976404020948, + "grad_norm": 0.6635386943817139, + "learning_rate": 0.00018945899553037956, + "loss": 0.5255, + "step": 361 + }, + { + "epoch": 0.17432131463311865, + "grad_norm": 2.071317672729492, + "learning_rate": 0.00018938917769517613, + "loss": 0.7143, + "step": 362 + }, + { + "epoch": 0.1748028652260278, + "grad_norm": 1.2140402793884277, + "learning_rate": 0.00018931914235676458, + "loss": 0.6534, + "step": 363 + }, + { + "epoch": 0.17528441581893697, + "grad_norm": 1.3414770364761353, + "learning_rate": 0.00018924888968555606, + "loss": 0.1737, + "step": 364 + }, + { + "epoch": 0.17576596641184614, + "grad_norm": 0.38692179322242737, + "learning_rate": 0.00018917841985249055, + "loss": 0.3119, + "step": 365 + }, + { + "epoch": 0.1762475170047553, + "grad_norm": 1.0631595849990845, + "learning_rate": 0.0001891077330290363, + "loss": 0.9288, + "step": 366 + }, + { + "epoch": 0.1767290675976645, + "grad_norm": 0.7500083446502686, + "learning_rate": 0.00018903682938718977, + "loss": 0.8262, + "step": 367 + }, + { + "epoch": 0.17721061819057365, + "grad_norm": 1.7560081481933594, + "learning_rate": 0.00018896570909947475, + "loss": 0.4524, + "step": 368 + }, + { + "epoch": 0.17769216878348282, + "grad_norm": 0.8761142492294312, + "learning_rate": 0.00018889437233894234, + "loss": 0.4647, + "step": 369 + }, + { + "epoch": 0.17817371937639198, + "grad_norm": 0.7768288850784302, + "learning_rate": 0.0001888228192791703, + "loss": 0.8042, + "step": 370 + }, + { + "epoch": 0.17865526996930114, + "grad_norm": 0.7059733271598816, + "learning_rate": 0.00018875105009426272, + "loss": 0.3282, + "step": 371 + }, + { + "epoch": 0.1791368205622103, + "grad_norm": 0.6245189905166626, + "learning_rate": 0.00018867906495884955, + "loss": 0.7414, + "step": 372 + }, + { + "epoch": 0.17961837115511947, + "grad_norm": 0.5447114706039429, + "learning_rate": 0.0001886068640480862, + "loss": 0.5538, + "step": 373 + }, + { + "epoch": 0.18009992174802866, + "grad_norm": 1.409658670425415, + "learning_rate": 0.00018853444753765306, + "loss": 0.606, + "step": 374 + }, + { + "epoch": 0.18058147234093783, + "grad_norm": 0.5553191304206848, + "learning_rate": 0.00018846181560375525, + "loss": 0.3979, + "step": 375 + }, + { + "epoch": 0.181063022933847, + "grad_norm": 0.588184654712677, + "learning_rate": 0.0001883889684231219, + "loss": 0.4796, + "step": 376 + }, + { + "epoch": 0.18154457352675615, + "grad_norm": 1.3587735891342163, + "learning_rate": 0.000188315906173006, + "loss": 0.9258, + "step": 377 + }, + { + "epoch": 0.18202612411966532, + "grad_norm": 1.4763239622116089, + "learning_rate": 0.0001882426290311838, + "loss": 0.3047, + "step": 378 + }, + { + "epoch": 0.18250767471257448, + "grad_norm": 1.1970669031143188, + "learning_rate": 0.00018816913717595445, + "loss": 0.5431, + "step": 379 + }, + { + "epoch": 0.18298922530548364, + "grad_norm": 0.690443754196167, + "learning_rate": 0.00018809543078613953, + "loss": 0.4144, + "step": 380 + }, + { + "epoch": 0.18347077589839283, + "grad_norm": 0.8397567868232727, + "learning_rate": 0.00018802151004108263, + "loss": 0.3262, + "step": 381 + }, + { + "epoch": 0.183952326491302, + "grad_norm": 0.8233490586280823, + "learning_rate": 0.0001879473751206489, + "loss": 0.8446, + "step": 382 + }, + { + "epoch": 0.18443387708421116, + "grad_norm": 0.7418449521064758, + "learning_rate": 0.00018787302620522467, + "loss": 0.3582, + "step": 383 + }, + { + "epoch": 0.18491542767712033, + "grad_norm": 1.2972848415374756, + "learning_rate": 0.00018779846347571693, + "loss": 0.2662, + "step": 384 + }, + { + "epoch": 0.1853969782700295, + "grad_norm": 1.407714605331421, + "learning_rate": 0.0001877236871135529, + "loss": 0.3497, + "step": 385 + }, + { + "epoch": 0.18587852886293865, + "grad_norm": 0.9143611192703247, + "learning_rate": 0.00018764869730067968, + "loss": 0.7898, + "step": 386 + }, + { + "epoch": 0.18636007945584784, + "grad_norm": 0.9028319716453552, + "learning_rate": 0.0001875734942195637, + "loss": 0.3232, + "step": 387 + }, + { + "epoch": 0.186841630048757, + "grad_norm": 0.5696462392807007, + "learning_rate": 0.0001874980780531903, + "loss": 0.6027, + "step": 388 + }, + { + "epoch": 0.18732318064166617, + "grad_norm": 1.0089211463928223, + "learning_rate": 0.00018742244898506337, + "loss": 0.6073, + "step": 389 + }, + { + "epoch": 0.18780473123457533, + "grad_norm": 1.373483657836914, + "learning_rate": 0.00018734660719920475, + "loss": 0.7117, + "step": 390 + }, + { + "epoch": 0.1882862818274845, + "grad_norm": 0.6282643675804138, + "learning_rate": 0.00018727055288015397, + "loss": 0.682, + "step": 391 + }, + { + "epoch": 0.18876783242039366, + "grad_norm": 11.492624282836914, + "learning_rate": 0.00018719428621296764, + "loss": 1.1267, + "step": 392 + }, + { + "epoch": 0.18924938301330282, + "grad_norm": 0.6377469897270203, + "learning_rate": 0.00018711780738321897, + "loss": 0.2162, + "step": 393 + }, + { + "epoch": 0.18973093360621202, + "grad_norm": 0.9371182322502136, + "learning_rate": 0.00018704111657699758, + "loss": 0.3763, + "step": 394 + }, + { + "epoch": 0.19021248419912118, + "grad_norm": 0.4177301228046417, + "learning_rate": 0.0001869642139809088, + "loss": 0.5569, + "step": 395 + }, + { + "epoch": 0.19069403479203034, + "grad_norm": 0.7619231343269348, + "learning_rate": 0.00018688709978207323, + "loss": 0.4897, + "step": 396 + }, + { + "epoch": 0.1911755853849395, + "grad_norm": 0.7791587114334106, + "learning_rate": 0.00018680977416812644, + "loss": 0.5514, + "step": 397 + }, + { + "epoch": 0.19165713597784867, + "grad_norm": 0.7366723418235779, + "learning_rate": 0.00018673223732721837, + "loss": 0.3122, + "step": 398 + }, + { + "epoch": 0.19213868657075783, + "grad_norm": 0.6116033792495728, + "learning_rate": 0.0001866544894480129, + "loss": 0.6341, + "step": 399 + }, + { + "epoch": 0.192620237163667, + "grad_norm": 0.5431279540061951, + "learning_rate": 0.00018657653071968747, + "loss": 0.6062, + "step": 400 + }, + { + "epoch": 0.1931017877565762, + "grad_norm": 1.0275205373764038, + "learning_rate": 0.00018649836133193253, + "loss": 0.6843, + "step": 401 + }, + { + "epoch": 0.19358333834948535, + "grad_norm": 0.669370710849762, + "learning_rate": 0.00018641998147495112, + "loss": 0.1173, + "step": 402 + }, + { + "epoch": 0.19406488894239451, + "grad_norm": 0.5287774205207825, + "learning_rate": 0.00018634139133945837, + "loss": 0.3223, + "step": 403 + }, + { + "epoch": 0.19454643953530368, + "grad_norm": 0.8857590556144714, + "learning_rate": 0.00018626259111668105, + "loss": 0.8797, + "step": 404 + }, + { + "epoch": 0.19502799012821284, + "grad_norm": 1.0080000162124634, + "learning_rate": 0.00018618358099835723, + "loss": 0.7005, + "step": 405 + }, + { + "epoch": 0.195509540721122, + "grad_norm": 0.6499501466751099, + "learning_rate": 0.00018610436117673555, + "loss": 0.3307, + "step": 406 + }, + { + "epoch": 0.19599109131403117, + "grad_norm": 0.9055879712104797, + "learning_rate": 0.00018602493184457505, + "loss": 0.3655, + "step": 407 + }, + { + "epoch": 0.19647264190694036, + "grad_norm": 0.8727465867996216, + "learning_rate": 0.00018594529319514437, + "loss": 0.4094, + "step": 408 + }, + { + "epoch": 0.19695419249984952, + "grad_norm": 0.7476904392242432, + "learning_rate": 0.00018586544542222169, + "loss": 0.6138, + "step": 409 + }, + { + "epoch": 0.1974357430927587, + "grad_norm": 0.9040901064872742, + "learning_rate": 0.00018578538872009384, + "loss": 0.3129, + "step": 410 + }, + { + "epoch": 0.19791729368566785, + "grad_norm": 0.33781591057777405, + "learning_rate": 0.00018570512328355612, + "loss": 0.2414, + "step": 411 + }, + { + "epoch": 0.198398844278577, + "grad_norm": 1.175412893295288, + "learning_rate": 0.00018562464930791167, + "loss": 0.7982, + "step": 412 + }, + { + "epoch": 0.19888039487148618, + "grad_norm": 1.1210882663726807, + "learning_rate": 0.00018554396698897116, + "loss": 0.299, + "step": 413 + }, + { + "epoch": 0.19936194546439534, + "grad_norm": 0.8329453468322754, + "learning_rate": 0.00018546307652305205, + "loss": 0.4991, + "step": 414 + }, + { + "epoch": 0.19984349605730453, + "grad_norm": 0.7438675761222839, + "learning_rate": 0.00018538197810697842, + "loss": 0.3879, + "step": 415 + }, + { + "epoch": 0.2003250466502137, + "grad_norm": 1.248732328414917, + "learning_rate": 0.0001853006719380802, + "loss": 0.3897, + "step": 416 + }, + { + "epoch": 0.20080659724312286, + "grad_norm": 1.3640618324279785, + "learning_rate": 0.00018521915821419284, + "loss": 0.6058, + "step": 417 + }, + { + "epoch": 0.20128814783603202, + "grad_norm": 0.6176584959030151, + "learning_rate": 0.00018513743713365698, + "loss": 0.4335, + "step": 418 + }, + { + "epoch": 0.20176969842894119, + "grad_norm": 0.8122994899749756, + "learning_rate": 0.00018505550889531765, + "loss": 0.267, + "step": 419 + }, + { + "epoch": 0.20225124902185035, + "grad_norm": 1.1649123430252075, + "learning_rate": 0.00018497337369852395, + "loss": 0.2833, + "step": 420 + }, + { + "epoch": 0.2027327996147595, + "grad_norm": 33.362525939941406, + "learning_rate": 0.0001848910317431286, + "loss": 1.2794, + "step": 421 + }, + { + "epoch": 0.2032143502076687, + "grad_norm": 18.580949783325195, + "learning_rate": 0.00018480848322948739, + "loss": 0.6011, + "step": 422 + }, + { + "epoch": 0.20369590080057787, + "grad_norm": 4.552126884460449, + "learning_rate": 0.00018472572835845873, + "loss": 0.5124, + "step": 423 + }, + { + "epoch": 0.20417745139348703, + "grad_norm": 11.55855655670166, + "learning_rate": 0.00018464276733140306, + "loss": 1.3805, + "step": 424 + }, + { + "epoch": 0.2046590019863962, + "grad_norm": 5.1312737464904785, + "learning_rate": 0.0001845596003501826, + "loss": 0.4854, + "step": 425 + }, + { + "epoch": 0.20514055257930536, + "grad_norm": 6.843419551849365, + "learning_rate": 0.00018447622761716057, + "loss": 0.3659, + "step": 426 + }, + { + "epoch": 0.20562210317221452, + "grad_norm": 2.014824390411377, + "learning_rate": 0.00018439264933520084, + "loss": 0.8321, + "step": 427 + }, + { + "epoch": 0.2061036537651237, + "grad_norm": 0.7115727066993713, + "learning_rate": 0.00018430886570766747, + "loss": 0.6498, + "step": 428 + }, + { + "epoch": 0.20658520435803288, + "grad_norm": 0.8766760230064392, + "learning_rate": 0.0001842248769384242, + "loss": 0.9035, + "step": 429 + }, + { + "epoch": 0.20706675495094204, + "grad_norm": 0.41129961609840393, + "learning_rate": 0.00018414068323183375, + "loss": 0.4483, + "step": 430 + }, + { + "epoch": 0.2075483055438512, + "grad_norm": 0.6448122262954712, + "learning_rate": 0.00018405628479275775, + "loss": 0.317, + "step": 431 + }, + { + "epoch": 0.20802985613676037, + "grad_norm": 0.35214823484420776, + "learning_rate": 0.00018397168182655583, + "loss": 0.3672, + "step": 432 + }, + { + "epoch": 0.20851140672966953, + "grad_norm": 0.7121447920799255, + "learning_rate": 0.00018388687453908527, + "loss": 0.4216, + "step": 433 + }, + { + "epoch": 0.2089929573225787, + "grad_norm": 1.320578694343567, + "learning_rate": 0.00018380186313670058, + "loss": 0.6246, + "step": 434 + }, + { + "epoch": 0.20947450791548788, + "grad_norm": 1.3082513809204102, + "learning_rate": 0.00018371664782625287, + "loss": 0.3954, + "step": 435 + }, + { + "epoch": 0.20995605850839705, + "grad_norm": 1.6819370985031128, + "learning_rate": 0.00018363122881508945, + "loss": 0.6674, + "step": 436 + }, + { + "epoch": 0.2104376091013062, + "grad_norm": 0.6928601861000061, + "learning_rate": 0.00018354560631105328, + "loss": 0.0853, + "step": 437 + }, + { + "epoch": 0.21091915969421537, + "grad_norm": 1.2375073432922363, + "learning_rate": 0.00018345978052248233, + "loss": 0.6348, + "step": 438 + }, + { + "epoch": 0.21140071028712454, + "grad_norm": 0.32179486751556396, + "learning_rate": 0.00018337375165820944, + "loss": 0.5541, + "step": 439 + }, + { + "epoch": 0.2118822608800337, + "grad_norm": 1.0096837282180786, + "learning_rate": 0.00018328751992756137, + "loss": 0.428, + "step": 440 + }, + { + "epoch": 0.21236381147294286, + "grad_norm": 2.5094215869903564, + "learning_rate": 0.0001832010855403586, + "loss": 0.5619, + "step": 441 + }, + { + "epoch": 0.21284536206585206, + "grad_norm": 0.801345705986023, + "learning_rate": 0.0001831144487069147, + "loss": 0.626, + "step": 442 + }, + { + "epoch": 0.21332691265876122, + "grad_norm": 0.7294154763221741, + "learning_rate": 0.0001830276096380358, + "loss": 0.2961, + "step": 443 + }, + { + "epoch": 0.21380846325167038, + "grad_norm": 1.20921790599823, + "learning_rate": 0.0001829405685450202, + "loss": 0.7299, + "step": 444 + }, + { + "epoch": 0.21429001384457955, + "grad_norm": 3.153576612472534, + "learning_rate": 0.00018285332563965765, + "loss": 0.9845, + "step": 445 + }, + { + "epoch": 0.2147715644374887, + "grad_norm": 1.9264358282089233, + "learning_rate": 0.00018276588113422905, + "loss": 0.453, + "step": 446 + }, + { + "epoch": 0.21525311503039787, + "grad_norm": 0.8699561357498169, + "learning_rate": 0.00018267823524150575, + "loss": 0.8435, + "step": 447 + }, + { + "epoch": 0.21573466562330704, + "grad_norm": 0.4549397826194763, + "learning_rate": 0.00018259038817474923, + "loss": 0.3738, + "step": 448 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 0.4085061550140381, + "learning_rate": 0.0001825023401477104, + "loss": 0.2635, + "step": 449 + }, + { + "epoch": 0.2166977668091254, + "grad_norm": 0.6129693388938904, + "learning_rate": 0.0001824140913746291, + "loss": 0.5645, + "step": 450 + }, + { + "epoch": 0.21717931740203456, + "grad_norm": 0.6665123105049133, + "learning_rate": 0.00018232564207023376, + "loss": 0.2078, + "step": 451 + }, + { + "epoch": 0.21766086799494372, + "grad_norm": 3.9335904121398926, + "learning_rate": 0.00018223699244974064, + "loss": 0.4767, + "step": 452 + }, + { + "epoch": 0.21814241858785288, + "grad_norm": 0.4603537619113922, + "learning_rate": 0.00018214814272885343, + "loss": 0.485, + "step": 453 + }, + { + "epoch": 0.21862396918076205, + "grad_norm": 0.8026458024978638, + "learning_rate": 0.00018205909312376276, + "loss": 0.6056, + "step": 454 + }, + { + "epoch": 0.2191055197736712, + "grad_norm": 0.575541079044342, + "learning_rate": 0.00018196984385114554, + "loss": 0.6808, + "step": 455 + }, + { + "epoch": 0.2195870703665804, + "grad_norm": 0.7481828927993774, + "learning_rate": 0.0001818803951281646, + "loss": 0.5458, + "step": 456 + }, + { + "epoch": 0.22006862095948956, + "grad_norm": 1.563093662261963, + "learning_rate": 0.000181790747172468, + "loss": 0.4364, + "step": 457 + }, + { + "epoch": 0.22055017155239873, + "grad_norm": 1.4883453845977783, + "learning_rate": 0.00018170090020218864, + "loss": 0.711, + "step": 458 + }, + { + "epoch": 0.2210317221453079, + "grad_norm": 1.3732603788375854, + "learning_rate": 0.00018161085443594365, + "loss": 0.3331, + "step": 459 + }, + { + "epoch": 0.22151327273821705, + "grad_norm": 0.5254781246185303, + "learning_rate": 0.00018152061009283382, + "loss": 0.2258, + "step": 460 + }, + { + "epoch": 0.22199482333112622, + "grad_norm": 0.8934621214866638, + "learning_rate": 0.00018143016739244314, + "loss": 0.5708, + "step": 461 + }, + { + "epoch": 0.22247637392403538, + "grad_norm": 1.4122041463851929, + "learning_rate": 0.0001813395265548383, + "loss": 0.7412, + "step": 462 + }, + { + "epoch": 0.22295792451694457, + "grad_norm": 0.9841462969779968, + "learning_rate": 0.00018124868780056814, + "loss": 0.6168, + "step": 463 + }, + { + "epoch": 0.22343947510985374, + "grad_norm": 1.0066325664520264, + "learning_rate": 0.0001811576513506629, + "loss": 0.4116, + "step": 464 + }, + { + "epoch": 0.2239210257027629, + "grad_norm": 1.3872987031936646, + "learning_rate": 0.00018106641742663397, + "loss": 0.7331, + "step": 465 + }, + { + "epoch": 0.22440257629567206, + "grad_norm": 2.360673666000366, + "learning_rate": 0.00018097498625047328, + "loss": 0.6713, + "step": 466 + }, + { + "epoch": 0.22488412688858123, + "grad_norm": 1.2105039358139038, + "learning_rate": 0.00018088335804465258, + "loss": 0.5156, + "step": 467 + }, + { + "epoch": 0.2253656774814904, + "grad_norm": 0.5330220460891724, + "learning_rate": 0.00018079153303212318, + "loss": 0.2732, + "step": 468 + }, + { + "epoch": 0.22584722807439955, + "grad_norm": 1.4327185153961182, + "learning_rate": 0.0001806995114363152, + "loss": 0.6834, + "step": 469 + }, + { + "epoch": 0.22632877866730874, + "grad_norm": 0.7444460391998291, + "learning_rate": 0.00018060729348113707, + "loss": 0.3343, + "step": 470 + }, + { + "epoch": 0.2268103292602179, + "grad_norm": 1.6748604774475098, + "learning_rate": 0.00018051487939097505, + "loss": 0.5039, + "step": 471 + }, + { + "epoch": 0.22729187985312707, + "grad_norm": 1.474113941192627, + "learning_rate": 0.00018042226939069255, + "loss": 0.9008, + "step": 472 + }, + { + "epoch": 0.22777343044603623, + "grad_norm": 0.7277254462242126, + "learning_rate": 0.00018032946370562982, + "loss": 0.4241, + "step": 473 + }, + { + "epoch": 0.2282549810389454, + "grad_norm": 0.6519062519073486, + "learning_rate": 0.00018023646256160313, + "loss": 0.2707, + "step": 474 + }, + { + "epoch": 0.22873653163185456, + "grad_norm": 1.9292597770690918, + "learning_rate": 0.00018014326618490437, + "loss": 0.2273, + "step": 475 + }, + { + "epoch": 0.22921808222476375, + "grad_norm": 0.9677076935768127, + "learning_rate": 0.0001800498748023005, + "loss": 0.7582, + "step": 476 + }, + { + "epoch": 0.22969963281767292, + "grad_norm": 0.466116338968277, + "learning_rate": 0.000179956288641033, + "loss": 0.1905, + "step": 477 + }, + { + "epoch": 0.23018118341058208, + "grad_norm": 0.2711421251296997, + "learning_rate": 0.00017986250792881718, + "loss": 0.1073, + "step": 478 + }, + { + "epoch": 0.23066273400349124, + "grad_norm": 0.9862172603607178, + "learning_rate": 0.00017976853289384184, + "loss": 0.6149, + "step": 479 + }, + { + "epoch": 0.2311442845964004, + "grad_norm": 1.0514706373214722, + "learning_rate": 0.00017967436376476855, + "loss": 0.6446, + "step": 480 + }, + { + "epoch": 0.23162583518930957, + "grad_norm": 0.6390010714530945, + "learning_rate": 0.0001795800007707312, + "loss": 0.3133, + "step": 481 + }, + { + "epoch": 0.23210738578221873, + "grad_norm": 0.2777019441127777, + "learning_rate": 0.00017948544414133534, + "loss": 0.1675, + "step": 482 + }, + { + "epoch": 0.23258893637512792, + "grad_norm": 0.983223557472229, + "learning_rate": 0.00017939069410665773, + "loss": 0.4066, + "step": 483 + }, + { + "epoch": 0.2330704869680371, + "grad_norm": 0.979427695274353, + "learning_rate": 0.0001792957508972457, + "loss": 0.5599, + "step": 484 + }, + { + "epoch": 0.23355203756094625, + "grad_norm": 0.8707508444786072, + "learning_rate": 0.00017920061474411658, + "loss": 0.3209, + "step": 485 + }, + { + "epoch": 0.23403358815385542, + "grad_norm": 1.0809909105300903, + "learning_rate": 0.00017910528587875729, + "loss": 0.4503, + "step": 486 + }, + { + "epoch": 0.23451513874676458, + "grad_norm": 0.985072910785675, + "learning_rate": 0.00017900976453312352, + "loss": 0.4379, + "step": 487 + }, + { + "epoch": 0.23499668933967374, + "grad_norm": 1.3180149793624878, + "learning_rate": 0.00017891405093963938, + "loss": 0.463, + "step": 488 + }, + { + "epoch": 0.2354782399325829, + "grad_norm": 1.7905075550079346, + "learning_rate": 0.00017881814533119675, + "loss": 0.9671, + "step": 489 + }, + { + "epoch": 0.2359597905254921, + "grad_norm": 1.3886431455612183, + "learning_rate": 0.00017872204794115474, + "loss": 0.235, + "step": 490 + }, + { + "epoch": 0.23644134111840126, + "grad_norm": 1.001554250717163, + "learning_rate": 0.0001786257590033391, + "loss": 0.3896, + "step": 491 + }, + { + "epoch": 0.23692289171131042, + "grad_norm": 0.9331504106521606, + "learning_rate": 0.00017852927875204163, + "loss": 0.6942, + "step": 492 + }, + { + "epoch": 0.2374044423042196, + "grad_norm": 1.6051104068756104, + "learning_rate": 0.00017843260742201963, + "loss": 0.4859, + "step": 493 + }, + { + "epoch": 0.23788599289712875, + "grad_norm": 0.5150108337402344, + "learning_rate": 0.00017833574524849535, + "loss": 0.4528, + "step": 494 + }, + { + "epoch": 0.23836754349003791, + "grad_norm": 1.3120126724243164, + "learning_rate": 0.00017823869246715553, + "loss": 0.3884, + "step": 495 + }, + { + "epoch": 0.23884909408294708, + "grad_norm": 1.345514178276062, + "learning_rate": 0.00017814144931415043, + "loss": 0.6149, + "step": 496 + }, + { + "epoch": 0.23933064467585627, + "grad_norm": 2.8319215774536133, + "learning_rate": 0.0001780440160260938, + "loss": 0.7515, + "step": 497 + }, + { + "epoch": 0.23981219526876543, + "grad_norm": 0.4656662344932556, + "learning_rate": 0.00017794639284006184, + "loss": 0.6784, + "step": 498 + }, + { + "epoch": 0.2402937458616746, + "grad_norm": 0.7157880663871765, + "learning_rate": 0.0001778485799935929, + "loss": 0.6326, + "step": 499 + }, + { + "epoch": 0.24077529645458376, + "grad_norm": 1.2466776371002197, + "learning_rate": 0.00017775057772468679, + "loss": 0.9202, + "step": 500 + }, + { + "epoch": 0.24125684704749292, + "grad_norm": 1.138107180595398, + "learning_rate": 0.00017765238627180424, + "loss": 0.5456, + "step": 501 + }, + { + "epoch": 0.2417383976404021, + "grad_norm": 0.6045275330543518, + "learning_rate": 0.00017755400587386632, + "loss": 0.4298, + "step": 502 + }, + { + "epoch": 0.24221994823331125, + "grad_norm": 1.1885807514190674, + "learning_rate": 0.00017745543677025378, + "loss": 0.4178, + "step": 503 + }, + { + "epoch": 0.24270149882622044, + "grad_norm": 1.1106690168380737, + "learning_rate": 0.00017735667920080661, + "loss": 0.346, + "step": 504 + }, + { + "epoch": 0.2431830494191296, + "grad_norm": 0.9215999245643616, + "learning_rate": 0.0001772577334058233, + "loss": 0.661, + "step": 505 + }, + { + "epoch": 0.24366460001203877, + "grad_norm": 0.8448766469955444, + "learning_rate": 0.00017715859962606043, + "loss": 0.4158, + "step": 506 + }, + { + "epoch": 0.24414615060494793, + "grad_norm": 0.4072727560997009, + "learning_rate": 0.00017705927810273187, + "loss": 0.212, + "step": 507 + }, + { + "epoch": 0.2446277011978571, + "grad_norm": 0.254568487405777, + "learning_rate": 0.00017695976907750844, + "loss": 0.1542, + "step": 508 + }, + { + "epoch": 0.24510925179076626, + "grad_norm": 1.490206003189087, + "learning_rate": 0.00017686007279251706, + "loss": 0.6064, + "step": 509 + }, + { + "epoch": 0.24559080238367542, + "grad_norm": 1.0446650981903076, + "learning_rate": 0.00017676018949034045, + "loss": 0.848, + "step": 510 + }, + { + "epoch": 0.2460723529765846, + "grad_norm": 1.708642840385437, + "learning_rate": 0.0001766601194140162, + "loss": 0.6488, + "step": 511 + }, + { + "epoch": 0.24655390356949378, + "grad_norm": 0.9760411977767944, + "learning_rate": 0.0001765598628070365, + "loss": 0.5871, + "step": 512 + }, + { + "epoch": 0.24703545416240294, + "grad_norm": 0.978772759437561, + "learning_rate": 0.00017645941991334732, + "loss": 0.4494, + "step": 513 + }, + { + "epoch": 0.2475170047553121, + "grad_norm": 4.773708820343018, + "learning_rate": 0.00017635879097734804, + "loss": 0.5994, + "step": 514 + }, + { + "epoch": 0.24799855534822127, + "grad_norm": 0.6961981058120728, + "learning_rate": 0.00017625797624389055, + "loss": 0.6643, + "step": 515 + }, + { + "epoch": 0.24848010594113043, + "grad_norm": 0.368940532207489, + "learning_rate": 0.00017615697595827897, + "loss": 0.2211, + "step": 516 + }, + { + "epoch": 0.2489616565340396, + "grad_norm": 1.5199044942855835, + "learning_rate": 0.0001760557903662688, + "loss": 0.3104, + "step": 517 + }, + { + "epoch": 0.24944320712694878, + "grad_norm": 1.1172236204147339, + "learning_rate": 0.00017595441971406648, + "loss": 0.792, + "step": 518 + }, + { + "epoch": 0.24992475771985795, + "grad_norm": 0.9683585166931152, + "learning_rate": 0.00017585286424832874, + "loss": 0.3843, + "step": 519 + }, + { + "epoch": 0.2504063083127671, + "grad_norm": 1.2800359725952148, + "learning_rate": 0.00017575112421616202, + "loss": 0.4601, + "step": 520 + }, + { + "epoch": 0.2508878589056763, + "grad_norm": 1.857161283493042, + "learning_rate": 0.0001756491998651218, + "loss": 0.333, + "step": 521 + }, + { + "epoch": 0.25136940949858544, + "grad_norm": 1.1550625562667847, + "learning_rate": 0.0001755470914432121, + "loss": 0.6969, + "step": 522 + }, + { + "epoch": 0.2518509600914946, + "grad_norm": 0.7429122924804688, + "learning_rate": 0.0001754447991988848, + "loss": 0.3757, + "step": 523 + }, + { + "epoch": 0.25233251068440377, + "grad_norm": 1.3738610744476318, + "learning_rate": 0.00017534232338103903, + "loss": 1.0375, + "step": 524 + }, + { + "epoch": 0.25281406127731293, + "grad_norm": 0.9827585816383362, + "learning_rate": 0.0001752396642390207, + "loss": 1.0113, + "step": 525 + }, + { + "epoch": 0.2532956118702221, + "grad_norm": 0.7637682557106018, + "learning_rate": 0.00017513682202262163, + "loss": 0.887, + "step": 526 + }, + { + "epoch": 0.25377716246313126, + "grad_norm": 1.5960534811019897, + "learning_rate": 0.00017503379698207918, + "loss": 0.3415, + "step": 527 + }, + { + "epoch": 0.2542587130560405, + "grad_norm": 1.3785816431045532, + "learning_rate": 0.00017493058936807562, + "loss": 0.7051, + "step": 528 + }, + { + "epoch": 0.25474026364894964, + "grad_norm": 0.5574414134025574, + "learning_rate": 0.00017482719943173739, + "loss": 0.5499, + "step": 529 + }, + { + "epoch": 0.2552218142418588, + "grad_norm": 0.5614613890647888, + "learning_rate": 0.00017472362742463455, + "loss": 0.3799, + "step": 530 + }, + { + "epoch": 0.25570336483476797, + "grad_norm": 0.6266032457351685, + "learning_rate": 0.0001746198735987802, + "loss": 0.6749, + "step": 531 + }, + { + "epoch": 0.25618491542767713, + "grad_norm": 0.6579415202140808, + "learning_rate": 0.00017451593820662988, + "loss": 0.3008, + "step": 532 + }, + { + "epoch": 0.2566664660205863, + "grad_norm": 0.7333241701126099, + "learning_rate": 0.00017441182150108086, + "loss": 0.6755, + "step": 533 + }, + { + "epoch": 0.25714801661349546, + "grad_norm": 0.8165299892425537, + "learning_rate": 0.0001743075237354716, + "loss": 0.6263, + "step": 534 + }, + { + "epoch": 0.2576295672064046, + "grad_norm": 0.4842590391635895, + "learning_rate": 0.00017420304516358113, + "loss": 0.4029, + "step": 535 + }, + { + "epoch": 0.2581111177993138, + "grad_norm": 1.0551700592041016, + "learning_rate": 0.00017409838603962843, + "loss": 0.6014, + "step": 536 + }, + { + "epoch": 0.25859266839222295, + "grad_norm": 1.0485131740570068, + "learning_rate": 0.00017399354661827178, + "loss": 0.3771, + "step": 537 + }, + { + "epoch": 0.2590742189851321, + "grad_norm": 0.7599074244499207, + "learning_rate": 0.00017388852715460819, + "loss": 0.3815, + "step": 538 + }, + { + "epoch": 0.2595557695780413, + "grad_norm": 2.357322931289673, + "learning_rate": 0.00017378332790417273, + "loss": 0.3267, + "step": 539 + }, + { + "epoch": 0.26003732017095044, + "grad_norm": 1.1640552282333374, + "learning_rate": 0.00017367794912293794, + "loss": 0.6993, + "step": 540 + }, + { + "epoch": 0.26051887076385966, + "grad_norm": 0.4350646138191223, + "learning_rate": 0.00017357239106731317, + "loss": 0.2485, + "step": 541 + }, + { + "epoch": 0.2610004213567688, + "grad_norm": 1.8079553842544556, + "learning_rate": 0.00017346665399414405, + "loss": 0.6977, + "step": 542 + }, + { + "epoch": 0.261481971949678, + "grad_norm": 0.7386165857315063, + "learning_rate": 0.00017336073816071168, + "loss": 0.2557, + "step": 543 + }, + { + "epoch": 0.26196352254258715, + "grad_norm": 1.385292410850525, + "learning_rate": 0.00017325464382473226, + "loss": 0.784, + "step": 544 + }, + { + "epoch": 0.2624450731354963, + "grad_norm": 0.7219933867454529, + "learning_rate": 0.00017314837124435622, + "loss": 0.2746, + "step": 545 + }, + { + "epoch": 0.2629266237284055, + "grad_norm": 1.0202337503433228, + "learning_rate": 0.00017304192067816782, + "loss": 0.4414, + "step": 546 + }, + { + "epoch": 0.26340817432131464, + "grad_norm": 0.9283572435379028, + "learning_rate": 0.00017293529238518422, + "loss": 0.1696, + "step": 547 + }, + { + "epoch": 0.2638897249142238, + "grad_norm": 1.1248360872268677, + "learning_rate": 0.0001728284866248552, + "loss": 0.6018, + "step": 548 + }, + { + "epoch": 0.26437127550713296, + "grad_norm": 0.9289939403533936, + "learning_rate": 0.00017272150365706224, + "loss": 0.3172, + "step": 549 + }, + { + "epoch": 0.2648528261000421, + "grad_norm": 1.4539984464645386, + "learning_rate": 0.00017261434374211802, + "loss": 0.3876, + "step": 550 + }, + { + "epoch": 0.2653343766929513, + "grad_norm": 0.6110948324203491, + "learning_rate": 0.00017250700714076586, + "loss": 0.3356, + "step": 551 + }, + { + "epoch": 0.26581592728586045, + "grad_norm": 0.4635971486568451, + "learning_rate": 0.00017239949411417888, + "loss": 0.2646, + "step": 552 + }, + { + "epoch": 0.2662974778787696, + "grad_norm": 3.1188924312591553, + "learning_rate": 0.0001722918049239596, + "loss": 0.7318, + "step": 553 + }, + { + "epoch": 0.2667790284716788, + "grad_norm": 0.7587296366691589, + "learning_rate": 0.00017218393983213902, + "loss": 0.1368, + "step": 554 + }, + { + "epoch": 0.267260579064588, + "grad_norm": 0.46195825934410095, + "learning_rate": 0.00017207589910117634, + "loss": 0.2474, + "step": 555 + }, + { + "epoch": 0.26774212965749716, + "grad_norm": 1.0942871570587158, + "learning_rate": 0.00017196768299395797, + "loss": 0.5037, + "step": 556 + }, + { + "epoch": 0.2682236802504063, + "grad_norm": 2.233832359313965, + "learning_rate": 0.00017185929177379714, + "loss": 0.4933, + "step": 557 + }, + { + "epoch": 0.2687052308433155, + "grad_norm": 1.2593945264816284, + "learning_rate": 0.00017175072570443312, + "loss": 0.6849, + "step": 558 + }, + { + "epoch": 0.26918678143622465, + "grad_norm": 0.5373206734657288, + "learning_rate": 0.00017164198505003066, + "loss": 0.2404, + "step": 559 + }, + { + "epoch": 0.2696683320291338, + "grad_norm": 0.7956175804138184, + "learning_rate": 0.0001715330700751793, + "loss": 0.2961, + "step": 560 + }, + { + "epoch": 0.270149882622043, + "grad_norm": 0.5333571434020996, + "learning_rate": 0.00017142398104489273, + "loss": 0.3203, + "step": 561 + }, + { + "epoch": 0.27063143321495214, + "grad_norm": 0.49340343475341797, + "learning_rate": 0.00017131471822460814, + "loss": 0.3933, + "step": 562 + }, + { + "epoch": 0.2711129838078613, + "grad_norm": 0.7872244119644165, + "learning_rate": 0.00017120528188018565, + "loss": 0.5548, + "step": 563 + }, + { + "epoch": 0.27159453440077047, + "grad_norm": 0.626449704170227, + "learning_rate": 0.00017109567227790754, + "loss": 0.2293, + "step": 564 + }, + { + "epoch": 0.27207608499367963, + "grad_norm": 0.8935983777046204, + "learning_rate": 0.00017098588968447766, + "loss": 0.717, + "step": 565 + }, + { + "epoch": 0.2725576355865888, + "grad_norm": 1.2957003116607666, + "learning_rate": 0.00017087593436702084, + "loss": 0.488, + "step": 566 + }, + { + "epoch": 0.27303918617949796, + "grad_norm": 0.8273961544036865, + "learning_rate": 0.00017076580659308222, + "loss": 0.9841, + "step": 567 + }, + { + "epoch": 0.2735207367724071, + "grad_norm": 1.0490425825119019, + "learning_rate": 0.00017065550663062634, + "loss": 0.248, + "step": 568 + }, + { + "epoch": 0.27400228736531634, + "grad_norm": 0.9461679458618164, + "learning_rate": 0.00017054503474803702, + "loss": 0.6242, + "step": 569 + }, + { + "epoch": 0.2744838379582255, + "grad_norm": 0.5606663823127747, + "learning_rate": 0.00017043439121411618, + "loss": 0.2993, + "step": 570 + }, + { + "epoch": 0.27496538855113467, + "grad_norm": 1.1390249729156494, + "learning_rate": 0.0001703235762980835, + "loss": 0.2841, + "step": 571 + }, + { + "epoch": 0.27544693914404383, + "grad_norm": 0.5928565859794617, + "learning_rate": 0.00017021259026957567, + "loss": 0.4731, + "step": 572 + }, + { + "epoch": 0.275928489736953, + "grad_norm": 1.170907735824585, + "learning_rate": 0.00017010143339864562, + "loss": 0.5631, + "step": 573 + }, + { + "epoch": 0.27641004032986216, + "grad_norm": 0.8918641209602356, + "learning_rate": 0.0001699901059557621, + "loss": 0.5582, + "step": 574 + }, + { + "epoch": 0.2768915909227713, + "grad_norm": 1.7016874551773071, + "learning_rate": 0.00016987860821180895, + "loss": 0.8922, + "step": 575 + }, + { + "epoch": 0.2773731415156805, + "grad_norm": 0.4336923658847809, + "learning_rate": 0.00016976694043808416, + "loss": 0.6153, + "step": 576 + }, + { + "epoch": 0.27785469210858965, + "grad_norm": 0.8397631049156189, + "learning_rate": 0.00016965510290629972, + "loss": 0.5027, + "step": 577 + }, + { + "epoch": 0.2783362427014988, + "grad_norm": 0.4044966697692871, + "learning_rate": 0.00016954309588858044, + "loss": 0.3501, + "step": 578 + }, + { + "epoch": 0.278817793294408, + "grad_norm": 0.47465118765830994, + "learning_rate": 0.00016943091965746366, + "loss": 0.3879, + "step": 579 + }, + { + "epoch": 0.27929934388731714, + "grad_norm": 1.3777363300323486, + "learning_rate": 0.00016931857448589845, + "loss": 0.6047, + "step": 580 + }, + { + "epoch": 0.2797808944802263, + "grad_norm": 0.4374580979347229, + "learning_rate": 0.00016920606064724488, + "loss": 0.4132, + "step": 581 + }, + { + "epoch": 0.28026244507313547, + "grad_norm": 0.8578736186027527, + "learning_rate": 0.00016909337841527344, + "loss": 0.7736, + "step": 582 + }, + { + "epoch": 0.2807439956660447, + "grad_norm": 0.7518476843833923, + "learning_rate": 0.00016898052806416444, + "loss": 0.6, + "step": 583 + }, + { + "epoch": 0.28122554625895385, + "grad_norm": 0.8887410759925842, + "learning_rate": 0.00016886750986850718, + "loss": 0.5299, + "step": 584 + }, + { + "epoch": 0.281707096851863, + "grad_norm": 0.3015702962875366, + "learning_rate": 0.00016875432410329934, + "loss": 0.3191, + "step": 585 + }, + { + "epoch": 0.2821886474447722, + "grad_norm": 0.9750669002532959, + "learning_rate": 0.0001686409710439464, + "loss": 0.516, + "step": 586 + }, + { + "epoch": 0.28267019803768134, + "grad_norm": 1.1851553916931152, + "learning_rate": 0.00016852745096626088, + "loss": 0.8312, + "step": 587 + }, + { + "epoch": 0.2831517486305905, + "grad_norm": 1.5135141611099243, + "learning_rate": 0.0001684137641464617, + "loss": 0.8539, + "step": 588 + }, + { + "epoch": 0.28363329922349967, + "grad_norm": 1.7662687301635742, + "learning_rate": 0.0001682999108611735, + "loss": 0.4994, + "step": 589 + }, + { + "epoch": 0.28411484981640883, + "grad_norm": 0.2995387613773346, + "learning_rate": 0.00016818589138742587, + "loss": 0.1881, + "step": 590 + }, + { + "epoch": 0.284596400409318, + "grad_norm": 0.4403057098388672, + "learning_rate": 0.00016807170600265296, + "loss": 0.4645, + "step": 591 + }, + { + "epoch": 0.28507795100222716, + "grad_norm": 0.8969955444335938, + "learning_rate": 0.00016795735498469246, + "loss": 0.4521, + "step": 592 + }, + { + "epoch": 0.2855595015951363, + "grad_norm": 0.9198434352874756, + "learning_rate": 0.00016784283861178513, + "loss": 0.525, + "step": 593 + }, + { + "epoch": 0.2860410521880455, + "grad_norm": 0.42939475178718567, + "learning_rate": 0.00016772815716257412, + "loss": 0.2818, + "step": 594 + }, + { + "epoch": 0.28652260278095465, + "grad_norm": 1.3019510507583618, + "learning_rate": 0.00016761331091610416, + "loss": 0.7396, + "step": 595 + }, + { + "epoch": 0.28700415337386387, + "grad_norm": 0.7892130613327026, + "learning_rate": 0.00016749830015182107, + "loss": 0.4129, + "step": 596 + }, + { + "epoch": 0.28748570396677303, + "grad_norm": 0.29359114170074463, + "learning_rate": 0.00016738312514957086, + "loss": 0.0796, + "step": 597 + }, + { + "epoch": 0.2879672545596822, + "grad_norm": 0.6003280878067017, + "learning_rate": 0.00016726778618959926, + "loss": 0.3882, + "step": 598 + }, + { + "epoch": 0.28844880515259136, + "grad_norm": 0.36162564158439636, + "learning_rate": 0.00016715228355255093, + "loss": 0.3264, + "step": 599 + }, + { + "epoch": 0.2889303557455005, + "grad_norm": 1.3743391036987305, + "learning_rate": 0.00016703661751946874, + "loss": 0.7462, + "step": 600 + }, + { + "epoch": 0.2894119063384097, + "grad_norm": 0.8231284618377686, + "learning_rate": 0.00016692078837179318, + "loss": 0.4397, + "step": 601 + }, + { + "epoch": 0.28989345693131885, + "grad_norm": 0.376017302274704, + "learning_rate": 0.00016680479639136163, + "loss": 0.4883, + "step": 602 + }, + { + "epoch": 0.290375007524228, + "grad_norm": 0.5218961834907532, + "learning_rate": 0.0001666886418604077, + "loss": 0.2014, + "step": 603 + }, + { + "epoch": 0.2908565581171372, + "grad_norm": 0.4787192940711975, + "learning_rate": 0.0001665723250615604, + "loss": 0.3536, + "step": 604 + }, + { + "epoch": 0.29133810871004634, + "grad_norm": 1.1720750331878662, + "learning_rate": 0.00016645584627784381, + "loss": 0.7304, + "step": 605 + }, + { + "epoch": 0.2918196593029555, + "grad_norm": 0.7071607112884521, + "learning_rate": 0.0001663392057926759, + "loss": 0.4578, + "step": 606 + }, + { + "epoch": 0.29230120989586467, + "grad_norm": 0.9649622440338135, + "learning_rate": 0.00016622240388986824, + "loss": 0.2672, + "step": 607 + }, + { + "epoch": 0.29278276048877383, + "grad_norm": 0.7951571941375732, + "learning_rate": 0.0001661054408536251, + "loss": 0.9557, + "step": 608 + }, + { + "epoch": 0.293264311081683, + "grad_norm": 0.8435747027397156, + "learning_rate": 0.00016598831696854288, + "loss": 0.4504, + "step": 609 + }, + { + "epoch": 0.2937458616745922, + "grad_norm": 0.9358202219009399, + "learning_rate": 0.00016587103251960937, + "loss": 0.4133, + "step": 610 + }, + { + "epoch": 0.2942274122675014, + "grad_norm": 0.8870930671691895, + "learning_rate": 0.00016575358779220294, + "loss": 0.7552, + "step": 611 + }, + { + "epoch": 0.29470896286041054, + "grad_norm": 1.79103684425354, + "learning_rate": 0.00016563598307209204, + "loss": 0.7703, + "step": 612 + }, + { + "epoch": 0.2951905134533197, + "grad_norm": 0.2896561324596405, + "learning_rate": 0.0001655182186454344, + "loss": 0.0845, + "step": 613 + }, + { + "epoch": 0.29567206404622887, + "grad_norm": 0.47754964232444763, + "learning_rate": 0.00016540029479877638, + "loss": 0.6409, + "step": 614 + }, + { + "epoch": 0.29615361463913803, + "grad_norm": 1.0190218687057495, + "learning_rate": 0.00016528221181905217, + "loss": 0.4859, + "step": 615 + }, + { + "epoch": 0.2966351652320472, + "grad_norm": 0.8215029239654541, + "learning_rate": 0.00016516396999358322, + "loss": 0.594, + "step": 616 + }, + { + "epoch": 0.29711671582495636, + "grad_norm": 0.48861509561538696, + "learning_rate": 0.00016504556961007748, + "loss": 0.4021, + "step": 617 + }, + { + "epoch": 0.2975982664178655, + "grad_norm": 1.0803600549697876, + "learning_rate": 0.00016492701095662866, + "loss": 0.4099, + "step": 618 + }, + { + "epoch": 0.2980798170107747, + "grad_norm": 1.1174275875091553, + "learning_rate": 0.00016480829432171564, + "loss": 0.5384, + "step": 619 + }, + { + "epoch": 0.29856136760368385, + "grad_norm": 0.49391186237335205, + "learning_rate": 0.0001646894199942017, + "loss": 0.473, + "step": 620 + }, + { + "epoch": 0.299042918196593, + "grad_norm": 0.5225441455841064, + "learning_rate": 0.0001645703882633338, + "loss": 0.512, + "step": 621 + }, + { + "epoch": 0.2995244687895022, + "grad_norm": 0.4738313853740692, + "learning_rate": 0.00016445119941874183, + "loss": 0.133, + "step": 622 + }, + { + "epoch": 0.30000601938241134, + "grad_norm": 1.9411455392837524, + "learning_rate": 0.00016433185375043809, + "loss": 0.6408, + "step": 623 + }, + { + "epoch": 0.30048756997532056, + "grad_norm": 1.1281530857086182, + "learning_rate": 0.00016421235154881638, + "loss": 0.4085, + "step": 624 + }, + { + "epoch": 0.3009691205682297, + "grad_norm": 0.6740239262580872, + "learning_rate": 0.00016409269310465146, + "loss": 0.429, + "step": 625 + }, + { + "epoch": 0.3014506711611389, + "grad_norm": 2.430367946624756, + "learning_rate": 0.00016397287870909813, + "loss": 0.9531, + "step": 626 + }, + { + "epoch": 0.30193222175404805, + "grad_norm": 1.3780102729797363, + "learning_rate": 0.00016385290865369079, + "loss": 1.0837, + "step": 627 + }, + { + "epoch": 0.3024137723469572, + "grad_norm": 1.098432183265686, + "learning_rate": 0.00016373278323034255, + "loss": 0.2807, + "step": 628 + }, + { + "epoch": 0.3028953229398664, + "grad_norm": 0.6460314393043518, + "learning_rate": 0.0001636125027313445, + "loss": 0.3764, + "step": 629 + }, + { + "epoch": 0.30337687353277554, + "grad_norm": 0.7264195084571838, + "learning_rate": 0.00016349206744936518, + "loss": 0.5838, + "step": 630 + }, + { + "epoch": 0.3038584241256847, + "grad_norm": 0.6020871996879578, + "learning_rate": 0.00016337147767744967, + "loss": 0.731, + "step": 631 + }, + { + "epoch": 0.30433997471859386, + "grad_norm": 0.7149215936660767, + "learning_rate": 0.0001632507337090189, + "loss": 0.4385, + "step": 632 + }, + { + "epoch": 0.304821525311503, + "grad_norm": 0.8123503923416138, + "learning_rate": 0.0001631298358378692, + "loss": 0.6232, + "step": 633 + }, + { + "epoch": 0.3053030759044122, + "grad_norm": 1.155922770500183, + "learning_rate": 0.00016300878435817113, + "loss": 0.4872, + "step": 634 + }, + { + "epoch": 0.30578462649732135, + "grad_norm": 0.8439828753471375, + "learning_rate": 0.00016288757956446918, + "loss": 0.4667, + "step": 635 + }, + { + "epoch": 0.3062661770902305, + "grad_norm": 1.7102984189987183, + "learning_rate": 0.00016276622175168083, + "loss": 0.3543, + "step": 636 + }, + { + "epoch": 0.30674772768313974, + "grad_norm": 1.6830681562423706, + "learning_rate": 0.0001626447112150959, + "loss": 0.7425, + "step": 637 + }, + { + "epoch": 0.3072292782760489, + "grad_norm": 1.8289207220077515, + "learning_rate": 0.00016252304825037576, + "loss": 0.3966, + "step": 638 + }, + { + "epoch": 0.30771082886895806, + "grad_norm": 1.2149261236190796, + "learning_rate": 0.0001624012331535528, + "loss": 0.8703, + "step": 639 + }, + { + "epoch": 0.3081923794618672, + "grad_norm": 0.47987720370292664, + "learning_rate": 0.00016227926622102947, + "loss": 0.2365, + "step": 640 + }, + { + "epoch": 0.3086739300547764, + "grad_norm": 0.8224395513534546, + "learning_rate": 0.00016215714774957772, + "loss": 0.5699, + "step": 641 + }, + { + "epoch": 0.30915548064768555, + "grad_norm": 1.7651561498641968, + "learning_rate": 0.00016203487803633822, + "loss": 0.7769, + "step": 642 + }, + { + "epoch": 0.3096370312405947, + "grad_norm": 1.286529779434204, + "learning_rate": 0.00016191245737881956, + "loss": 0.4309, + "step": 643 + }, + { + "epoch": 0.3101185818335039, + "grad_norm": 0.8839173913002014, + "learning_rate": 0.00016178988607489777, + "loss": 0.514, + "step": 644 + }, + { + "epoch": 0.31060013242641304, + "grad_norm": 1.212334394454956, + "learning_rate": 0.00016166716442281528, + "loss": 0.8795, + "step": 645 + }, + { + "epoch": 0.3110816830193222, + "grad_norm": 0.836784303188324, + "learning_rate": 0.0001615442927211805, + "loss": 0.5251, + "step": 646 + }, + { + "epoch": 0.31156323361223137, + "grad_norm": 0.4992331266403198, + "learning_rate": 0.0001614212712689668, + "loss": 0.448, + "step": 647 + }, + { + "epoch": 0.31204478420514054, + "grad_norm": 0.9768697619438171, + "learning_rate": 0.00016129810036551198, + "loss": 0.8694, + "step": 648 + }, + { + "epoch": 0.3125263347980497, + "grad_norm": 1.266432523727417, + "learning_rate": 0.00016117478031051755, + "loss": 0.6712, + "step": 649 + }, + { + "epoch": 0.31300788539095886, + "grad_norm": 0.9470400214195251, + "learning_rate": 0.00016105131140404787, + "loss": 0.386, + "step": 650 + }, + { + "epoch": 0.3134894359838681, + "grad_norm": 1.228068470954895, + "learning_rate": 0.00016092769394652947, + "loss": 0.806, + "step": 651 + }, + { + "epoch": 0.31397098657677724, + "grad_norm": 1.3695420026779175, + "learning_rate": 0.0001608039282387504, + "loss": 0.192, + "step": 652 + }, + { + "epoch": 0.3144525371696864, + "grad_norm": 0.7850420475006104, + "learning_rate": 0.00016068001458185936, + "loss": 0.4479, + "step": 653 + }, + { + "epoch": 0.31493408776259557, + "grad_norm": 1.2331979274749756, + "learning_rate": 0.0001605559532773651, + "loss": 0.4963, + "step": 654 + }, + { + "epoch": 0.31541563835550473, + "grad_norm": 0.2416716367006302, + "learning_rate": 0.00016043174462713566, + "loss": 0.1441, + "step": 655 + }, + { + "epoch": 0.3158971889484139, + "grad_norm": 0.7065779566764832, + "learning_rate": 0.00016030738893339753, + "loss": 0.4006, + "step": 656 + }, + { + "epoch": 0.31637873954132306, + "grad_norm": 0.607990026473999, + "learning_rate": 0.00016018288649873497, + "loss": 0.7187, + "step": 657 + }, + { + "epoch": 0.3168602901342322, + "grad_norm": 1.0857969522476196, + "learning_rate": 0.0001600582376260894, + "loss": 0.457, + "step": 658 + }, + { + "epoch": 0.3173418407271414, + "grad_norm": 0.6079431772232056, + "learning_rate": 0.00015993344261875847, + "loss": 0.3213, + "step": 659 + }, + { + "epoch": 0.31782339132005055, + "grad_norm": 0.4468356668949127, + "learning_rate": 0.00015980850178039547, + "loss": 0.7005, + "step": 660 + }, + { + "epoch": 0.3183049419129597, + "grad_norm": 1.4823204278945923, + "learning_rate": 0.00015968341541500842, + "loss": 0.7892, + "step": 661 + }, + { + "epoch": 0.3187864925058689, + "grad_norm": 0.6385239362716675, + "learning_rate": 0.00015955818382695953, + "loss": 0.2666, + "step": 662 + }, + { + "epoch": 0.31926804309877804, + "grad_norm": 0.7978471517562866, + "learning_rate": 0.00015943280732096438, + "loss": 0.8023, + "step": 663 + }, + { + "epoch": 0.3197495936916872, + "grad_norm": 0.8674983382225037, + "learning_rate": 0.00015930728620209113, + "loss": 0.4971, + "step": 664 + }, + { + "epoch": 0.3202311442845964, + "grad_norm": 0.9437485933303833, + "learning_rate": 0.00015918162077575976, + "loss": 0.5039, + "step": 665 + }, + { + "epoch": 0.3207126948775056, + "grad_norm": 0.5956541299819946, + "learning_rate": 0.00015905581134774153, + "loss": 0.4606, + "step": 666 + }, + { + "epoch": 0.32119424547041475, + "grad_norm": 0.6552014350891113, + "learning_rate": 0.0001589298582241579, + "loss": 0.5845, + "step": 667 + }, + { + "epoch": 0.3216757960633239, + "grad_norm": 1.130856990814209, + "learning_rate": 0.00015880376171148014, + "loss": 0.6646, + "step": 668 + }, + { + "epoch": 0.3221573466562331, + "grad_norm": 0.7940027713775635, + "learning_rate": 0.00015867752211652831, + "loss": 0.362, + "step": 669 + }, + { + "epoch": 0.32263889724914224, + "grad_norm": 1.0767316818237305, + "learning_rate": 0.00015855113974647068, + "loss": 1.2512, + "step": 670 + }, + { + "epoch": 0.3231204478420514, + "grad_norm": 1.164123773574829, + "learning_rate": 0.0001584246149088229, + "loss": 0.5231, + "step": 671 + }, + { + "epoch": 0.32360199843496057, + "grad_norm": 0.8443131446838379, + "learning_rate": 0.0001582979479114472, + "loss": 0.4686, + "step": 672 + }, + { + "epoch": 0.32408354902786973, + "grad_norm": 1.225292682647705, + "learning_rate": 0.0001581711390625519, + "loss": 0.5409, + "step": 673 + }, + { + "epoch": 0.3245650996207789, + "grad_norm": 0.8826702833175659, + "learning_rate": 0.0001580441886706903, + "loss": 0.7055, + "step": 674 + }, + { + "epoch": 0.32504665021368806, + "grad_norm": 1.543964147567749, + "learning_rate": 0.00015791709704476015, + "loss": 0.625, + "step": 675 + }, + { + "epoch": 0.3255282008065972, + "grad_norm": 1.4143108129501343, + "learning_rate": 0.00015778986449400292, + "loss": 0.3431, + "step": 676 + }, + { + "epoch": 0.3260097513995064, + "grad_norm": 0.9283997416496277, + "learning_rate": 0.00015766249132800292, + "loss": 0.4487, + "step": 677 + }, + { + "epoch": 0.3264913019924156, + "grad_norm": 0.367602676153183, + "learning_rate": 0.00015753497785668663, + "loss": 0.1819, + "step": 678 + }, + { + "epoch": 0.32697285258532477, + "grad_norm": 0.5738257765769958, + "learning_rate": 0.00015740732439032187, + "loss": 0.4424, + "step": 679 + }, + { + "epoch": 0.32745440317823393, + "grad_norm": 0.7593892812728882, + "learning_rate": 0.00015727953123951716, + "loss": 0.6976, + "step": 680 + }, + { + "epoch": 0.3279359537711431, + "grad_norm": 0.7225235104560852, + "learning_rate": 0.00015715159871522086, + "loss": 0.3711, + "step": 681 + }, + { + "epoch": 0.32841750436405226, + "grad_norm": 0.7577972412109375, + "learning_rate": 0.00015702352712872056, + "loss": 0.5368, + "step": 682 + }, + { + "epoch": 0.3288990549569614, + "grad_norm": 0.5570088624954224, + "learning_rate": 0.00015689531679164204, + "loss": 0.5758, + "step": 683 + }, + { + "epoch": 0.3293806055498706, + "grad_norm": 0.9556528329849243, + "learning_rate": 0.00015676696801594886, + "loss": 0.8813, + "step": 684 + }, + { + "epoch": 0.32986215614277975, + "grad_norm": 0.8506907820701599, + "learning_rate": 0.00015663848111394132, + "loss": 0.8259, + "step": 685 + }, + { + "epoch": 0.3303437067356889, + "grad_norm": 0.7537476420402527, + "learning_rate": 0.00015650985639825585, + "loss": 0.6906, + "step": 686 + }, + { + "epoch": 0.3308252573285981, + "grad_norm": 0.8143454194068909, + "learning_rate": 0.00015638109418186424, + "loss": 0.278, + "step": 687 + }, + { + "epoch": 0.33130680792150724, + "grad_norm": 0.7907467484474182, + "learning_rate": 0.00015625219477807277, + "loss": 0.6315, + "step": 688 + }, + { + "epoch": 0.3317883585144164, + "grad_norm": 0.7539917230606079, + "learning_rate": 0.00015612315850052166, + "loss": 0.7043, + "step": 689 + }, + { + "epoch": 0.33226990910732557, + "grad_norm": 0.9734718203544617, + "learning_rate": 0.00015599398566318396, + "loss": 0.4349, + "step": 690 + }, + { + "epoch": 0.33275145970023473, + "grad_norm": 0.5200105309486389, + "learning_rate": 0.00015586467658036524, + "loss": 0.2994, + "step": 691 + }, + { + "epoch": 0.33323301029314395, + "grad_norm": 1.2207458019256592, + "learning_rate": 0.00015573523156670244, + "loss": 0.7332, + "step": 692 + }, + { + "epoch": 0.3337145608860531, + "grad_norm": 2.161278009414673, + "learning_rate": 0.0001556056509371633, + "loss": 0.7246, + "step": 693 + }, + { + "epoch": 0.3341961114789623, + "grad_norm": 0.44565412402153015, + "learning_rate": 0.00015547593500704547, + "loss": 0.662, + "step": 694 + }, + { + "epoch": 0.33467766207187144, + "grad_norm": 0.6318372488021851, + "learning_rate": 0.00015534608409197592, + "loss": 0.2167, + "step": 695 + }, + { + "epoch": 0.3351592126647806, + "grad_norm": 0.4214036464691162, + "learning_rate": 0.00015521609850791004, + "loss": 0.3308, + "step": 696 + }, + { + "epoch": 0.33564076325768977, + "grad_norm": 0.7219396829605103, + "learning_rate": 0.0001550859785711308, + "loss": 0.5435, + "step": 697 + }, + { + "epoch": 0.33612231385059893, + "grad_norm": 0.6601801514625549, + "learning_rate": 0.0001549557245982482, + "loss": 0.8963, + "step": 698 + }, + { + "epoch": 0.3366038644435081, + "grad_norm": 1.0407145023345947, + "learning_rate": 0.00015482533690619837, + "loss": 0.2903, + "step": 699 + }, + { + "epoch": 0.33708541503641726, + "grad_norm": 0.6162377595901489, + "learning_rate": 0.00015469481581224272, + "loss": 0.2887, + "step": 700 + }, + { + "epoch": 0.3375669656293264, + "grad_norm": 0.813620388507843, + "learning_rate": 0.0001545641616339673, + "loss": 0.5469, + "step": 701 + }, + { + "epoch": 0.3380485162222356, + "grad_norm": 1.1486362218856812, + "learning_rate": 0.00015443337468928206, + "loss": 0.2534, + "step": 702 + }, + { + "epoch": 0.33853006681514475, + "grad_norm": 0.29715338349342346, + "learning_rate": 0.00015430245529641986, + "loss": 0.1138, + "step": 703 + }, + { + "epoch": 0.3390116174080539, + "grad_norm": 0.999326765537262, + "learning_rate": 0.00015417140377393596, + "loss": 0.5482, + "step": 704 + }, + { + "epoch": 0.3394931680009631, + "grad_norm": 2.416579246520996, + "learning_rate": 0.00015404022044070704, + "loss": 0.3981, + "step": 705 + }, + { + "epoch": 0.3399747185938723, + "grad_norm": 0.8240935802459717, + "learning_rate": 0.00015390890561593052, + "loss": 0.4912, + "step": 706 + }, + { + "epoch": 0.34045626918678146, + "grad_norm": 0.7639948725700378, + "learning_rate": 0.0001537774596191238, + "loss": 0.5106, + "step": 707 + }, + { + "epoch": 0.3409378197796906, + "grad_norm": 1.1090484857559204, + "learning_rate": 0.00015364588277012344, + "loss": 0.575, + "step": 708 + }, + { + "epoch": 0.3414193703725998, + "grad_norm": 1.443395733833313, + "learning_rate": 0.00015351417538908435, + "loss": 0.7842, + "step": 709 + }, + { + "epoch": 0.34190092096550895, + "grad_norm": 0.85762619972229, + "learning_rate": 0.0001533823377964791, + "loss": 0.4135, + "step": 710 + }, + { + "epoch": 0.3423824715584181, + "grad_norm": 1.1097861528396606, + "learning_rate": 0.00015325037031309704, + "loss": 0.6141, + "step": 711 + }, + { + "epoch": 0.3428640221513273, + "grad_norm": 0.4248318374156952, + "learning_rate": 0.00015311827326004363, + "loss": 0.3395, + "step": 712 + }, + { + "epoch": 0.34334557274423644, + "grad_norm": 0.5352758169174194, + "learning_rate": 0.0001529860469587396, + "loss": 0.5002, + "step": 713 + }, + { + "epoch": 0.3438271233371456, + "grad_norm": 0.6550387144088745, + "learning_rate": 0.00015285369173092015, + "loss": 0.2643, + "step": 714 + }, + { + "epoch": 0.34430867393005476, + "grad_norm": 0.827717661857605, + "learning_rate": 0.00015272120789863413, + "loss": 0.6702, + "step": 715 + }, + { + "epoch": 0.34479022452296393, + "grad_norm": 1.4903684854507446, + "learning_rate": 0.00015258859578424342, + "loss": 0.8466, + "step": 716 + }, + { + "epoch": 0.3452717751158731, + "grad_norm": 0.8539415597915649, + "learning_rate": 0.00015245585571042194, + "loss": 0.3473, + "step": 717 + }, + { + "epoch": 0.34575332570878226, + "grad_norm": 1.0758845806121826, + "learning_rate": 0.00015232298800015506, + "loss": 0.3126, + "step": 718 + }, + { + "epoch": 0.3462348763016914, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.00015218999297673862, + "loss": 0.3093, + "step": 719 + }, + { + "epoch": 0.34671642689460064, + "grad_norm": 1.1279000043869019, + "learning_rate": 0.0001520568709637783, + "loss": 0.5696, + "step": 720 + }, + { + "epoch": 0.3471979774875098, + "grad_norm": 0.9293486475944519, + "learning_rate": 0.00015192362228518875, + "loss": 0.362, + "step": 721 + }, + { + "epoch": 0.34767952808041896, + "grad_norm": 0.48664113879203796, + "learning_rate": 0.00015179024726519284, + "loss": 0.5424, + "step": 722 + }, + { + "epoch": 0.34816107867332813, + "grad_norm": 0.8383696675300598, + "learning_rate": 0.00015165674622832085, + "loss": 0.2303, + "step": 723 + }, + { + "epoch": 0.3486426292662373, + "grad_norm": 1.7233173847198486, + "learning_rate": 0.0001515231194994097, + "loss": 1.0983, + "step": 724 + }, + { + "epoch": 0.34912417985914646, + "grad_norm": 0.8514331579208374, + "learning_rate": 0.00015138936740360207, + "loss": 0.7108, + "step": 725 + }, + { + "epoch": 0.3496057304520556, + "grad_norm": 0.3969762921333313, + "learning_rate": 0.00015125549026634585, + "loss": 0.188, + "step": 726 + }, + { + "epoch": 0.3500872810449648, + "grad_norm": 0.815380334854126, + "learning_rate": 0.00015112148841339295, + "loss": 0.6904, + "step": 727 + }, + { + "epoch": 0.35056883163787395, + "grad_norm": 0.595610499382019, + "learning_rate": 0.000150987362170799, + "loss": 0.4129, + "step": 728 + }, + { + "epoch": 0.3510503822307831, + "grad_norm": 1.0688610076904297, + "learning_rate": 0.00015085311186492206, + "loss": 0.8911, + "step": 729 + }, + { + "epoch": 0.3515319328236923, + "grad_norm": 0.879959762096405, + "learning_rate": 0.00015071873782242223, + "loss": 0.2627, + "step": 730 + }, + { + "epoch": 0.35201348341660144, + "grad_norm": 1.2716305255889893, + "learning_rate": 0.0001505842403702606, + "loss": 0.5803, + "step": 731 + }, + { + "epoch": 0.3524950340095106, + "grad_norm": 0.6808298826217651, + "learning_rate": 0.00015044961983569856, + "loss": 0.4929, + "step": 732 + }, + { + "epoch": 0.3529765846024198, + "grad_norm": 1.200664758682251, + "learning_rate": 0.00015031487654629702, + "loss": 0.2996, + "step": 733 + }, + { + "epoch": 0.353458135195329, + "grad_norm": 0.5520739555358887, + "learning_rate": 0.00015018001082991553, + "loss": 0.231, + "step": 734 + }, + { + "epoch": 0.35393968578823815, + "grad_norm": 0.7161489129066467, + "learning_rate": 0.0001500450230147116, + "loss": 0.2878, + "step": 735 + }, + { + "epoch": 0.3544212363811473, + "grad_norm": 0.606476366519928, + "learning_rate": 0.00014990991342913974, + "loss": 0.8334, + "step": 736 + }, + { + "epoch": 0.35490278697405647, + "grad_norm": 0.9754736423492432, + "learning_rate": 0.00014977468240195084, + "loss": 0.5581, + "step": 737 + }, + { + "epoch": 0.35538433756696564, + "grad_norm": 1.0018997192382812, + "learning_rate": 0.0001496393302621912, + "loss": 1.0035, + "step": 738 + }, + { + "epoch": 0.3558658881598748, + "grad_norm": 0.8273734450340271, + "learning_rate": 0.00014950385733920188, + "loss": 0.3748, + "step": 739 + }, + { + "epoch": 0.35634743875278396, + "grad_norm": 0.5065658092498779, + "learning_rate": 0.00014936826396261783, + "loss": 0.4041, + "step": 740 + }, + { + "epoch": 0.3568289893456931, + "grad_norm": 1.5734803676605225, + "learning_rate": 0.00014923255046236705, + "loss": 0.7453, + "step": 741 + }, + { + "epoch": 0.3573105399386023, + "grad_norm": 0.8833112120628357, + "learning_rate": 0.00014909671716866984, + "loss": 0.5966, + "step": 742 + }, + { + "epoch": 0.35779209053151145, + "grad_norm": 0.7103949189186096, + "learning_rate": 0.00014896076441203802, + "loss": 0.8407, + "step": 743 + }, + { + "epoch": 0.3582736411244206, + "grad_norm": 0.9285472631454468, + "learning_rate": 0.000148824692523274, + "loss": 0.7159, + "step": 744 + }, + { + "epoch": 0.3587551917173298, + "grad_norm": 0.969674825668335, + "learning_rate": 0.0001486885018334702, + "loss": 0.7394, + "step": 745 + }, + { + "epoch": 0.35923674231023894, + "grad_norm": 1.1744005680084229, + "learning_rate": 0.00014855219267400797, + "loss": 0.5905, + "step": 746 + }, + { + "epoch": 0.35971829290314816, + "grad_norm": 0.7571550011634827, + "learning_rate": 0.00014841576537655705, + "loss": 0.4096, + "step": 747 + }, + { + "epoch": 0.3601998434960573, + "grad_norm": 1.1209489107131958, + "learning_rate": 0.00014827922027307451, + "loss": 0.3337, + "step": 748 + }, + { + "epoch": 0.3606813940889665, + "grad_norm": 0.46004024147987366, + "learning_rate": 0.00014814255769580415, + "loss": 0.24, + "step": 749 + }, + { + "epoch": 0.36116294468187565, + "grad_norm": 1.329910159111023, + "learning_rate": 0.00014800577797727558, + "loss": 0.7595, + "step": 750 + }, + { + "epoch": 0.3616444952747848, + "grad_norm": 2.1548874378204346, + "learning_rate": 0.00014786888145030343, + "loss": 0.6889, + "step": 751 + }, + { + "epoch": 0.362126045867694, + "grad_norm": 0.367524653673172, + "learning_rate": 0.0001477318684479866, + "loss": 0.1096, + "step": 752 + }, + { + "epoch": 0.36260759646060314, + "grad_norm": 2.3556947708129883, + "learning_rate": 0.00014759473930370736, + "loss": 0.4795, + "step": 753 + }, + { + "epoch": 0.3630891470535123, + "grad_norm": 0.5870598554611206, + "learning_rate": 0.0001474574943511306, + "loss": 0.6601, + "step": 754 + }, + { + "epoch": 0.36357069764642147, + "grad_norm": 1.6864088773727417, + "learning_rate": 0.0001473201339242029, + "loss": 0.9885, + "step": 755 + }, + { + "epoch": 0.36405224823933063, + "grad_norm": 18.12716293334961, + "learning_rate": 0.000147182658357152, + "loss": 0.5065, + "step": 756 + }, + { + "epoch": 0.3645337988322398, + "grad_norm": 0.4911274313926697, + "learning_rate": 0.00014704506798448566, + "loss": 0.3673, + "step": 757 + }, + { + "epoch": 0.36501534942514896, + "grad_norm": 2.6706955432891846, + "learning_rate": 0.00014690736314099101, + "loss": 0.6796, + "step": 758 + }, + { + "epoch": 0.3654969000180581, + "grad_norm": 0.5427619814872742, + "learning_rate": 0.00014676954416173373, + "loss": 0.4817, + "step": 759 + }, + { + "epoch": 0.3659784506109673, + "grad_norm": 0.9320172071456909, + "learning_rate": 0.00014663161138205724, + "loss": 0.7076, + "step": 760 + }, + { + "epoch": 0.3664600012038765, + "grad_norm": 1.1050776243209839, + "learning_rate": 0.00014649356513758176, + "loss": 0.6332, + "step": 761 + }, + { + "epoch": 0.36694155179678567, + "grad_norm": 0.5952473878860474, + "learning_rate": 0.00014635540576420374, + "loss": 0.4186, + "step": 762 + }, + { + "epoch": 0.36742310238969483, + "grad_norm": 0.5455112457275391, + "learning_rate": 0.0001462171335980948, + "loss": 0.209, + "step": 763 + }, + { + "epoch": 0.367904652982604, + "grad_norm": 0.918086051940918, + "learning_rate": 0.00014607874897570105, + "loss": 0.3373, + "step": 764 + }, + { + "epoch": 0.36838620357551316, + "grad_norm": 0.8120787143707275, + "learning_rate": 0.0001459402522337422, + "loss": 0.3566, + "step": 765 + }, + { + "epoch": 0.3688677541684223, + "grad_norm": 0.9667643308639526, + "learning_rate": 0.00014580164370921078, + "loss": 0.3856, + "step": 766 + }, + { + "epoch": 0.3693493047613315, + "grad_norm": 1.1362155675888062, + "learning_rate": 0.0001456629237393713, + "loss": 0.409, + "step": 767 + }, + { + "epoch": 0.36983085535424065, + "grad_norm": 0.469622939825058, + "learning_rate": 0.00014552409266175952, + "loss": 0.1831, + "step": 768 + }, + { + "epoch": 0.3703124059471498, + "grad_norm": 0.7247708439826965, + "learning_rate": 0.00014538515081418142, + "loss": 0.4395, + "step": 769 + }, + { + "epoch": 0.370793956540059, + "grad_norm": 0.9316056966781616, + "learning_rate": 0.00014524609853471264, + "loss": 0.5586, + "step": 770 + }, + { + "epoch": 0.37127550713296814, + "grad_norm": 0.8127897381782532, + "learning_rate": 0.00014510693616169741, + "loss": 0.8475, + "step": 771 + }, + { + "epoch": 0.3717570577258773, + "grad_norm": 0.43745431303977966, + "learning_rate": 0.0001449676640337479, + "loss": 0.7811, + "step": 772 + }, + { + "epoch": 0.37223860831878647, + "grad_norm": 0.3969845175743103, + "learning_rate": 0.00014482828248974335, + "loss": 0.3657, + "step": 773 + }, + { + "epoch": 0.3727201589116957, + "grad_norm": 4.575024127960205, + "learning_rate": 0.00014468879186882916, + "loss": 0.8133, + "step": 774 + }, + { + "epoch": 0.37320170950460485, + "grad_norm": 0.31545108556747437, + "learning_rate": 0.00014454919251041622, + "loss": 0.2088, + "step": 775 + }, + { + "epoch": 0.373683260097514, + "grad_norm": 0.8626468181610107, + "learning_rate": 0.00014440948475418, + "loss": 0.6683, + "step": 776 + }, + { + "epoch": 0.3741648106904232, + "grad_norm": 0.8768938183784485, + "learning_rate": 0.00014426966894005966, + "loss": 0.4917, + "step": 777 + }, + { + "epoch": 0.37464636128333234, + "grad_norm": 0.7053727507591248, + "learning_rate": 0.0001441297454082573, + "loss": 0.6056, + "step": 778 + }, + { + "epoch": 0.3751279118762415, + "grad_norm": 0.5769566893577576, + "learning_rate": 0.00014398971449923722, + "loss": 0.3225, + "step": 779 + }, + { + "epoch": 0.37560946246915067, + "grad_norm": 0.9168880581855774, + "learning_rate": 0.00014384957655372483, + "loss": 0.587, + "step": 780 + }, + { + "epoch": 0.37609101306205983, + "grad_norm": 0.33026906847953796, + "learning_rate": 0.00014370933191270617, + "loss": 0.3309, + "step": 781 + }, + { + "epoch": 0.376572563654969, + "grad_norm": 0.6208348870277405, + "learning_rate": 0.0001435689809174267, + "loss": 0.536, + "step": 782 + }, + { + "epoch": 0.37705411424787816, + "grad_norm": 1.1868008375167847, + "learning_rate": 0.0001434285239093908, + "loss": 0.8623, + "step": 783 + }, + { + "epoch": 0.3775356648407873, + "grad_norm": 0.3612160384654999, + "learning_rate": 0.00014328796123036071, + "loss": 0.2214, + "step": 784 + }, + { + "epoch": 0.3780172154336965, + "grad_norm": 0.44610145688056946, + "learning_rate": 0.0001431472932223559, + "loss": 0.5085, + "step": 785 + }, + { + "epoch": 0.37849876602660565, + "grad_norm": 0.40755540132522583, + "learning_rate": 0.00014300652022765207, + "loss": 0.2879, + "step": 786 + }, + { + "epoch": 0.3789803166195148, + "grad_norm": 1.5538909435272217, + "learning_rate": 0.00014286564258878033, + "loss": 0.7417, + "step": 787 + }, + { + "epoch": 0.37946186721242403, + "grad_norm": 1.058793306350708, + "learning_rate": 0.00014272466064852644, + "loss": 0.3202, + "step": 788 + }, + { + "epoch": 0.3799434178053332, + "grad_norm": 1.0490062236785889, + "learning_rate": 0.00014258357474993, + "loss": 0.5987, + "step": 789 + }, + { + "epoch": 0.38042496839824236, + "grad_norm": 1.3995267152786255, + "learning_rate": 0.0001424423852362835, + "loss": 0.6358, + "step": 790 + }, + { + "epoch": 0.3809065189911515, + "grad_norm": 0.8307220339775085, + "learning_rate": 0.00014230109245113158, + "loss": 0.5914, + "step": 791 + }, + { + "epoch": 0.3813880695840607, + "grad_norm": 1.0649659633636475, + "learning_rate": 0.00014215969673827018, + "loss": 0.3306, + "step": 792 + }, + { + "epoch": 0.38186962017696985, + "grad_norm": 1.0158497095108032, + "learning_rate": 0.00014201819844174564, + "loss": 0.2128, + "step": 793 + }, + { + "epoch": 0.382351170769879, + "grad_norm": 0.6281611323356628, + "learning_rate": 0.0001418765979058539, + "loss": 0.5194, + "step": 794 + }, + { + "epoch": 0.3828327213627882, + "grad_norm": 0.882088303565979, + "learning_rate": 0.00014173489547513973, + "loss": 0.8551, + "step": 795 + }, + { + "epoch": 0.38331427195569734, + "grad_norm": 0.7518590092658997, + "learning_rate": 0.00014159309149439582, + "loss": 0.2317, + "step": 796 + }, + { + "epoch": 0.3837958225486065, + "grad_norm": 0.7489838600158691, + "learning_rate": 0.00014145118630866187, + "loss": 0.2913, + "step": 797 + }, + { + "epoch": 0.38427737314151567, + "grad_norm": 0.5977664589881897, + "learning_rate": 0.000141309180263224, + "loss": 0.2069, + "step": 798 + }, + { + "epoch": 0.38475892373442483, + "grad_norm": 0.5772594809532166, + "learning_rate": 0.0001411670737036135, + "loss": 0.3432, + "step": 799 + }, + { + "epoch": 0.385240474327334, + "grad_norm": 1.231155514717102, + "learning_rate": 0.0001410248669756065, + "loss": 0.1778, + "step": 800 + }, + { + "epoch": 0.38572202492024316, + "grad_norm": 1.5435062646865845, + "learning_rate": 0.00014088256042522264, + "loss": 0.4694, + "step": 801 + }, + { + "epoch": 0.3862035755131524, + "grad_norm": 0.568301796913147, + "learning_rate": 0.00014074015439872458, + "loss": 0.4258, + "step": 802 + }, + { + "epoch": 0.38668512610606154, + "grad_norm": 0.6348639130592346, + "learning_rate": 0.00014059764924261703, + "loss": 0.3922, + "step": 803 + }, + { + "epoch": 0.3871666766989707, + "grad_norm": 0.7134537696838379, + "learning_rate": 0.00014045504530364584, + "loss": 0.2355, + "step": 804 + }, + { + "epoch": 0.38764822729187987, + "grad_norm": 0.7429349422454834, + "learning_rate": 0.00014031234292879725, + "loss": 0.3596, + "step": 805 + }, + { + "epoch": 0.38812977788478903, + "grad_norm": 0.6155477166175842, + "learning_rate": 0.00014016954246529696, + "loss": 0.6571, + "step": 806 + }, + { + "epoch": 0.3886113284776982, + "grad_norm": 1.02437162399292, + "learning_rate": 0.00014002664426060942, + "loss": 0.3357, + "step": 807 + }, + { + "epoch": 0.38909287907060736, + "grad_norm": 1.0629314184188843, + "learning_rate": 0.00013988364866243693, + "loss": 0.5891, + "step": 808 + }, + { + "epoch": 0.3895744296635165, + "grad_norm": 0.8847737908363342, + "learning_rate": 0.00013974055601871868, + "loss": 0.3721, + "step": 809 + }, + { + "epoch": 0.3900559802564257, + "grad_norm": 1.1943687200546265, + "learning_rate": 0.00013959736667762998, + "loss": 0.8358, + "step": 810 + }, + { + "epoch": 0.39053753084933485, + "grad_norm": 0.6684376001358032, + "learning_rate": 0.00013945408098758156, + "loss": 0.557, + "step": 811 + }, + { + "epoch": 0.391019081442244, + "grad_norm": 0.4766930043697357, + "learning_rate": 0.0001393106992972184, + "loss": 0.3879, + "step": 812 + }, + { + "epoch": 0.3915006320351532, + "grad_norm": 0.43043360114097595, + "learning_rate": 0.00013916722195541926, + "loss": 0.2223, + "step": 813 + }, + { + "epoch": 0.39198218262806234, + "grad_norm": 1.0288745164871216, + "learning_rate": 0.00013902364931129557, + "loss": 0.3912, + "step": 814 + }, + { + "epoch": 0.39246373322097156, + "grad_norm": 0.7389081120491028, + "learning_rate": 0.00013887998171419058, + "loss": 0.6005, + "step": 815 + }, + { + "epoch": 0.3929452838138807, + "grad_norm": 1.1643013954162598, + "learning_rate": 0.00013873621951367862, + "loss": 0.9454, + "step": 816 + }, + { + "epoch": 0.3934268344067899, + "grad_norm": 1.0005336999893188, + "learning_rate": 0.00013859236305956425, + "loss": 0.5143, + "step": 817 + }, + { + "epoch": 0.39390838499969905, + "grad_norm": 1.062585473060608, + "learning_rate": 0.00013844841270188132, + "loss": 0.6584, + "step": 818 + }, + { + "epoch": 0.3943899355926082, + "grad_norm": 0.48092156648635864, + "learning_rate": 0.00013830436879089228, + "loss": 0.5506, + "step": 819 + }, + { + "epoch": 0.3948714861855174, + "grad_norm": 0.4571566581726074, + "learning_rate": 0.00013816023167708704, + "loss": 0.2258, + "step": 820 + }, + { + "epoch": 0.39535303677842654, + "grad_norm": 0.30784401297569275, + "learning_rate": 0.00013801600171118244, + "loss": 0.3428, + "step": 821 + }, + { + "epoch": 0.3958345873713357, + "grad_norm": 1.430720567703247, + "learning_rate": 0.00013787167924412112, + "loss": 0.6646, + "step": 822 + }, + { + "epoch": 0.39631613796424486, + "grad_norm": 0.7564586997032166, + "learning_rate": 0.0001377272646270709, + "loss": 0.3042, + "step": 823 + }, + { + "epoch": 0.396797688557154, + "grad_norm": 0.9115868806838989, + "learning_rate": 0.00013758275821142382, + "loss": 0.2931, + "step": 824 + }, + { + "epoch": 0.3972792391500632, + "grad_norm": 1.3971993923187256, + "learning_rate": 0.00013743816034879523, + "loss": 0.3297, + "step": 825 + }, + { + "epoch": 0.39776078974297235, + "grad_norm": 4.245695114135742, + "learning_rate": 0.000137293471391023, + "loss": 0.8094, + "step": 826 + }, + { + "epoch": 0.3982423403358815, + "grad_norm": 0.7202475666999817, + "learning_rate": 0.00013714869169016667, + "loss": 0.2702, + "step": 827 + }, + { + "epoch": 0.3987238909287907, + "grad_norm": 0.41715875267982483, + "learning_rate": 0.00013700382159850656, + "loss": 0.1824, + "step": 828 + }, + { + "epoch": 0.3992054415216999, + "grad_norm": 1.313673734664917, + "learning_rate": 0.00013685886146854297, + "loss": 0.3776, + "step": 829 + }, + { + "epoch": 0.39968699211460906, + "grad_norm": 1.509114384651184, + "learning_rate": 0.00013671381165299525, + "loss": 0.5278, + "step": 830 + }, + { + "epoch": 0.4001685427075182, + "grad_norm": 0.9306924939155579, + "learning_rate": 0.00013656867250480098, + "loss": 0.5351, + "step": 831 + }, + { + "epoch": 0.4006500933004274, + "grad_norm": 0.5094513297080994, + "learning_rate": 0.00013642344437711512, + "loss": 0.2092, + "step": 832 + }, + { + "epoch": 0.40113164389333655, + "grad_norm": 0.696160078048706, + "learning_rate": 0.00013627812762330912, + "loss": 0.6603, + "step": 833 + }, + { + "epoch": 0.4016131944862457, + "grad_norm": 0.7971479892730713, + "learning_rate": 0.00013613272259697007, + "loss": 0.739, + "step": 834 + }, + { + "epoch": 0.4020947450791549, + "grad_norm": 0.8289906978607178, + "learning_rate": 0.00013598722965189986, + "loss": 0.8819, + "step": 835 + }, + { + "epoch": 0.40257629567206404, + "grad_norm": 0.8531100749969482, + "learning_rate": 0.0001358416491421143, + "loss": 0.6332, + "step": 836 + }, + { + "epoch": 0.4030578462649732, + "grad_norm": 0.7515624165534973, + "learning_rate": 0.00013569598142184225, + "loss": 0.7196, + "step": 837 + }, + { + "epoch": 0.40353939685788237, + "grad_norm": 0.8925199508666992, + "learning_rate": 0.00013555022684552483, + "loss": 1.0533, + "step": 838 + }, + { + "epoch": 0.40402094745079153, + "grad_norm": 0.9382127523422241, + "learning_rate": 0.00013540438576781441, + "loss": 0.5954, + "step": 839 + }, + { + "epoch": 0.4045024980437007, + "grad_norm": 0.7742651104927063, + "learning_rate": 0.0001352584585435739, + "loss": 0.7777, + "step": 840 + }, + { + "epoch": 0.40498404863660986, + "grad_norm": 0.7185132503509521, + "learning_rate": 0.00013511244552787583, + "loss": 0.2072, + "step": 841 + }, + { + "epoch": 0.405465599229519, + "grad_norm": 1.293835163116455, + "learning_rate": 0.00013496634707600147, + "loss": 0.6124, + "step": 842 + }, + { + "epoch": 0.40594714982242824, + "grad_norm": 0.7155901789665222, + "learning_rate": 0.0001348201635434399, + "loss": 0.4266, + "step": 843 + }, + { + "epoch": 0.4064287004153374, + "grad_norm": 1.3010177612304688, + "learning_rate": 0.0001346738952858873, + "loss": 0.9076, + "step": 844 + }, + { + "epoch": 0.40691025100824657, + "grad_norm": 0.3817487955093384, + "learning_rate": 0.000134527542659246, + "loss": 1.4111, + "step": 845 + }, + { + "epoch": 0.40739180160115573, + "grad_norm": 0.8231790661811829, + "learning_rate": 0.00013438110601962362, + "loss": 0.7397, + "step": 846 + }, + { + "epoch": 0.4078733521940649, + "grad_norm": 0.5291070342063904, + "learning_rate": 0.00013423458572333214, + "loss": 0.7572, + "step": 847 + }, + { + "epoch": 0.40835490278697406, + "grad_norm": 0.3156173527240753, + "learning_rate": 0.0001340879821268872, + "loss": 0.2825, + "step": 848 + }, + { + "epoch": 0.4088364533798832, + "grad_norm": 1.1031380891799927, + "learning_rate": 0.000133941295587007, + "loss": 0.2755, + "step": 849 + }, + { + "epoch": 0.4093180039727924, + "grad_norm": 0.8826498985290527, + "learning_rate": 0.00013379452646061164, + "loss": 0.183, + "step": 850 + }, + { + "epoch": 0.40979955456570155, + "grad_norm": 0.3365316390991211, + "learning_rate": 0.0001336476751048222, + "loss": 0.4803, + "step": 851 + }, + { + "epoch": 0.4102811051586107, + "grad_norm": 1.348456621170044, + "learning_rate": 0.00013350074187695979, + "loss": 0.4099, + "step": 852 + }, + { + "epoch": 0.4107626557515199, + "grad_norm": 0.9401395320892334, + "learning_rate": 0.00013335372713454467, + "loss": 0.4082, + "step": 853 + }, + { + "epoch": 0.41124420634442904, + "grad_norm": 0.8428338766098022, + "learning_rate": 0.0001332066312352956, + "loss": 0.26, + "step": 854 + }, + { + "epoch": 0.4117257569373382, + "grad_norm": 0.9355554580688477, + "learning_rate": 0.00013305945453712868, + "loss": 0.7623, + "step": 855 + }, + { + "epoch": 0.4122073075302474, + "grad_norm": 0.7551222443580627, + "learning_rate": 0.0001329121973981567, + "loss": 0.8114, + "step": 856 + }, + { + "epoch": 0.4126888581231566, + "grad_norm": 0.9181913733482361, + "learning_rate": 0.00013276486017668807, + "loss": 0.5794, + "step": 857 + }, + { + "epoch": 0.41317040871606575, + "grad_norm": 0.7318640351295471, + "learning_rate": 0.0001326174432312262, + "loss": 0.8141, + "step": 858 + }, + { + "epoch": 0.4136519593089749, + "grad_norm": 1.257271409034729, + "learning_rate": 0.00013246994692046836, + "loss": 0.6784, + "step": 859 + }, + { + "epoch": 0.4141335099018841, + "grad_norm": 1.720740795135498, + "learning_rate": 0.000132322371603305, + "loss": 0.5045, + "step": 860 + }, + { + "epoch": 0.41461506049479324, + "grad_norm": 0.9632263779640198, + "learning_rate": 0.0001321747176388188, + "loss": 0.3151, + "step": 861 + }, + { + "epoch": 0.4150966110877024, + "grad_norm": 0.6033685207366943, + "learning_rate": 0.00013202698538628376, + "loss": 0.6374, + "step": 862 + }, + { + "epoch": 0.41557816168061157, + "grad_norm": 0.4648841917514801, + "learning_rate": 0.00013187917520516448, + "loss": 0.4025, + "step": 863 + }, + { + "epoch": 0.41605971227352073, + "grad_norm": 0.4462638795375824, + "learning_rate": 0.00013173128745511508, + "loss": 0.6247, + "step": 864 + }, + { + "epoch": 0.4165412628664299, + "grad_norm": 0.8332130908966064, + "learning_rate": 0.0001315833224959784, + "loss": 0.4929, + "step": 865 + }, + { + "epoch": 0.41702281345933906, + "grad_norm": 0.7156938314437866, + "learning_rate": 0.00013143528068778525, + "loss": 0.9011, + "step": 866 + }, + { + "epoch": 0.4175043640522482, + "grad_norm": 0.7190894484519958, + "learning_rate": 0.00013128716239075338, + "loss": 0.2251, + "step": 867 + }, + { + "epoch": 0.4179859146451574, + "grad_norm": 0.8374449610710144, + "learning_rate": 0.00013113896796528664, + "loss": 0.5085, + "step": 868 + }, + { + "epoch": 0.41846746523806655, + "grad_norm": 0.6537851691246033, + "learning_rate": 0.00013099069777197412, + "loss": 0.5477, + "step": 869 + }, + { + "epoch": 0.41894901583097577, + "grad_norm": 0.8489950895309448, + "learning_rate": 0.0001308423521715893, + "loss": 0.5662, + "step": 870 + }, + { + "epoch": 0.41943056642388493, + "grad_norm": 0.5562260746955872, + "learning_rate": 0.00013069393152508906, + "loss": 0.5672, + "step": 871 + }, + { + "epoch": 0.4199121170167941, + "grad_norm": 0.32547298073768616, + "learning_rate": 0.00013054543619361303, + "loss": 0.2162, + "step": 872 + }, + { + "epoch": 0.42039366760970326, + "grad_norm": 0.6085301637649536, + "learning_rate": 0.0001303968665384824, + "loss": 0.5814, + "step": 873 + }, + { + "epoch": 0.4208752182026124, + "grad_norm": 0.4705708622932434, + "learning_rate": 0.00013024822292119934, + "loss": 0.112, + "step": 874 + }, + { + "epoch": 0.4213567687955216, + "grad_norm": 1.040982961654663, + "learning_rate": 0.0001300995057034459, + "loss": 0.5431, + "step": 875 + }, + { + "epoch": 0.42183831938843075, + "grad_norm": 0.4855748414993286, + "learning_rate": 0.00012995071524708325, + "loss": 0.313, + "step": 876 + }, + { + "epoch": 0.4223198699813399, + "grad_norm": 1.086227297782898, + "learning_rate": 0.00012980185191415074, + "loss": 0.4464, + "step": 877 + }, + { + "epoch": 0.4228014205742491, + "grad_norm": 0.7331801652908325, + "learning_rate": 0.0001296529160668651, + "loss": 0.3815, + "step": 878 + }, + { + "epoch": 0.42328297116715824, + "grad_norm": 0.5674242377281189, + "learning_rate": 0.00012950390806761944, + "loss": 0.4583, + "step": 879 + }, + { + "epoch": 0.4237645217600674, + "grad_norm": 0.424491822719574, + "learning_rate": 0.0001293548282789825, + "loss": 0.1895, + "step": 880 + }, + { + "epoch": 0.42424607235297657, + "grad_norm": 1.1760071516036987, + "learning_rate": 0.00012920567706369758, + "loss": 0.9323, + "step": 881 + }, + { + "epoch": 0.42472762294588573, + "grad_norm": 0.36800864338874817, + "learning_rate": 0.00012905645478468192, + "loss": 0.3391, + "step": 882 + }, + { + "epoch": 0.4252091735387949, + "grad_norm": 0.8725740909576416, + "learning_rate": 0.00012890716180502564, + "loss": 0.3287, + "step": 883 + }, + { + "epoch": 0.4256907241317041, + "grad_norm": 0.9771085381507874, + "learning_rate": 0.00012875779848799078, + "loss": 0.8337, + "step": 884 + }, + { + "epoch": 0.4261722747246133, + "grad_norm": 2.2332262992858887, + "learning_rate": 0.00012860836519701063, + "loss": 0.807, + "step": 885 + }, + { + "epoch": 0.42665382531752244, + "grad_norm": 0.37155234813690186, + "learning_rate": 0.00012845886229568873, + "loss": 0.5302, + "step": 886 + }, + { + "epoch": 0.4271353759104316, + "grad_norm": 0.9327018857002258, + "learning_rate": 0.00012830929014779797, + "loss": 0.7355, + "step": 887 + }, + { + "epoch": 0.42761692650334077, + "grad_norm": 0.7010428309440613, + "learning_rate": 0.0001281596491172797, + "loss": 0.1841, + "step": 888 + }, + { + "epoch": 0.42809847709624993, + "grad_norm": 1.0356031656265259, + "learning_rate": 0.00012800993956824303, + "loss": 0.804, + "step": 889 + }, + { + "epoch": 0.4285800276891591, + "grad_norm": 0.6280069947242737, + "learning_rate": 0.00012786016186496358, + "loss": 0.3072, + "step": 890 + }, + { + "epoch": 0.42906157828206826, + "grad_norm": 0.6832895278930664, + "learning_rate": 0.000127710316371883, + "loss": 0.296, + "step": 891 + }, + { + "epoch": 0.4295431288749774, + "grad_norm": 0.34949618577957153, + "learning_rate": 0.0001275604034536077, + "loss": 0.2742, + "step": 892 + }, + { + "epoch": 0.4300246794678866, + "grad_norm": 0.8701428174972534, + "learning_rate": 0.0001274104234749083, + "loss": 0.3236, + "step": 893 + }, + { + "epoch": 0.43050623006079575, + "grad_norm": 0.5376414656639099, + "learning_rate": 0.00012726037680071853, + "loss": 0.3901, + "step": 894 + }, + { + "epoch": 0.4309877806537049, + "grad_norm": 0.7584906816482544, + "learning_rate": 0.00012711026379613434, + "loss": 0.3883, + "step": 895 + }, + { + "epoch": 0.4314693312466141, + "grad_norm": 0.40378719568252563, + "learning_rate": 0.00012696008482641325, + "loss": 0.2355, + "step": 896 + }, + { + "epoch": 0.43195088183952324, + "grad_norm": 0.4962182939052582, + "learning_rate": 0.00012680984025697313, + "loss": 0.2314, + "step": 897 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 0.8789747953414917, + "learning_rate": 0.00012665953045339152, + "loss": 0.5726, + "step": 898 + }, + { + "epoch": 0.4329139830253416, + "grad_norm": 0.6019532680511475, + "learning_rate": 0.0001265091557814047, + "loss": 0.5493, + "step": 899 + }, + { + "epoch": 0.4333955336182508, + "grad_norm": 0.8721777200698853, + "learning_rate": 0.00012635871660690676, + "loss": 0.2309, + "step": 900 + }, + { + "epoch": 0.43387708421115995, + "grad_norm": 0.45335182547569275, + "learning_rate": 0.0001262082132959488, + "loss": 0.4267, + "step": 901 + }, + { + "epoch": 0.4343586348040691, + "grad_norm": 0.8860352635383606, + "learning_rate": 0.00012605764621473792, + "loss": 0.5802, + "step": 902 + }, + { + "epoch": 0.4348401853969783, + "grad_norm": 1.1893390417099, + "learning_rate": 0.00012590701572963642, + "loss": 0.6207, + "step": 903 + }, + { + "epoch": 0.43532173598988744, + "grad_norm": 0.8116459846496582, + "learning_rate": 0.00012575632220716078, + "loss": 0.5837, + "step": 904 + }, + { + "epoch": 0.4358032865827966, + "grad_norm": 0.6930440664291382, + "learning_rate": 0.000125605566013981, + "loss": 0.6638, + "step": 905 + }, + { + "epoch": 0.43628483717570576, + "grad_norm": 1.0739424228668213, + "learning_rate": 0.00012545474751691953, + "loss": 0.737, + "step": 906 + }, + { + "epoch": 0.4367663877686149, + "grad_norm": 0.6963366270065308, + "learning_rate": 0.00012530386708295036, + "loss": 0.6046, + "step": 907 + }, + { + "epoch": 0.4372479383615241, + "grad_norm": 0.9975454211235046, + "learning_rate": 0.00012515292507919829, + "loss": 0.6361, + "step": 908 + }, + { + "epoch": 0.43772948895443325, + "grad_norm": 0.2808389663696289, + "learning_rate": 0.0001250019218729378, + "loss": 0.3588, + "step": 909 + }, + { + "epoch": 0.4382110395473424, + "grad_norm": 0.6349584460258484, + "learning_rate": 0.00012485085783159238, + "loss": 0.2915, + "step": 910 + }, + { + "epoch": 0.43869259014025164, + "grad_norm": 0.8626769185066223, + "learning_rate": 0.00012469973332273354, + "loss": 0.4145, + "step": 911 + }, + { + "epoch": 0.4391741407331608, + "grad_norm": 0.6504733562469482, + "learning_rate": 0.00012454854871407994, + "loss": 0.3794, + "step": 912 + }, + { + "epoch": 0.43965569132606996, + "grad_norm": 0.24684692919254303, + "learning_rate": 0.00012439730437349635, + "loss": 0.1905, + "step": 913 + }, + { + "epoch": 0.4401372419189791, + "grad_norm": 0.34664681553840637, + "learning_rate": 0.00012424600066899302, + "loss": 0.5098, + "step": 914 + }, + { + "epoch": 0.4406187925118883, + "grad_norm": 1.0257880687713623, + "learning_rate": 0.00012409463796872464, + "loss": 0.5341, + "step": 915 + }, + { + "epoch": 0.44110034310479745, + "grad_norm": 0.44256392121315, + "learning_rate": 0.0001239432166409893, + "loss": 0.7754, + "step": 916 + }, + { + "epoch": 0.4415818936977066, + "grad_norm": 1.0881763696670532, + "learning_rate": 0.00012379173705422795, + "loss": 0.7198, + "step": 917 + }, + { + "epoch": 0.4420634442906158, + "grad_norm": 0.43400460481643677, + "learning_rate": 0.00012364019957702315, + "loss": 0.3588, + "step": 918 + }, + { + "epoch": 0.44254499488352494, + "grad_norm": 0.7865203619003296, + "learning_rate": 0.00012348860457809838, + "loss": 0.6539, + "step": 919 + }, + { + "epoch": 0.4430265454764341, + "grad_norm": 0.45927029848098755, + "learning_rate": 0.00012333695242631705, + "loss": 0.5931, + "step": 920 + }, + { + "epoch": 0.44350809606934327, + "grad_norm": 1.8216434717178345, + "learning_rate": 0.0001231852434906817, + "loss": 0.6212, + "step": 921 + }, + { + "epoch": 0.44398964666225244, + "grad_norm": 0.6446197032928467, + "learning_rate": 0.00012303347814033292, + "loss": 0.3774, + "step": 922 + }, + { + "epoch": 0.4444711972551616, + "grad_norm": 0.7606348991394043, + "learning_rate": 0.0001228816567445487, + "loss": 0.5753, + "step": 923 + }, + { + "epoch": 0.44495274784807076, + "grad_norm": 0.8907598853111267, + "learning_rate": 0.0001227297796727433, + "loss": 0.4346, + "step": 924 + }, + { + "epoch": 0.44543429844098, + "grad_norm": 0.5921023488044739, + "learning_rate": 0.00012257784729446656, + "loss": 0.5826, + "step": 925 + }, + { + "epoch": 0.44591584903388914, + "grad_norm": 0.5872268676757812, + "learning_rate": 0.00012242585997940275, + "loss": 0.4471, + "step": 926 + }, + { + "epoch": 0.4463973996267983, + "grad_norm": 0.48712849617004395, + "learning_rate": 0.0001222738180973699, + "loss": 0.4569, + "step": 927 + }, + { + "epoch": 0.44687895021970747, + "grad_norm": 1.1745750904083252, + "learning_rate": 0.00012212172201831885, + "loss": 0.4972, + "step": 928 + }, + { + "epoch": 0.44736050081261663, + "grad_norm": 0.7469773292541504, + "learning_rate": 0.00012196957211233222, + "loss": 0.5233, + "step": 929 + }, + { + "epoch": 0.4478420514055258, + "grad_norm": 0.8049497008323669, + "learning_rate": 0.00012181736874962371, + "loss": 0.9911, + "step": 930 + }, + { + "epoch": 0.44832360199843496, + "grad_norm": 2.365861415863037, + "learning_rate": 0.00012166511230053696, + "loss": 0.4886, + "step": 931 + }, + { + "epoch": 0.4488051525913441, + "grad_norm": 0.8397821187973022, + "learning_rate": 0.00012151280313554486, + "loss": 0.8172, + "step": 932 + }, + { + "epoch": 0.4492867031842533, + "grad_norm": 1.2924667596817017, + "learning_rate": 0.00012136044162524858, + "loss": 0.7059, + "step": 933 + }, + { + "epoch": 0.44976825377716245, + "grad_norm": 0.3638656735420227, + "learning_rate": 0.00012120802814037663, + "loss": 0.2305, + "step": 934 + }, + { + "epoch": 0.4502498043700716, + "grad_norm": 0.5385504364967346, + "learning_rate": 0.00012105556305178399, + "loss": 0.3047, + "step": 935 + }, + { + "epoch": 0.4507313549629808, + "grad_norm": 0.518642008304596, + "learning_rate": 0.00012090304673045123, + "loss": 0.3156, + "step": 936 + }, + { + "epoch": 0.45121290555588994, + "grad_norm": 0.8530938029289246, + "learning_rate": 0.00012075047954748353, + "loss": 0.7173, + "step": 937 + }, + { + "epoch": 0.4516944561487991, + "grad_norm": 1.7362003326416016, + "learning_rate": 0.00012059786187410984, + "loss": 0.7523, + "step": 938 + }, + { + "epoch": 0.4521760067417083, + "grad_norm": 0.8717551827430725, + "learning_rate": 0.000120445194081682, + "loss": 0.3223, + "step": 939 + }, + { + "epoch": 0.4526575573346175, + "grad_norm": 0.6551015973091125, + "learning_rate": 0.00012029247654167379, + "loss": 0.1679, + "step": 940 + }, + { + "epoch": 0.45313910792752665, + "grad_norm": 0.40582114458084106, + "learning_rate": 0.00012013970962568002, + "loss": 0.311, + "step": 941 + }, + { + "epoch": 0.4536206585204358, + "grad_norm": 0.5943724513053894, + "learning_rate": 0.00011998689370541562, + "loss": 0.4904, + "step": 942 + }, + { + "epoch": 0.454102209113345, + "grad_norm": 0.5120828151702881, + "learning_rate": 0.00011983402915271478, + "loss": 0.2732, + "step": 943 + }, + { + "epoch": 0.45458375970625414, + "grad_norm": 1.7192312479019165, + "learning_rate": 0.00011968111633953007, + "loss": 0.2739, + "step": 944 + }, + { + "epoch": 0.4550653102991633, + "grad_norm": 0.7463230490684509, + "learning_rate": 0.0001195281556379314, + "loss": 0.403, + "step": 945 + }, + { + "epoch": 0.45554686089207247, + "grad_norm": 0.4721458852291107, + "learning_rate": 0.0001193751474201053, + "loss": 0.1038, + "step": 946 + }, + { + "epoch": 0.45602841148498163, + "grad_norm": 0.667733907699585, + "learning_rate": 0.00011922209205835382, + "loss": 0.1751, + "step": 947 + }, + { + "epoch": 0.4565099620778908, + "grad_norm": 1.2947291135787964, + "learning_rate": 0.0001190689899250938, + "loss": 0.6013, + "step": 948 + }, + { + "epoch": 0.45699151267079996, + "grad_norm": 1.1071447134017944, + "learning_rate": 0.00011891584139285582, + "loss": 0.8119, + "step": 949 + }, + { + "epoch": 0.4574730632637091, + "grad_norm": 0.6729226112365723, + "learning_rate": 0.00011876264683428344, + "loss": 0.3608, + "step": 950 + }, + { + "epoch": 0.4579546138566183, + "grad_norm": 0.711564838886261, + "learning_rate": 0.00011860940662213211, + "loss": 0.2355, + "step": 951 + }, + { + "epoch": 0.4584361644495275, + "grad_norm": 0.8322364091873169, + "learning_rate": 0.00011845612112926843, + "loss": 0.6612, + "step": 952 + }, + { + "epoch": 0.45891771504243667, + "grad_norm": 0.6473928093910217, + "learning_rate": 0.00011830279072866921, + "loss": 0.4662, + "step": 953 + }, + { + "epoch": 0.45939926563534583, + "grad_norm": 1.3833335638046265, + "learning_rate": 0.00011814941579342044, + "loss": 0.4493, + "step": 954 + }, + { + "epoch": 0.459880816228255, + "grad_norm": 0.3148946166038513, + "learning_rate": 0.00011799599669671654, + "loss": 0.1462, + "step": 955 + }, + { + "epoch": 0.46036236682116416, + "grad_norm": 1.1402287483215332, + "learning_rate": 0.00011784253381185937, + "loss": 0.5964, + "step": 956 + }, + { + "epoch": 0.4608439174140733, + "grad_norm": 0.3628185987472534, + "learning_rate": 0.0001176890275122573, + "loss": 0.267, + "step": 957 + }, + { + "epoch": 0.4613254680069825, + "grad_norm": 0.8085645437240601, + "learning_rate": 0.0001175354781714244, + "loss": 0.248, + "step": 958 + }, + { + "epoch": 0.46180701859989165, + "grad_norm": 1.0674470663070679, + "learning_rate": 0.0001173818861629794, + "loss": 0.8876, + "step": 959 + }, + { + "epoch": 0.4622885691928008, + "grad_norm": 0.9414990544319153, + "learning_rate": 0.00011722825186064494, + "loss": 0.3078, + "step": 960 + }, + { + "epoch": 0.46277011978571, + "grad_norm": 0.8779419660568237, + "learning_rate": 0.00011707457563824646, + "loss": 0.4243, + "step": 961 + }, + { + "epoch": 0.46325167037861914, + "grad_norm": 0.46584588289260864, + "learning_rate": 0.00011692085786971149, + "loss": 0.2216, + "step": 962 + }, + { + "epoch": 0.4637332209715283, + "grad_norm": 1.5375487804412842, + "learning_rate": 0.00011676709892906858, + "loss": 0.2592, + "step": 963 + }, + { + "epoch": 0.46421477156443747, + "grad_norm": 0.9770283102989197, + "learning_rate": 0.00011661329919044656, + "loss": 0.6696, + "step": 964 + }, + { + "epoch": 0.46469632215734663, + "grad_norm": 0.6235101222991943, + "learning_rate": 0.00011645945902807341, + "loss": 0.256, + "step": 965 + }, + { + "epoch": 0.46517787275025585, + "grad_norm": 0.5217929482460022, + "learning_rate": 0.00011630557881627553, + "loss": 0.4376, + "step": 966 + }, + { + "epoch": 0.465659423343165, + "grad_norm": 0.4468024969100952, + "learning_rate": 0.0001161516589294768, + "loss": 0.7535, + "step": 967 + }, + { + "epoch": 0.4661409739360742, + "grad_norm": 1.1552485227584839, + "learning_rate": 0.00011599769974219757, + "loss": 0.5093, + "step": 968 + }, + { + "epoch": 0.46662252452898334, + "grad_norm": 0.5790265202522278, + "learning_rate": 0.0001158437016290539, + "loss": 0.4748, + "step": 969 + }, + { + "epoch": 0.4671040751218925, + "grad_norm": 1.0100438594818115, + "learning_rate": 0.00011568966496475649, + "loss": 0.2544, + "step": 970 + }, + { + "epoch": 0.46758562571480167, + "grad_norm": 0.7288883328437805, + "learning_rate": 0.00011553559012410984, + "loss": 0.2405, + "step": 971 + }, + { + "epoch": 0.46806717630771083, + "grad_norm": 1.0639899969100952, + "learning_rate": 0.00011538147748201138, + "loss": 0.4906, + "step": 972 + }, + { + "epoch": 0.46854872690062, + "grad_norm": 0.8311783075332642, + "learning_rate": 0.00011522732741345053, + "loss": 0.7508, + "step": 973 + }, + { + "epoch": 0.46903027749352916, + "grad_norm": 0.6320204138755798, + "learning_rate": 0.00011507314029350776, + "loss": 0.4786, + "step": 974 + }, + { + "epoch": 0.4695118280864383, + "grad_norm": 0.5712886452674866, + "learning_rate": 0.00011491891649735366, + "loss": 0.3774, + "step": 975 + }, + { + "epoch": 0.4699933786793475, + "grad_norm": 0.8952656984329224, + "learning_rate": 0.00011476465640024814, + "loss": 0.7809, + "step": 976 + }, + { + "epoch": 0.47047492927225665, + "grad_norm": 0.17890042066574097, + "learning_rate": 0.00011461036037753934, + "loss": 0.1091, + "step": 977 + }, + { + "epoch": 0.4709564798651658, + "grad_norm": 0.5014618635177612, + "learning_rate": 0.00011445602880466288, + "loss": 0.2685, + "step": 978 + }, + { + "epoch": 0.471438030458075, + "grad_norm": 3.074442148208618, + "learning_rate": 0.00011430166205714088, + "loss": 0.7062, + "step": 979 + }, + { + "epoch": 0.4719195810509842, + "grad_norm": 0.3122480809688568, + "learning_rate": 0.00011414726051058102, + "loss": 0.241, + "step": 980 + }, + { + "epoch": 0.47240113164389336, + "grad_norm": 1.3021327257156372, + "learning_rate": 0.0001139928245406757, + "loss": 0.4192, + "step": 981 + }, + { + "epoch": 0.4728826822368025, + "grad_norm": 0.9721456170082092, + "learning_rate": 0.00011383835452320097, + "loss": 0.6748, + "step": 982 + }, + { + "epoch": 0.4733642328297117, + "grad_norm": 1.8247392177581787, + "learning_rate": 0.00011368385083401585, + "loss": 0.9576, + "step": 983 + }, + { + "epoch": 0.47384578342262085, + "grad_norm": 0.8173697590827942, + "learning_rate": 0.00011352931384906125, + "loss": 0.2753, + "step": 984 + }, + { + "epoch": 0.47432733401553, + "grad_norm": 0.523826539516449, + "learning_rate": 0.00011337474394435908, + "loss": 0.4806, + "step": 985 + }, + { + "epoch": 0.4748088846084392, + "grad_norm": 0.9230586886405945, + "learning_rate": 0.00011322014149601136, + "loss": 0.4541, + "step": 986 + }, + { + "epoch": 0.47529043520134834, + "grad_norm": 1.0293413400650024, + "learning_rate": 0.00011306550688019926, + "loss": 0.5029, + "step": 987 + }, + { + "epoch": 0.4757719857942575, + "grad_norm": 0.7343544363975525, + "learning_rate": 0.0001129108404731823, + "loss": 0.6484, + "step": 988 + }, + { + "epoch": 0.47625353638716666, + "grad_norm": 1.6450482606887817, + "learning_rate": 0.0001127561426512973, + "loss": 0.6556, + "step": 989 + }, + { + "epoch": 0.47673508698007583, + "grad_norm": 0.7142669558525085, + "learning_rate": 0.0001126014137909575, + "loss": 0.4598, + "step": 990 + }, + { + "epoch": 0.477216637572985, + "grad_norm": 1.1104168891906738, + "learning_rate": 0.00011244665426865174, + "loss": 0.4199, + "step": 991 + }, + { + "epoch": 0.47769818816589416, + "grad_norm": 1.0826833248138428, + "learning_rate": 0.00011229186446094338, + "loss": 0.4716, + "step": 992 + }, + { + "epoch": 0.4781797387588034, + "grad_norm": 0.5963808298110962, + "learning_rate": 0.00011213704474446951, + "loss": 0.4477, + "step": 993 + }, + { + "epoch": 0.47866128935171254, + "grad_norm": 1.3157355785369873, + "learning_rate": 0.00011198219549594, + "loss": 0.6884, + "step": 994 + }, + { + "epoch": 0.4791428399446217, + "grad_norm": 0.7174386382102966, + "learning_rate": 0.00011182731709213659, + "loss": 0.2761, + "step": 995 + }, + { + "epoch": 0.47962439053753086, + "grad_norm": 0.7296092510223389, + "learning_rate": 0.00011167240990991192, + "loss": 0.3344, + "step": 996 + }, + { + "epoch": 0.48010594113044003, + "grad_norm": 0.9254800081253052, + "learning_rate": 0.00011151747432618871, + "loss": 0.5412, + "step": 997 + }, + { + "epoch": 0.4805874917233492, + "grad_norm": 1.1276819705963135, + "learning_rate": 0.00011136251071795871, + "loss": 0.4734, + "step": 998 + }, + { + "epoch": 0.48106904231625836, + "grad_norm": 0.9666963815689087, + "learning_rate": 0.00011120751946228197, + "loss": 0.6614, + "step": 999 + }, + { + "epoch": 0.4815505929091675, + "grad_norm": 0.3752553462982178, + "learning_rate": 0.00011105250093628565, + "loss": 0.4351, + "step": 1000 + }, + { + "epoch": 0.4820321435020767, + "grad_norm": 0.2955859899520874, + "learning_rate": 0.00011089745551716344, + "loss": 0.3715, + "step": 1001 + }, + { + "epoch": 0.48251369409498585, + "grad_norm": 0.8061214089393616, + "learning_rate": 0.00011074238358217437, + "loss": 0.8212, + "step": 1002 + }, + { + "epoch": 0.482995244687895, + "grad_norm": 0.7790119051933289, + "learning_rate": 0.00011058728550864197, + "loss": 0.3227, + "step": 1003 + }, + { + "epoch": 0.4834767952808042, + "grad_norm": 0.9455148577690125, + "learning_rate": 0.00011043216167395344, + "loss": 0.5179, + "step": 1004 + }, + { + "epoch": 0.48395834587371334, + "grad_norm": 0.8301224708557129, + "learning_rate": 0.00011027701245555865, + "loss": 0.6837, + "step": 1005 + }, + { + "epoch": 0.4844398964666225, + "grad_norm": 0.7724424004554749, + "learning_rate": 0.00011012183823096917, + "loss": 0.2709, + "step": 1006 + }, + { + "epoch": 0.4849214470595317, + "grad_norm": 0.7331794500350952, + "learning_rate": 0.00010996663937775751, + "loss": 0.3501, + "step": 1007 + }, + { + "epoch": 0.4854029976524409, + "grad_norm": 0.5707409977912903, + "learning_rate": 0.000109811416273556, + "loss": 0.9209, + "step": 1008 + }, + { + "epoch": 0.48588454824535005, + "grad_norm": 0.7642019987106323, + "learning_rate": 0.00010965616929605609, + "loss": 0.6996, + "step": 1009 + }, + { + "epoch": 0.4863660988382592, + "grad_norm": 1.3429776430130005, + "learning_rate": 0.0001095008988230072, + "loss": 0.5364, + "step": 1010 + }, + { + "epoch": 0.48684764943116837, + "grad_norm": 1.9589768648147583, + "learning_rate": 0.00010934560523221602, + "loss": 0.6098, + "step": 1011 + }, + { + "epoch": 0.48732920002407754, + "grad_norm": 0.7877257466316223, + "learning_rate": 0.00010919028890154543, + "loss": 0.503, + "step": 1012 + }, + { + "epoch": 0.4878107506169867, + "grad_norm": 0.6318466067314148, + "learning_rate": 0.00010903495020891375, + "loss": 0.5696, + "step": 1013 + }, + { + "epoch": 0.48829230120989586, + "grad_norm": 0.564456045627594, + "learning_rate": 0.00010887958953229349, + "loss": 0.6454, + "step": 1014 + }, + { + "epoch": 0.488773851802805, + "grad_norm": 1.516335368156433, + "learning_rate": 0.00010872420724971088, + "loss": 0.6529, + "step": 1015 + }, + { + "epoch": 0.4892554023957142, + "grad_norm": 0.6357146501541138, + "learning_rate": 0.0001085688037392446, + "loss": 0.3831, + "step": 1016 + }, + { + "epoch": 0.48973695298862335, + "grad_norm": 0.7643784284591675, + "learning_rate": 0.000108413379379025, + "loss": 0.3758, + "step": 1017 + }, + { + "epoch": 0.4902185035815325, + "grad_norm": 0.6214609146118164, + "learning_rate": 0.00010825793454723325, + "loss": 0.366, + "step": 1018 + }, + { + "epoch": 0.4907000541744417, + "grad_norm": 0.8940101265907288, + "learning_rate": 0.00010810246962210018, + "loss": 0.8318, + "step": 1019 + }, + { + "epoch": 0.49118160476735084, + "grad_norm": 0.7920047044754028, + "learning_rate": 0.00010794698498190557, + "loss": 0.302, + "step": 1020 + }, + { + "epoch": 0.49166315536026006, + "grad_norm": 1.0612096786499023, + "learning_rate": 0.00010779148100497722, + "loss": 0.5407, + "step": 1021 + }, + { + "epoch": 0.4921447059531692, + "grad_norm": 1.0016112327575684, + "learning_rate": 0.00010763595806968996, + "loss": 0.7287, + "step": 1022 + }, + { + "epoch": 0.4926262565460784, + "grad_norm": 1.0954660177230835, + "learning_rate": 0.00010748041655446473, + "loss": 0.9089, + "step": 1023 + }, + { + "epoch": 0.49310780713898755, + "grad_norm": 0.6982045769691467, + "learning_rate": 0.00010732485683776768, + "loss": 0.8901, + "step": 1024 + }, + { + "epoch": 0.4935893577318967, + "grad_norm": 0.721612811088562, + "learning_rate": 0.00010716927929810925, + "loss": 0.6705, + "step": 1025 + }, + { + "epoch": 0.4940709083248059, + "grad_norm": 0.6114217042922974, + "learning_rate": 0.00010701368431404326, + "loss": 0.4329, + "step": 1026 + }, + { + "epoch": 0.49455245891771504, + "grad_norm": 0.695100724697113, + "learning_rate": 0.00010685807226416598, + "loss": 0.2266, + "step": 1027 + }, + { + "epoch": 0.4950340095106242, + "grad_norm": 1.1315324306488037, + "learning_rate": 0.00010670244352711518, + "loss": 0.2882, + "step": 1028 + }, + { + "epoch": 0.49551556010353337, + "grad_norm": 0.4670594334602356, + "learning_rate": 0.00010654679848156925, + "loss": 0.4369, + "step": 1029 + }, + { + "epoch": 0.49599711069644253, + "grad_norm": 1.1333835124969482, + "learning_rate": 0.00010639113750624625, + "loss": 0.2842, + "step": 1030 + }, + { + "epoch": 0.4964786612893517, + "grad_norm": 0.558685839176178, + "learning_rate": 0.00010623546097990303, + "loss": 0.4857, + "step": 1031 + }, + { + "epoch": 0.49696021188226086, + "grad_norm": 0.723378598690033, + "learning_rate": 0.00010607976928133423, + "loss": 0.2795, + "step": 1032 + }, + { + "epoch": 0.49744176247517, + "grad_norm": 0.857216477394104, + "learning_rate": 0.00010592406278937144, + "loss": 0.2808, + "step": 1033 + }, + { + "epoch": 0.4979233130680792, + "grad_norm": 0.8478689789772034, + "learning_rate": 0.00010576834188288226, + "loss": 0.3679, + "step": 1034 + }, + { + "epoch": 0.4984048636609884, + "grad_norm": 0.4889993369579315, + "learning_rate": 0.00010561260694076935, + "loss": 0.3931, + "step": 1035 + }, + { + "epoch": 0.49888641425389757, + "grad_norm": 0.6549670696258545, + "learning_rate": 0.00010545685834196948, + "loss": 0.4135, + "step": 1036 + }, + { + "epoch": 0.49936796484680673, + "grad_norm": 1.3779197931289673, + "learning_rate": 0.00010530109646545272, + "loss": 0.6626, + "step": 1037 + }, + { + "epoch": 0.4998495154397159, + "grad_norm": 1.0434162616729736, + "learning_rate": 0.0001051453216902214, + "loss": 0.485, + "step": 1038 + }, + { + "epoch": 0.500331066032625, + "grad_norm": 1.2191532850265503, + "learning_rate": 0.00010498953439530925, + "loss": 0.807, + "step": 1039 + }, + { + "epoch": 0.5008126166255342, + "grad_norm": 0.3626213073730469, + "learning_rate": 0.00010483373495978046, + "loss": 0.256, + "step": 1040 + }, + { + "epoch": 0.5012941672184434, + "grad_norm": 0.9381643533706665, + "learning_rate": 0.00010467792376272877, + "loss": 0.8559, + "step": 1041 + }, + { + "epoch": 0.5017757178113526, + "grad_norm": 0.5074480175971985, + "learning_rate": 0.00010452210118327652, + "loss": 0.262, + "step": 1042 + }, + { + "epoch": 0.5022572684042618, + "grad_norm": 2.5238029956817627, + "learning_rate": 0.00010436626760057378, + "loss": 0.6286, + "step": 1043 + }, + { + "epoch": 0.5027388189971709, + "grad_norm": 0.5911102294921875, + "learning_rate": 0.00010421042339379732, + "loss": 0.2346, + "step": 1044 + }, + { + "epoch": 0.5032203695900801, + "grad_norm": 0.6336544752120972, + "learning_rate": 0.00010405456894214987, + "loss": 0.3963, + "step": 1045 + }, + { + "epoch": 0.5037019201829892, + "grad_norm": 0.6145181655883789, + "learning_rate": 0.00010389870462485902, + "loss": 1.338, + "step": 1046 + }, + { + "epoch": 0.5041834707758984, + "grad_norm": 0.47496849298477173, + "learning_rate": 0.00010374283082117635, + "loss": 0.2233, + "step": 1047 + }, + { + "epoch": 0.5046650213688075, + "grad_norm": 1.6015764474868774, + "learning_rate": 0.00010358694791037653, + "loss": 0.6885, + "step": 1048 + }, + { + "epoch": 0.5051465719617168, + "grad_norm": 1.224536418914795, + "learning_rate": 0.00010343105627175644, + "loss": 0.3626, + "step": 1049 + }, + { + "epoch": 0.5056281225546259, + "grad_norm": 0.3561343848705292, + "learning_rate": 0.00010327515628463415, + "loss": 0.2957, + "step": 1050 + }, + { + "epoch": 0.5061096731475351, + "grad_norm": 0.981721818447113, + "learning_rate": 0.00010311924832834808, + "loss": 0.7594, + "step": 1051 + }, + { + "epoch": 0.5065912237404442, + "grad_norm": 0.7709305882453918, + "learning_rate": 0.00010296333278225599, + "loss": 0.2734, + "step": 1052 + }, + { + "epoch": 0.5070727743333534, + "grad_norm": 0.44871535897254944, + "learning_rate": 0.00010280741002573413, + "loss": 0.1702, + "step": 1053 + }, + { + "epoch": 0.5075543249262625, + "grad_norm": 0.9567520618438721, + "learning_rate": 0.00010265148043817632, + "loss": 0.4612, + "step": 1054 + }, + { + "epoch": 0.5080358755191717, + "grad_norm": 0.39994680881500244, + "learning_rate": 0.00010249554439899298, + "loss": 0.7037, + "step": 1055 + }, + { + "epoch": 0.508517426112081, + "grad_norm": 0.6158464550971985, + "learning_rate": 0.00010233960228761022, + "loss": 0.5785, + "step": 1056 + }, + { + "epoch": 0.5089989767049901, + "grad_norm": 0.9697803258895874, + "learning_rate": 0.00010218365448346893, + "loss": 0.7367, + "step": 1057 + }, + { + "epoch": 0.5094805272978993, + "grad_norm": 1.062404990196228, + "learning_rate": 0.00010202770136602388, + "loss": 0.5547, + "step": 1058 + }, + { + "epoch": 0.5099620778908084, + "grad_norm": 0.5862473249435425, + "learning_rate": 0.00010187174331474271, + "loss": 0.3475, + "step": 1059 + }, + { + "epoch": 0.5104436284837176, + "grad_norm": 0.7131486535072327, + "learning_rate": 0.00010171578070910512, + "loss": 0.2428, + "step": 1060 + }, + { + "epoch": 0.5109251790766267, + "grad_norm": 0.7112643122673035, + "learning_rate": 0.00010155981392860185, + "loss": 0.4194, + "step": 1061 + }, + { + "epoch": 0.5114067296695359, + "grad_norm": 0.7364946007728577, + "learning_rate": 0.00010140384335273386, + "loss": 0.5395, + "step": 1062 + }, + { + "epoch": 0.511888280262445, + "grad_norm": 0.6657571792602539, + "learning_rate": 0.00010124786936101127, + "loss": 0.2897, + "step": 1063 + }, + { + "epoch": 0.5123698308553543, + "grad_norm": 0.7754364609718323, + "learning_rate": 0.00010109189233295255, + "loss": 0.4791, + "step": 1064 + }, + { + "epoch": 0.5128513814482634, + "grad_norm": 0.6396986842155457, + "learning_rate": 0.00010093591264808358, + "loss": 0.5902, + "step": 1065 + }, + { + "epoch": 0.5133329320411726, + "grad_norm": 0.9689805507659912, + "learning_rate": 0.00010077993068593663, + "loss": 0.383, + "step": 1066 + }, + { + "epoch": 0.5138144826340817, + "grad_norm": 0.9902385473251343, + "learning_rate": 0.00010062394682604963, + "loss": 0.3116, + "step": 1067 + }, + { + "epoch": 0.5142960332269909, + "grad_norm": 0.496856153011322, + "learning_rate": 0.00010046796144796497, + "loss": 0.4288, + "step": 1068 + }, + { + "epoch": 0.5147775838199001, + "grad_norm": 0.4943179786205292, + "learning_rate": 0.0001003119749312289, + "loss": 0.3586, + "step": 1069 + }, + { + "epoch": 0.5152591344128092, + "grad_norm": 1.3680517673492432, + "learning_rate": 0.00010015598765539031, + "loss": 0.6428, + "step": 1070 + }, + { + "epoch": 0.5157406850057185, + "grad_norm": 0.6808015704154968, + "learning_rate": 0.0001, + "loss": 0.4651, + "step": 1071 + }, + { + "epoch": 0.5162222355986276, + "grad_norm": 0.6616686582565308, + "learning_rate": 9.984401234460971e-05, + "loss": 0.293, + "step": 1072 + }, + { + "epoch": 0.5167037861915368, + "grad_norm": 0.4129731357097626, + "learning_rate": 9.968802506877111e-05, + "loss": 0.6314, + "step": 1073 + }, + { + "epoch": 0.5171853367844459, + "grad_norm": 1.0533828735351562, + "learning_rate": 9.953203855203504e-05, + "loss": 0.3499, + "step": 1074 + }, + { + "epoch": 0.5176668873773551, + "grad_norm": 0.7815319895744324, + "learning_rate": 9.93760531739504e-05, + "loss": 0.5837, + "step": 1075 + }, + { + "epoch": 0.5181484379702642, + "grad_norm": 1.0250219106674194, + "learning_rate": 9.922006931406338e-05, + "loss": 0.7357, + "step": 1076 + }, + { + "epoch": 0.5186299885631734, + "grad_norm": 0.4391312599182129, + "learning_rate": 9.906408735191643e-05, + "loss": 0.3415, + "step": 1077 + }, + { + "epoch": 0.5191115391560825, + "grad_norm": 0.5005767941474915, + "learning_rate": 9.890810766704745e-05, + "loss": 0.4639, + "step": 1078 + }, + { + "epoch": 0.5195930897489918, + "grad_norm": 2.065234661102295, + "learning_rate": 9.875213063898875e-05, + "loss": 0.4626, + "step": 1079 + }, + { + "epoch": 0.5200746403419009, + "grad_norm": 0.7594066262245178, + "learning_rate": 9.859615664726615e-05, + "loss": 0.7789, + "step": 1080 + }, + { + "epoch": 0.5205561909348101, + "grad_norm": 1.4543653726577759, + "learning_rate": 9.844018607139818e-05, + "loss": 0.6125, + "step": 1081 + }, + { + "epoch": 0.5210377415277193, + "grad_norm": 0.6688056588172913, + "learning_rate": 9.828421929089493e-05, + "loss": 0.251, + "step": 1082 + }, + { + "epoch": 0.5215192921206284, + "grad_norm": 1.0090996026992798, + "learning_rate": 9.812825668525733e-05, + "loss": 0.4581, + "step": 1083 + }, + { + "epoch": 0.5220008427135376, + "grad_norm": 0.9608261585235596, + "learning_rate": 9.797229863397615e-05, + "loss": 0.4541, + "step": 1084 + }, + { + "epoch": 0.5224823933064467, + "grad_norm": 0.40178394317626953, + "learning_rate": 9.781634551653108e-05, + "loss": 0.3175, + "step": 1085 + }, + { + "epoch": 0.522963943899356, + "grad_norm": 0.8387685418128967, + "learning_rate": 9.766039771238982e-05, + "loss": 0.8164, + "step": 1086 + }, + { + "epoch": 0.5234454944922651, + "grad_norm": 0.637122631072998, + "learning_rate": 9.750445560100706e-05, + "loss": 0.6988, + "step": 1087 + }, + { + "epoch": 0.5239270450851743, + "grad_norm": 1.0129690170288086, + "learning_rate": 9.73485195618237e-05, + "loss": 0.8151, + "step": 1088 + }, + { + "epoch": 0.5244085956780834, + "grad_norm": 0.7991225719451904, + "learning_rate": 9.719258997426588e-05, + "loss": 0.5955, + "step": 1089 + }, + { + "epoch": 0.5248901462709926, + "grad_norm": 1.574089765548706, + "learning_rate": 9.703666721774402e-05, + "loss": 0.2897, + "step": 1090 + }, + { + "epoch": 0.5253716968639017, + "grad_norm": 0.9370477199554443, + "learning_rate": 9.688075167165194e-05, + "loss": 0.5444, + "step": 1091 + }, + { + "epoch": 0.525853247456811, + "grad_norm": 0.7514247298240662, + "learning_rate": 9.672484371536586e-05, + "loss": 0.2897, + "step": 1092 + }, + { + "epoch": 0.52633479804972, + "grad_norm": 0.6659570932388306, + "learning_rate": 9.656894372824358e-05, + "loss": 0.755, + "step": 1093 + }, + { + "epoch": 0.5268163486426293, + "grad_norm": 0.7851242423057556, + "learning_rate": 9.64130520896235e-05, + "loss": 0.4844, + "step": 1094 + }, + { + "epoch": 0.5272978992355384, + "grad_norm": 0.416088730096817, + "learning_rate": 9.625716917882367e-05, + "loss": 0.4613, + "step": 1095 + }, + { + "epoch": 0.5277794498284476, + "grad_norm": 0.3750472068786621, + "learning_rate": 9.6101295375141e-05, + "loss": 0.6192, + "step": 1096 + }, + { + "epoch": 0.5282610004213568, + "grad_norm": 0.6323158144950867, + "learning_rate": 9.594543105785013e-05, + "loss": 0.4593, + "step": 1097 + }, + { + "epoch": 0.5287425510142659, + "grad_norm": 0.6383538842201233, + "learning_rate": 9.578957660620267e-05, + "loss": 0.4207, + "step": 1098 + }, + { + "epoch": 0.5292241016071751, + "grad_norm": 3.022141218185425, + "learning_rate": 9.563373239942623e-05, + "loss": 0.6448, + "step": 1099 + }, + { + "epoch": 0.5297056522000843, + "grad_norm": 0.5980287194252014, + "learning_rate": 9.547789881672348e-05, + "loss": 0.2678, + "step": 1100 + }, + { + "epoch": 0.5301872027929935, + "grad_norm": 1.0514696836471558, + "learning_rate": 9.532207623727126e-05, + "loss": 0.7311, + "step": 1101 + }, + { + "epoch": 0.5306687533859026, + "grad_norm": 4.502137660980225, + "learning_rate": 9.516626504021957e-05, + "loss": 0.4885, + "step": 1102 + }, + { + "epoch": 0.5311503039788118, + "grad_norm": 0.9058294892311096, + "learning_rate": 9.501046560469079e-05, + "loss": 0.6202, + "step": 1103 + }, + { + "epoch": 0.5316318545717209, + "grad_norm": 4.422435283660889, + "learning_rate": 9.485467830977864e-05, + "loss": 0.9249, + "step": 1104 + }, + { + "epoch": 0.5321134051646301, + "grad_norm": 0.8262627720832825, + "learning_rate": 9.469890353454732e-05, + "loss": 0.2968, + "step": 1105 + }, + { + "epoch": 0.5325949557575392, + "grad_norm": 0.8587914705276489, + "learning_rate": 9.454314165803054e-05, + "loss": 0.269, + "step": 1106 + }, + { + "epoch": 0.5330765063504485, + "grad_norm": 0.5949798822402954, + "learning_rate": 9.438739305923067e-05, + "loss": 0.4063, + "step": 1107 + }, + { + "epoch": 0.5335580569433576, + "grad_norm": 0.8733230829238892, + "learning_rate": 9.423165811711777e-05, + "loss": 0.622, + "step": 1108 + }, + { + "epoch": 0.5340396075362668, + "grad_norm": 1.0437642335891724, + "learning_rate": 9.407593721062859e-05, + "loss": 0.3839, + "step": 1109 + }, + { + "epoch": 0.534521158129176, + "grad_norm": 0.2439364790916443, + "learning_rate": 9.39202307186658e-05, + "loss": 0.1443, + "step": 1110 + }, + { + "epoch": 0.5350027087220851, + "grad_norm": 1.0091365575790405, + "learning_rate": 9.3764539020097e-05, + "loss": 0.2337, + "step": 1111 + }, + { + "epoch": 0.5354842593149943, + "grad_norm": 0.8624956011772156, + "learning_rate": 9.360886249375376e-05, + "loss": 0.9408, + "step": 1112 + }, + { + "epoch": 0.5359658099079034, + "grad_norm": 0.16920357942581177, + "learning_rate": 9.345320151843078e-05, + "loss": 0.2112, + "step": 1113 + }, + { + "epoch": 0.5364473605008127, + "grad_norm": 1.2505950927734375, + "learning_rate": 9.329755647288485e-05, + "loss": 0.7937, + "step": 1114 + }, + { + "epoch": 0.5369289110937218, + "grad_norm": 0.9290667176246643, + "learning_rate": 9.314192773583403e-05, + "loss": 0.4367, + "step": 1115 + }, + { + "epoch": 0.537410461686631, + "grad_norm": 0.922398030757904, + "learning_rate": 9.298631568595674e-05, + "loss": 0.4056, + "step": 1116 + }, + { + "epoch": 0.5378920122795401, + "grad_norm": 0.6957153081893921, + "learning_rate": 9.283072070189075e-05, + "loss": 0.2602, + "step": 1117 + }, + { + "epoch": 0.5383735628724493, + "grad_norm": 0.7812706232070923, + "learning_rate": 9.267514316223234e-05, + "loss": 0.5729, + "step": 1118 + }, + { + "epoch": 0.5388551134653584, + "grad_norm": 0.7393254637718201, + "learning_rate": 9.251958344553528e-05, + "loss": 0.3636, + "step": 1119 + }, + { + "epoch": 0.5393366640582676, + "grad_norm": 0.7986420392990112, + "learning_rate": 9.23640419303101e-05, + "loss": 0.5024, + "step": 1120 + }, + { + "epoch": 0.5398182146511767, + "grad_norm": 1.174703598022461, + "learning_rate": 9.220851899502283e-05, + "loss": 0.8046, + "step": 1121 + }, + { + "epoch": 0.540299765244086, + "grad_norm": 0.6943523287773132, + "learning_rate": 9.205301501809448e-05, + "loss": 0.5785, + "step": 1122 + }, + { + "epoch": 0.5407813158369951, + "grad_norm": 1.8074625730514526, + "learning_rate": 9.189753037789987e-05, + "loss": 0.574, + "step": 1123 + }, + { + "epoch": 0.5412628664299043, + "grad_norm": 0.7253779172897339, + "learning_rate": 9.174206545276677e-05, + "loss": 0.7158, + "step": 1124 + }, + { + "epoch": 0.5417444170228135, + "grad_norm": 0.8387247323989868, + "learning_rate": 9.158662062097501e-05, + "loss": 0.7756, + "step": 1125 + }, + { + "epoch": 0.5422259676157226, + "grad_norm": 0.6356840133666992, + "learning_rate": 9.143119626075542e-05, + "loss": 0.2435, + "step": 1126 + }, + { + "epoch": 0.5427075182086318, + "grad_norm": 1.1372162103652954, + "learning_rate": 9.127579275028914e-05, + "loss": 0.9096, + "step": 1127 + }, + { + "epoch": 0.5431890688015409, + "grad_norm": 0.5616921186447144, + "learning_rate": 9.112041046770653e-05, + "loss": 0.5741, + "step": 1128 + }, + { + "epoch": 0.5436706193944502, + "grad_norm": 0.7209728360176086, + "learning_rate": 9.096504979108629e-05, + "loss": 0.7035, + "step": 1129 + }, + { + "epoch": 0.5441521699873593, + "grad_norm": 0.907452404499054, + "learning_rate": 9.080971109845458e-05, + "loss": 0.4857, + "step": 1130 + }, + { + "epoch": 0.5446337205802685, + "grad_norm": 0.7207451462745667, + "learning_rate": 9.0654394767784e-05, + "loss": 0.3176, + "step": 1131 + }, + { + "epoch": 0.5451152711731776, + "grad_norm": 0.76964271068573, + "learning_rate": 9.049910117699281e-05, + "loss": 0.3447, + "step": 1132 + }, + { + "epoch": 0.5455968217660868, + "grad_norm": 0.5925887823104858, + "learning_rate": 9.034383070394393e-05, + "loss": 0.4471, + "step": 1133 + }, + { + "epoch": 0.5460783723589959, + "grad_norm": 0.6649512648582458, + "learning_rate": 9.0188583726444e-05, + "loss": 0.6966, + "step": 1134 + }, + { + "epoch": 0.5465599229519051, + "grad_norm": 0.4797343909740448, + "learning_rate": 9.00333606222425e-05, + "loss": 0.3058, + "step": 1135 + }, + { + "epoch": 0.5470414735448142, + "grad_norm": 0.44993117451667786, + "learning_rate": 8.987816176903082e-05, + "loss": 0.2101, + "step": 1136 + }, + { + "epoch": 0.5475230241377235, + "grad_norm": 1.0100607872009277, + "learning_rate": 8.972298754444136e-05, + "loss": 0.8622, + "step": 1137 + }, + { + "epoch": 0.5480045747306327, + "grad_norm": 0.5623632073402405, + "learning_rate": 8.956783832604654e-05, + "loss": 0.225, + "step": 1138 + }, + { + "epoch": 0.5484861253235418, + "grad_norm": 0.7404162883758545, + "learning_rate": 8.941271449135806e-05, + "loss": 0.9008, + "step": 1139 + }, + { + "epoch": 0.548967675916451, + "grad_norm": 0.5419902801513672, + "learning_rate": 8.925761641782567e-05, + "loss": 0.5679, + "step": 1140 + }, + { + "epoch": 0.5494492265093601, + "grad_norm": 0.6613001823425293, + "learning_rate": 8.910254448283659e-05, + "loss": 0.2438, + "step": 1141 + }, + { + "epoch": 0.5499307771022693, + "grad_norm": 2.0353126525878906, + "learning_rate": 8.894749906371439e-05, + "loss": 0.6597, + "step": 1142 + }, + { + "epoch": 0.5504123276951784, + "grad_norm": 0.7912282943725586, + "learning_rate": 8.87924805377181e-05, + "loss": 0.3856, + "step": 1143 + }, + { + "epoch": 0.5508938782880877, + "grad_norm": 0.7536735534667969, + "learning_rate": 8.863748928204131e-05, + "loss": 0.1915, + "step": 1144 + }, + { + "epoch": 0.5513754288809968, + "grad_norm": 1.1171406507492065, + "learning_rate": 8.848252567381131e-05, + "loss": 0.3394, + "step": 1145 + }, + { + "epoch": 0.551856979473906, + "grad_norm": 1.4895752668380737, + "learning_rate": 8.83275900900881e-05, + "loss": 0.497, + "step": 1146 + }, + { + "epoch": 0.5523385300668151, + "grad_norm": 0.8278349041938782, + "learning_rate": 8.817268290786343e-05, + "loss": 0.3802, + "step": 1147 + }, + { + "epoch": 0.5528200806597243, + "grad_norm": 1.5251246690750122, + "learning_rate": 8.801780450406002e-05, + "loss": 0.3321, + "step": 1148 + }, + { + "epoch": 0.5533016312526334, + "grad_norm": 1.3434091806411743, + "learning_rate": 8.786295525553053e-05, + "loss": 0.2525, + "step": 1149 + }, + { + "epoch": 0.5537831818455426, + "grad_norm": 0.8560492992401123, + "learning_rate": 8.770813553905664e-05, + "loss": 0.2281, + "step": 1150 + }, + { + "epoch": 0.5542647324384519, + "grad_norm": 3.152848958969116, + "learning_rate": 8.755334573134829e-05, + "loss": 0.4125, + "step": 1151 + }, + { + "epoch": 0.554746283031361, + "grad_norm": 0.3480580151081085, + "learning_rate": 8.739858620904251e-05, + "loss": 0.4035, + "step": 1152 + }, + { + "epoch": 0.5552278336242702, + "grad_norm": 0.35887765884399414, + "learning_rate": 8.724385734870271e-05, + "loss": 0.6556, + "step": 1153 + }, + { + "epoch": 0.5557093842171793, + "grad_norm": 2.1073386669158936, + "learning_rate": 8.708915952681769e-05, + "loss": 0.3513, + "step": 1154 + }, + { + "epoch": 0.5561909348100885, + "grad_norm": 0.5265167355537415, + "learning_rate": 8.693449311980074e-05, + "loss": 0.7842, + "step": 1155 + }, + { + "epoch": 0.5566724854029976, + "grad_norm": 0.49484220147132874, + "learning_rate": 8.677985850398866e-05, + "loss": 0.2603, + "step": 1156 + }, + { + "epoch": 0.5571540359959068, + "grad_norm": 0.8155922889709473, + "learning_rate": 8.662525605564093e-05, + "loss": 0.6978, + "step": 1157 + }, + { + "epoch": 0.557635586588816, + "grad_norm": 1.1594356298446655, + "learning_rate": 8.647068615093875e-05, + "loss": 1.0918, + "step": 1158 + }, + { + "epoch": 0.5581171371817252, + "grad_norm": 1.142989993095398, + "learning_rate": 8.631614916598419e-05, + "loss": 0.4544, + "step": 1159 + }, + { + "epoch": 0.5585986877746343, + "grad_norm": 0.8753609657287598, + "learning_rate": 8.616164547679906e-05, + "loss": 0.1695, + "step": 1160 + }, + { + "epoch": 0.5590802383675435, + "grad_norm": 1.0129923820495605, + "learning_rate": 8.600717545932435e-05, + "loss": 0.5372, + "step": 1161 + }, + { + "epoch": 0.5595617889604526, + "grad_norm": 0.6155772805213928, + "learning_rate": 8.5852739489419e-05, + "loss": 0.1198, + "step": 1162 + }, + { + "epoch": 0.5600433395533618, + "grad_norm": 1.3019598722457886, + "learning_rate": 8.569833794285915e-05, + "loss": 0.6993, + "step": 1163 + }, + { + "epoch": 0.5605248901462709, + "grad_norm": 0.6143357157707214, + "learning_rate": 8.554397119533714e-05, + "loss": 0.686, + "step": 1164 + }, + { + "epoch": 0.5610064407391802, + "grad_norm": 1.0003910064697266, + "learning_rate": 8.538963962246069e-05, + "loss": 0.2625, + "step": 1165 + }, + { + "epoch": 0.5614879913320894, + "grad_norm": 1.33318030834198, + "learning_rate": 8.523534359975189e-05, + "loss": 0.5983, + "step": 1166 + }, + { + "epoch": 0.5619695419249985, + "grad_norm": 4.280618190765381, + "learning_rate": 8.508108350264635e-05, + "loss": 0.4626, + "step": 1167 + }, + { + "epoch": 0.5624510925179077, + "grad_norm": 0.9768639802932739, + "learning_rate": 8.492685970649228e-05, + "loss": 0.3368, + "step": 1168 + }, + { + "epoch": 0.5629326431108168, + "grad_norm": 1.5351608991622925, + "learning_rate": 8.477267258654949e-05, + "loss": 0.454, + "step": 1169 + }, + { + "epoch": 0.563414193703726, + "grad_norm": 0.7636793851852417, + "learning_rate": 8.461852251798866e-05, + "loss": 0.7084, + "step": 1170 + }, + { + "epoch": 0.5638957442966351, + "grad_norm": 1.2424840927124023, + "learning_rate": 8.44644098758902e-05, + "loss": 0.2788, + "step": 1171 + }, + { + "epoch": 0.5643772948895444, + "grad_norm": 0.42953938245773315, + "learning_rate": 8.431033503524354e-05, + "loss": 0.2539, + "step": 1172 + }, + { + "epoch": 0.5648588454824535, + "grad_norm": 1.4571208953857422, + "learning_rate": 8.415629837094611e-05, + "loss": 0.9114, + "step": 1173 + }, + { + "epoch": 0.5653403960753627, + "grad_norm": 0.6140320301055908, + "learning_rate": 8.400230025780243e-05, + "loss": 0.4315, + "step": 1174 + }, + { + "epoch": 0.5658219466682718, + "grad_norm": 0.7121503353118896, + "learning_rate": 8.384834107052321e-05, + "loss": 0.3664, + "step": 1175 + }, + { + "epoch": 0.566303497261181, + "grad_norm": 0.7272537350654602, + "learning_rate": 8.369442118372447e-05, + "loss": 0.5133, + "step": 1176 + }, + { + "epoch": 0.5667850478540901, + "grad_norm": 0.7526095509529114, + "learning_rate": 8.35405409719266e-05, + "loss": 0.6136, + "step": 1177 + }, + { + "epoch": 0.5672665984469993, + "grad_norm": 1.0880045890808105, + "learning_rate": 8.338670080955349e-05, + "loss": 0.3161, + "step": 1178 + }, + { + "epoch": 0.5677481490399086, + "grad_norm": 1.5565310716629028, + "learning_rate": 8.323290107093143e-05, + "loss": 0.3287, + "step": 1179 + }, + { + "epoch": 0.5682296996328177, + "grad_norm": 0.5715327858924866, + "learning_rate": 8.307914213028856e-05, + "loss": 0.5322, + "step": 1180 + }, + { + "epoch": 0.5687112502257269, + "grad_norm": 0.5156280398368835, + "learning_rate": 8.292542436175356e-05, + "loss": 0.3414, + "step": 1181 + }, + { + "epoch": 0.569192800818636, + "grad_norm": 1.4075266122817993, + "learning_rate": 8.277174813935508e-05, + "loss": 0.6612, + "step": 1182 + }, + { + "epoch": 0.5696743514115452, + "grad_norm": 0.9587284326553345, + "learning_rate": 8.261811383702061e-05, + "loss": 0.5021, + "step": 1183 + }, + { + "epoch": 0.5701559020044543, + "grad_norm": 0.9386144280433655, + "learning_rate": 8.246452182857562e-05, + "loss": 0.3912, + "step": 1184 + }, + { + "epoch": 0.5706374525973635, + "grad_norm": 1.3160654306411743, + "learning_rate": 8.231097248774274e-05, + "loss": 0.6266, + "step": 1185 + }, + { + "epoch": 0.5711190031902726, + "grad_norm": 1.9529427289962769, + "learning_rate": 8.215746618814067e-05, + "loss": 0.4088, + "step": 1186 + }, + { + "epoch": 0.5716005537831819, + "grad_norm": 1.5948363542556763, + "learning_rate": 8.200400330328348e-05, + "loss": 0.8994, + "step": 1187 + }, + { + "epoch": 0.572082104376091, + "grad_norm": 0.8197068572044373, + "learning_rate": 8.185058420657957e-05, + "loss": 1.1295, + "step": 1188 + }, + { + "epoch": 0.5725636549690002, + "grad_norm": 0.7190425395965576, + "learning_rate": 8.16972092713308e-05, + "loss": 0.5654, + "step": 1189 + }, + { + "epoch": 0.5730452055619093, + "grad_norm": 0.6366353034973145, + "learning_rate": 8.154387887073158e-05, + "loss": 0.2643, + "step": 1190 + }, + { + "epoch": 0.5735267561548185, + "grad_norm": 0.830741822719574, + "learning_rate": 8.139059337786792e-05, + "loss": 0.4566, + "step": 1191 + }, + { + "epoch": 0.5740083067477277, + "grad_norm": 1.1643511056900024, + "learning_rate": 8.12373531657166e-05, + "loss": 0.6259, + "step": 1192 + }, + { + "epoch": 0.5744898573406368, + "grad_norm": 0.43332046270370483, + "learning_rate": 8.108415860714418e-05, + "loss": 0.2925, + "step": 1193 + }, + { + "epoch": 0.5749714079335461, + "grad_norm": 0.6459894180297852, + "learning_rate": 8.093101007490622e-05, + "loss": 0.5729, + "step": 1194 + }, + { + "epoch": 0.5754529585264552, + "grad_norm": 0.8762726783752441, + "learning_rate": 8.077790794164619e-05, + "loss": 0.2891, + "step": 1195 + }, + { + "epoch": 0.5759345091193644, + "grad_norm": 0.8213867545127869, + "learning_rate": 8.062485257989471e-05, + "loss": 0.9006, + "step": 1196 + }, + { + "epoch": 0.5764160597122735, + "grad_norm": 0.5547890663146973, + "learning_rate": 8.047184436206864e-05, + "loss": 0.4767, + "step": 1197 + }, + { + "epoch": 0.5768976103051827, + "grad_norm": 0.8731846213340759, + "learning_rate": 8.031888366046998e-05, + "loss": 0.5808, + "step": 1198 + }, + { + "epoch": 0.5773791608980918, + "grad_norm": 0.8570541143417358, + "learning_rate": 8.016597084728526e-05, + "loss": 0.5455, + "step": 1199 + }, + { + "epoch": 0.577860711491001, + "grad_norm": 0.6910862922668457, + "learning_rate": 8.001310629458443e-05, + "loss": 0.7892, + "step": 1200 + }, + { + "epoch": 0.5783422620839102, + "grad_norm": 0.552251935005188, + "learning_rate": 7.986029037432002e-05, + "loss": 0.6565, + "step": 1201 + }, + { + "epoch": 0.5788238126768194, + "grad_norm": 0.759572446346283, + "learning_rate": 7.970752345832623e-05, + "loss": 0.3392, + "step": 1202 + }, + { + "epoch": 0.5793053632697285, + "grad_norm": 1.0076072216033936, + "learning_rate": 7.9554805918318e-05, + "loss": 0.3423, + "step": 1203 + }, + { + "epoch": 0.5797869138626377, + "grad_norm": 0.4598049521446228, + "learning_rate": 7.940213812589018e-05, + "loss": 0.4327, + "step": 1204 + }, + { + "epoch": 0.5802684644555468, + "grad_norm": 0.7870919108390808, + "learning_rate": 7.92495204525165e-05, + "loss": 0.4448, + "step": 1205 + }, + { + "epoch": 0.580750015048456, + "grad_norm": 1.163459300994873, + "learning_rate": 7.909695326954878e-05, + "loss": 0.8113, + "step": 1206 + }, + { + "epoch": 0.5812315656413652, + "grad_norm": 0.6732601523399353, + "learning_rate": 7.894443694821602e-05, + "loss": 0.3975, + "step": 1207 + }, + { + "epoch": 0.5817131162342744, + "grad_norm": 0.6366457343101501, + "learning_rate": 7.879197185962339e-05, + "loss": 0.5208, + "step": 1208 + }, + { + "epoch": 0.5821946668271836, + "grad_norm": 1.1025159358978271, + "learning_rate": 7.863955837475144e-05, + "loss": 0.5071, + "step": 1209 + }, + { + "epoch": 0.5826762174200927, + "grad_norm": 0.701779842376709, + "learning_rate": 7.848719686445515e-05, + "loss": 0.3318, + "step": 1210 + }, + { + "epoch": 0.5831577680130019, + "grad_norm": 0.9084488749504089, + "learning_rate": 7.833488769946306e-05, + "loss": 0.7136, + "step": 1211 + }, + { + "epoch": 0.583639318605911, + "grad_norm": 0.8296939730644226, + "learning_rate": 7.818263125037633e-05, + "loss": 0.5052, + "step": 1212 + }, + { + "epoch": 0.5841208691988202, + "grad_norm": 0.9085122346878052, + "learning_rate": 7.803042788766777e-05, + "loss": 0.3964, + "step": 1213 + }, + { + "epoch": 0.5846024197917293, + "grad_norm": 0.5029633641242981, + "learning_rate": 7.787827798168115e-05, + "loss": 0.2406, + "step": 1214 + }, + { + "epoch": 0.5850839703846386, + "grad_norm": 0.5231782793998718, + "learning_rate": 7.772618190263009e-05, + "loss": 0.4461, + "step": 1215 + }, + { + "epoch": 0.5855655209775477, + "grad_norm": 0.6706518530845642, + "learning_rate": 7.757414002059726e-05, + "loss": 0.2944, + "step": 1216 + }, + { + "epoch": 0.5860470715704569, + "grad_norm": 0.7979804873466492, + "learning_rate": 7.742215270553349e-05, + "loss": 0.5646, + "step": 1217 + }, + { + "epoch": 0.586528622163366, + "grad_norm": 0.8495470285415649, + "learning_rate": 7.727022032725672e-05, + "loss": 0.622, + "step": 1218 + }, + { + "epoch": 0.5870101727562752, + "grad_norm": 0.42500513792037964, + "learning_rate": 7.711834325545135e-05, + "loss": 0.3364, + "step": 1219 + }, + { + "epoch": 0.5874917233491844, + "grad_norm": 0.32045841217041016, + "learning_rate": 7.696652185966711e-05, + "loss": 0.3829, + "step": 1220 + }, + { + "epoch": 0.5879732739420935, + "grad_norm": 0.9677326083183289, + "learning_rate": 7.681475650931834e-05, + "loss": 0.4683, + "step": 1221 + }, + { + "epoch": 0.5884548245350028, + "grad_norm": 0.9780142307281494, + "learning_rate": 7.666304757368297e-05, + "loss": 0.616, + "step": 1222 + }, + { + "epoch": 0.5889363751279119, + "grad_norm": 0.37944215536117554, + "learning_rate": 7.651139542190164e-05, + "loss": 0.3803, + "step": 1223 + }, + { + "epoch": 0.5894179257208211, + "grad_norm": 1.0254456996917725, + "learning_rate": 7.635980042297687e-05, + "loss": 0.3009, + "step": 1224 + }, + { + "epoch": 0.5898994763137302, + "grad_norm": 0.8622443675994873, + "learning_rate": 7.620826294577208e-05, + "loss": 0.472, + "step": 1225 + }, + { + "epoch": 0.5903810269066394, + "grad_norm": 0.894799530506134, + "learning_rate": 7.605678335901071e-05, + "loss": 0.2618, + "step": 1226 + }, + { + "epoch": 0.5908625774995485, + "grad_norm": 0.7913607954978943, + "learning_rate": 7.59053620312754e-05, + "loss": 0.8074, + "step": 1227 + }, + { + "epoch": 0.5913441280924577, + "grad_norm": 0.2668255865573883, + "learning_rate": 7.575399933100697e-05, + "loss": 0.2157, + "step": 1228 + }, + { + "epoch": 0.5918256786853668, + "grad_norm": 0.6901659965515137, + "learning_rate": 7.560269562650368e-05, + "loss": 0.4497, + "step": 1229 + }, + { + "epoch": 0.5923072292782761, + "grad_norm": 0.5438380241394043, + "learning_rate": 7.54514512859201e-05, + "loss": 0.1374, + "step": 1230 + }, + { + "epoch": 0.5927887798711852, + "grad_norm": 2.4272823333740234, + "learning_rate": 7.530026667726645e-05, + "loss": 0.3198, + "step": 1231 + }, + { + "epoch": 0.5932703304640944, + "grad_norm": 0.6136943697929382, + "learning_rate": 7.51491421684076e-05, + "loss": 0.825, + "step": 1232 + }, + { + "epoch": 0.5937518810570036, + "grad_norm": 0.9989371299743652, + "learning_rate": 7.49980781270622e-05, + "loss": 0.5049, + "step": 1233 + }, + { + "epoch": 0.5942334316499127, + "grad_norm": 0.8432647585868835, + "learning_rate": 7.484707492080172e-05, + "loss": 0.6384, + "step": 1234 + }, + { + "epoch": 0.5947149822428219, + "grad_norm": 0.94101482629776, + "learning_rate": 7.469613291704962e-05, + "loss": 0.8086, + "step": 1235 + }, + { + "epoch": 0.595196532835731, + "grad_norm": 0.645421028137207, + "learning_rate": 7.45452524830805e-05, + "loss": 0.4266, + "step": 1236 + }, + { + "epoch": 0.5956780834286403, + "grad_norm": 0.7342570424079895, + "learning_rate": 7.439443398601903e-05, + "loss": 0.3519, + "step": 1237 + }, + { + "epoch": 0.5961596340215494, + "grad_norm": 0.994813859462738, + "learning_rate": 7.424367779283926e-05, + "loss": 0.7938, + "step": 1238 + }, + { + "epoch": 0.5966411846144586, + "grad_norm": 1.16128408908844, + "learning_rate": 7.409298427036364e-05, + "loss": 0.3157, + "step": 1239 + }, + { + "epoch": 0.5971227352073677, + "grad_norm": 0.6799389123916626, + "learning_rate": 7.39423537852621e-05, + "loss": 0.3694, + "step": 1240 + }, + { + "epoch": 0.5976042858002769, + "grad_norm": 1.0635805130004883, + "learning_rate": 7.379178670405123e-05, + "loss": 0.4749, + "step": 1241 + }, + { + "epoch": 0.598085836393186, + "grad_norm": 1.7486920356750488, + "learning_rate": 7.364128339309326e-05, + "loss": 0.2807, + "step": 1242 + }, + { + "epoch": 0.5985673869860952, + "grad_norm": 1.2017595767974854, + "learning_rate": 7.349084421859533e-05, + "loss": 0.5348, + "step": 1243 + }, + { + "epoch": 0.5990489375790043, + "grad_norm": 0.5318276882171631, + "learning_rate": 7.334046954660852e-05, + "loss": 0.3619, + "step": 1244 + }, + { + "epoch": 0.5995304881719136, + "grad_norm": 0.8598198890686035, + "learning_rate": 7.31901597430269e-05, + "loss": 0.506, + "step": 1245 + }, + { + "epoch": 0.6000120387648227, + "grad_norm": 0.9996898770332336, + "learning_rate": 7.303991517358678e-05, + "loss": 0.6772, + "step": 1246 + }, + { + "epoch": 0.6004935893577319, + "grad_norm": 1.0808875560760498, + "learning_rate": 7.288973620386568e-05, + "loss": 0.6257, + "step": 1247 + }, + { + "epoch": 0.6009751399506411, + "grad_norm": 1.6466535329818726, + "learning_rate": 7.273962319928151e-05, + "loss": 0.4992, + "step": 1248 + }, + { + "epoch": 0.6014566905435502, + "grad_norm": 0.42734822630882263, + "learning_rate": 7.258957652509171e-05, + "loss": 0.4389, + "step": 1249 + }, + { + "epoch": 0.6019382411364594, + "grad_norm": 0.6383512020111084, + "learning_rate": 7.24395965463923e-05, + "loss": 0.3114, + "step": 1250 + }, + { + "epoch": 0.6024197917293685, + "grad_norm": 0.7204129099845886, + "learning_rate": 7.228968362811702e-05, + "loss": 0.2887, + "step": 1251 + }, + { + "epoch": 0.6029013423222778, + "grad_norm": 1.2571090459823608, + "learning_rate": 7.21398381350364e-05, + "loss": 0.5472, + "step": 1252 + }, + { + "epoch": 0.6033828929151869, + "grad_norm": 0.40661323070526123, + "learning_rate": 7.199006043175698e-05, + "loss": 0.6301, + "step": 1253 + }, + { + "epoch": 0.6038644435080961, + "grad_norm": 0.24985218048095703, + "learning_rate": 7.184035088272028e-05, + "loss": 0.1607, + "step": 1254 + }, + { + "epoch": 0.6043459941010052, + "grad_norm": 0.6477149724960327, + "learning_rate": 7.169070985220208e-05, + "loss": 0.3072, + "step": 1255 + }, + { + "epoch": 0.6048275446939144, + "grad_norm": 1.7258812189102173, + "learning_rate": 7.154113770431132e-05, + "loss": 0.5717, + "step": 1256 + }, + { + "epoch": 0.6053090952868235, + "grad_norm": 0.40395277738571167, + "learning_rate": 7.13916348029894e-05, + "loss": 0.4178, + "step": 1257 + }, + { + "epoch": 0.6057906458797327, + "grad_norm": 1.11382257938385, + "learning_rate": 7.124220151200926e-05, + "loss": 0.3039, + "step": 1258 + }, + { + "epoch": 0.6062721964726419, + "grad_norm": 1.0327476263046265, + "learning_rate": 7.10928381949744e-05, + "loss": 0.2497, + "step": 1259 + }, + { + "epoch": 0.6067537470655511, + "grad_norm": 1.168166995048523, + "learning_rate": 7.094354521531807e-05, + "loss": 0.592, + "step": 1260 + }, + { + "epoch": 0.6072352976584603, + "grad_norm": 0.7183307409286499, + "learning_rate": 7.079432293630244e-05, + "loss": 0.7905, + "step": 1261 + }, + { + "epoch": 0.6077168482513694, + "grad_norm": 0.63597172498703, + "learning_rate": 7.064517172101753e-05, + "loss": 0.6588, + "step": 1262 + }, + { + "epoch": 0.6081983988442786, + "grad_norm": 0.5656642317771912, + "learning_rate": 7.04960919323806e-05, + "loss": 0.2496, + "step": 1263 + }, + { + "epoch": 0.6086799494371877, + "grad_norm": 0.7504541873931885, + "learning_rate": 7.034708393313493e-05, + "loss": 0.6481, + "step": 1264 + }, + { + "epoch": 0.609161500030097, + "grad_norm": 0.9031791687011719, + "learning_rate": 7.019814808584928e-05, + "loss": 0.5999, + "step": 1265 + }, + { + "epoch": 0.609643050623006, + "grad_norm": 0.44127368927001953, + "learning_rate": 7.004928475291678e-05, + "loss": 0.3241, + "step": 1266 + }, + { + "epoch": 0.6101246012159153, + "grad_norm": 1.1644855737686157, + "learning_rate": 6.990049429655412e-05, + "loss": 0.7637, + "step": 1267 + }, + { + "epoch": 0.6106061518088244, + "grad_norm": 0.7400465607643127, + "learning_rate": 6.97517770788007e-05, + "loss": 0.7192, + "step": 1268 + }, + { + "epoch": 0.6110877024017336, + "grad_norm": 0.4853924810886383, + "learning_rate": 6.960313346151761e-05, + "loss": 0.5061, + "step": 1269 + }, + { + "epoch": 0.6115692529946427, + "grad_norm": 0.9253974556922913, + "learning_rate": 6.9454563806387e-05, + "loss": 0.7121, + "step": 1270 + }, + { + "epoch": 0.6120508035875519, + "grad_norm": 0.42263108491897583, + "learning_rate": 6.930606847491094e-05, + "loss": 0.5735, + "step": 1271 + }, + { + "epoch": 0.612532354180461, + "grad_norm": 0.6725876927375793, + "learning_rate": 6.915764782841072e-05, + "loss": 0.5999, + "step": 1272 + }, + { + "epoch": 0.6130139047733703, + "grad_norm": 0.6031161546707153, + "learning_rate": 6.900930222802588e-05, + "loss": 0.4829, + "step": 1273 + }, + { + "epoch": 0.6134954553662795, + "grad_norm": 0.7813351154327393, + "learning_rate": 6.886103203471337e-05, + "loss": 0.5447, + "step": 1274 + }, + { + "epoch": 0.6139770059591886, + "grad_norm": 0.8013120889663696, + "learning_rate": 6.871283760924665e-05, + "loss": 0.4516, + "step": 1275 + }, + { + "epoch": 0.6144585565520978, + "grad_norm": 0.6383388042449951, + "learning_rate": 6.856471931221478e-05, + "loss": 0.5968, + "step": 1276 + }, + { + "epoch": 0.6149401071450069, + "grad_norm": 0.782723605632782, + "learning_rate": 6.841667750402162e-05, + "loss": 0.2112, + "step": 1277 + }, + { + "epoch": 0.6154216577379161, + "grad_norm": 0.4500112235546112, + "learning_rate": 6.826871254488496e-05, + "loss": 0.4242, + "step": 1278 + }, + { + "epoch": 0.6159032083308252, + "grad_norm": 0.34776097536087036, + "learning_rate": 6.812082479483553e-05, + "loss": 0.1855, + "step": 1279 + }, + { + "epoch": 0.6163847589237345, + "grad_norm": 0.7532259225845337, + "learning_rate": 6.797301461371625e-05, + "loss": 0.4524, + "step": 1280 + }, + { + "epoch": 0.6168663095166436, + "grad_norm": 1.1935185194015503, + "learning_rate": 6.782528236118124e-05, + "loss": 0.3109, + "step": 1281 + }, + { + "epoch": 0.6173478601095528, + "grad_norm": 0.6757370829582214, + "learning_rate": 6.767762839669503e-05, + "loss": 0.5126, + "step": 1282 + }, + { + "epoch": 0.6178294107024619, + "grad_norm": 1.0861924886703491, + "learning_rate": 6.753005307953167e-05, + "loss": 0.637, + "step": 1283 + }, + { + "epoch": 0.6183109612953711, + "grad_norm": 1.4494441747665405, + "learning_rate": 6.738255676877381e-05, + "loss": 0.3537, + "step": 1284 + }, + { + "epoch": 0.6187925118882802, + "grad_norm": 1.862773060798645, + "learning_rate": 6.723513982331195e-05, + "loss": 0.5789, + "step": 1285 + }, + { + "epoch": 0.6192740624811894, + "grad_norm": 1.0698832273483276, + "learning_rate": 6.708780260184333e-05, + "loss": 0.3281, + "step": 1286 + }, + { + "epoch": 0.6197556130740985, + "grad_norm": 0.7767219543457031, + "learning_rate": 6.694054546287132e-05, + "loss": 0.2786, + "step": 1287 + }, + { + "epoch": 0.6202371636670078, + "grad_norm": 0.8477858304977417, + "learning_rate": 6.679336876470441e-05, + "loss": 0.6544, + "step": 1288 + }, + { + "epoch": 0.620718714259917, + "grad_norm": 0.9157706499099731, + "learning_rate": 6.664627286545535e-05, + "loss": 0.7001, + "step": 1289 + }, + { + "epoch": 0.6212002648528261, + "grad_norm": 0.7171075344085693, + "learning_rate": 6.649925812304025e-05, + "loss": 0.3359, + "step": 1290 + }, + { + "epoch": 0.6216818154457353, + "grad_norm": 0.5645914673805237, + "learning_rate": 6.635232489517782e-05, + "loss": 0.6452, + "step": 1291 + }, + { + "epoch": 0.6221633660386444, + "grad_norm": 0.7226665019989014, + "learning_rate": 6.620547353938836e-05, + "loss": 0.4353, + "step": 1292 + }, + { + "epoch": 0.6226449166315536, + "grad_norm": 0.5722493529319763, + "learning_rate": 6.605870441299302e-05, + "loss": 0.4712, + "step": 1293 + }, + { + "epoch": 0.6231264672244627, + "grad_norm": 2.9842445850372314, + "learning_rate": 6.591201787311285e-05, + "loss": 0.7964, + "step": 1294 + }, + { + "epoch": 0.623608017817372, + "grad_norm": 0.7371240258216858, + "learning_rate": 6.57654142766679e-05, + "loss": 0.4837, + "step": 1295 + }, + { + "epoch": 0.6240895684102811, + "grad_norm": 0.6846769452095032, + "learning_rate": 6.561889398037643e-05, + "loss": 0.3781, + "step": 1296 + }, + { + "epoch": 0.6245711190031903, + "grad_norm": 0.618120014667511, + "learning_rate": 6.547245734075403e-05, + "loss": 0.2455, + "step": 1297 + }, + { + "epoch": 0.6250526695960994, + "grad_norm": 0.5149625539779663, + "learning_rate": 6.532610471411274e-05, + "loss": 0.233, + "step": 1298 + }, + { + "epoch": 0.6255342201890086, + "grad_norm": 1.0907325744628906, + "learning_rate": 6.517983645656014e-05, + "loss": 0.4209, + "step": 1299 + }, + { + "epoch": 0.6260157707819177, + "grad_norm": 0.8387037515640259, + "learning_rate": 6.503365292399857e-05, + "loss": 0.8553, + "step": 1300 + }, + { + "epoch": 0.6264973213748269, + "grad_norm": 0.6564305424690247, + "learning_rate": 6.488755447212418e-05, + "loss": 0.4474, + "step": 1301 + }, + { + "epoch": 0.6269788719677362, + "grad_norm": 0.5387388467788696, + "learning_rate": 6.474154145642612e-05, + "loss": 0.9949, + "step": 1302 + }, + { + "epoch": 0.6274604225606453, + "grad_norm": 0.6690697073936462, + "learning_rate": 6.459561423218561e-05, + "loss": 0.7197, + "step": 1303 + }, + { + "epoch": 0.6279419731535545, + "grad_norm": 0.7778486013412476, + "learning_rate": 6.444977315447521e-05, + "loss": 0.3269, + "step": 1304 + }, + { + "epoch": 0.6284235237464636, + "grad_norm": 0.9561223983764648, + "learning_rate": 6.430401857815776e-05, + "loss": 0.5035, + "step": 1305 + }, + { + "epoch": 0.6289050743393728, + "grad_norm": 0.6702042818069458, + "learning_rate": 6.415835085788575e-05, + "loss": 0.4164, + "step": 1306 + }, + { + "epoch": 0.6293866249322819, + "grad_norm": 1.7345411777496338, + "learning_rate": 6.401277034810017e-05, + "loss": 0.2831, + "step": 1307 + }, + { + "epoch": 0.6298681755251911, + "grad_norm": 0.5290318727493286, + "learning_rate": 6.386727740302994e-05, + "loss": 0.5828, + "step": 1308 + }, + { + "epoch": 0.6303497261181003, + "grad_norm": 0.5355732440948486, + "learning_rate": 6.37218723766909e-05, + "loss": 0.2325, + "step": 1309 + }, + { + "epoch": 0.6308312767110095, + "grad_norm": 1.2181891202926636, + "learning_rate": 6.357655562288488e-05, + "loss": 0.7642, + "step": 1310 + }, + { + "epoch": 0.6313128273039186, + "grad_norm": 0.8350191712379456, + "learning_rate": 6.343132749519902e-05, + "loss": 0.7899, + "step": 1311 + }, + { + "epoch": 0.6317943778968278, + "grad_norm": 1.0079563856124878, + "learning_rate": 6.328618834700474e-05, + "loss": 0.7318, + "step": 1312 + }, + { + "epoch": 0.6322759284897369, + "grad_norm": 0.9546399116516113, + "learning_rate": 6.314113853145703e-05, + "loss": 0.4967, + "step": 1313 + }, + { + "epoch": 0.6327574790826461, + "grad_norm": 0.5168749094009399, + "learning_rate": 6.299617840149349e-05, + "loss": 0.385, + "step": 1314 + }, + { + "epoch": 0.6332390296755553, + "grad_norm": 0.40252628922462463, + "learning_rate": 6.285130830983339e-05, + "loss": 0.1764, + "step": 1315 + }, + { + "epoch": 0.6337205802684645, + "grad_norm": 1.2077138423919678, + "learning_rate": 6.270652860897704e-05, + "loss": 0.6442, + "step": 1316 + }, + { + "epoch": 0.6342021308613737, + "grad_norm": 0.8444836139678955, + "learning_rate": 6.25618396512048e-05, + "loss": 0.4704, + "step": 1317 + }, + { + "epoch": 0.6346836814542828, + "grad_norm": 0.5546010732650757, + "learning_rate": 6.24172417885762e-05, + "loss": 0.486, + "step": 1318 + }, + { + "epoch": 0.635165232047192, + "grad_norm": 0.6542741060256958, + "learning_rate": 6.227273537292911e-05, + "loss": 0.6625, + "step": 1319 + }, + { + "epoch": 0.6356467826401011, + "grad_norm": 1.0051288604736328, + "learning_rate": 6.212832075587891e-05, + "loss": 0.2804, + "step": 1320 + }, + { + "epoch": 0.6361283332330103, + "grad_norm": 1.0045028924942017, + "learning_rate": 6.19839982888176e-05, + "loss": 0.3209, + "step": 1321 + }, + { + "epoch": 0.6366098838259194, + "grad_norm": 1.248894453048706, + "learning_rate": 6.183976832291296e-05, + "loss": 0.3171, + "step": 1322 + }, + { + "epoch": 0.6370914344188287, + "grad_norm": 0.5783565640449524, + "learning_rate": 6.169563120910775e-05, + "loss": 0.6025, + "step": 1323 + }, + { + "epoch": 0.6375729850117378, + "grad_norm": 1.0445884466171265, + "learning_rate": 6.155158729811867e-05, + "loss": 0.5089, + "step": 1324 + }, + { + "epoch": 0.638054535604647, + "grad_norm": 0.43559569120407104, + "learning_rate": 6.140763694043578e-05, + "loss": 0.3683, + "step": 1325 + }, + { + "epoch": 0.6385360861975561, + "grad_norm": 0.41853922605514526, + "learning_rate": 6.126378048632139e-05, + "loss": 0.6438, + "step": 1326 + }, + { + "epoch": 0.6390176367904653, + "grad_norm": 0.5190677046775818, + "learning_rate": 6.112001828580944e-05, + "loss": 0.7269, + "step": 1327 + }, + { + "epoch": 0.6394991873833744, + "grad_norm": 0.6790060997009277, + "learning_rate": 6.0976350688704455e-05, + "loss": 0.1729, + "step": 1328 + }, + { + "epoch": 0.6399807379762836, + "grad_norm": 0.9905605316162109, + "learning_rate": 6.083277804458072e-05, + "loss": 0.803, + "step": 1329 + }, + { + "epoch": 0.6404622885691929, + "grad_norm": 1.024671196937561, + "learning_rate": 6.068930070278159e-05, + "loss": 0.4274, + "step": 1330 + }, + { + "epoch": 0.640943839162102, + "grad_norm": 0.5543073415756226, + "learning_rate": 6.054591901241846e-05, + "loss": 0.4442, + "step": 1331 + }, + { + "epoch": 0.6414253897550112, + "grad_norm": 0.5827720761299133, + "learning_rate": 6.040263332237002e-05, + "loss": 0.6295, + "step": 1332 + }, + { + "epoch": 0.6419069403479203, + "grad_norm": 0.47777485847473145, + "learning_rate": 6.025944398128137e-05, + "loss": 0.4328, + "step": 1333 + }, + { + "epoch": 0.6423884909408295, + "grad_norm": 1.5523085594177246, + "learning_rate": 6.011635133756309e-05, + "loss": 0.3494, + "step": 1334 + }, + { + "epoch": 0.6428700415337386, + "grad_norm": 0.43694812059402466, + "learning_rate": 5.99733557393906e-05, + "loss": 0.686, + "step": 1335 + }, + { + "epoch": 0.6433515921266478, + "grad_norm": 0.913146436214447, + "learning_rate": 5.983045753470308e-05, + "loss": 0.7183, + "step": 1336 + }, + { + "epoch": 0.6438331427195569, + "grad_norm": 0.566568911075592, + "learning_rate": 5.96876570712028e-05, + "loss": 0.8939, + "step": 1337 + }, + { + "epoch": 0.6443146933124662, + "grad_norm": 0.5812140703201294, + "learning_rate": 5.954495469635417e-05, + "loss": 0.3638, + "step": 1338 + }, + { + "epoch": 0.6447962439053753, + "grad_norm": 1.3928197622299194, + "learning_rate": 5.940235075738296e-05, + "loss": 0.5093, + "step": 1339 + }, + { + "epoch": 0.6452777944982845, + "grad_norm": 0.8183367848396301, + "learning_rate": 5.925984560127542e-05, + "loss": 0.8822, + "step": 1340 + }, + { + "epoch": 0.6457593450911936, + "grad_norm": 2.0084786415100098, + "learning_rate": 5.911743957477739e-05, + "loss": 1.0481, + "step": 1341 + }, + { + "epoch": 0.6462408956841028, + "grad_norm": 1.2464501857757568, + "learning_rate": 5.897513302439355e-05, + "loss": 0.5321, + "step": 1342 + }, + { + "epoch": 0.646722446277012, + "grad_norm": 0.937422513961792, + "learning_rate": 5.883292629638651e-05, + "loss": 0.8166, + "step": 1343 + }, + { + "epoch": 0.6472039968699211, + "grad_norm": 0.3697885274887085, + "learning_rate": 5.869081973677604e-05, + "loss": 0.4237, + "step": 1344 + }, + { + "epoch": 0.6476855474628304, + "grad_norm": 0.9014120697975159, + "learning_rate": 5.8548813691338134e-05, + "loss": 0.9163, + "step": 1345 + }, + { + "epoch": 0.6481670980557395, + "grad_norm": 0.7115447521209717, + "learning_rate": 5.84069085056042e-05, + "loss": 0.779, + "step": 1346 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.7296338677406311, + "learning_rate": 5.826510452486027e-05, + "loss": 0.3634, + "step": 1347 + }, + { + "epoch": 0.6491301992415578, + "grad_norm": 0.5319170951843262, + "learning_rate": 5.81234020941461e-05, + "loss": 0.3393, + "step": 1348 + }, + { + "epoch": 0.649611749834467, + "grad_norm": 1.6224805116653442, + "learning_rate": 5.798180155825437e-05, + "loss": 0.8028, + "step": 1349 + }, + { + "epoch": 0.6500933004273761, + "grad_norm": 0.7396529316902161, + "learning_rate": 5.784030326172981e-05, + "loss": 0.8123, + "step": 1350 + }, + { + "epoch": 0.6505748510202853, + "grad_norm": 0.44958508014678955, + "learning_rate": 5.7698907548868395e-05, + "loss": 0.2658, + "step": 1351 + }, + { + "epoch": 0.6510564016131944, + "grad_norm": 0.33610963821411133, + "learning_rate": 5.755761476371653e-05, + "loss": 0.3137, + "step": 1352 + }, + { + "epoch": 0.6515379522061037, + "grad_norm": 0.8860417008399963, + "learning_rate": 5.741642525007003e-05, + "loss": 0.9078, + "step": 1353 + }, + { + "epoch": 0.6520195027990128, + "grad_norm": 1.453182339668274, + "learning_rate": 5.727533935147359e-05, + "loss": 0.5685, + "step": 1354 + }, + { + "epoch": 0.652501053391922, + "grad_norm": 0.7702388167381287, + "learning_rate": 5.713435741121975e-05, + "loss": 0.3646, + "step": 1355 + }, + { + "epoch": 0.6529826039848312, + "grad_norm": 0.399646133184433, + "learning_rate": 5.699347977234799e-05, + "loss": 0.5033, + "step": 1356 + }, + { + "epoch": 0.6534641545777403, + "grad_norm": 0.7341455221176147, + "learning_rate": 5.685270677764412e-05, + "loss": 0.3869, + "step": 1357 + }, + { + "epoch": 0.6539457051706495, + "grad_norm": 0.4470628798007965, + "learning_rate": 5.671203876963931e-05, + "loss": 0.4914, + "step": 1358 + }, + { + "epoch": 0.6544272557635586, + "grad_norm": 1.1166383028030396, + "learning_rate": 5.657147609060924e-05, + "loss": 0.5032, + "step": 1359 + }, + { + "epoch": 0.6549088063564679, + "grad_norm": 1.1750547885894775, + "learning_rate": 5.643101908257333e-05, + "loss": 0.8003, + "step": 1360 + }, + { + "epoch": 0.655390356949377, + "grad_norm": 0.3405994176864624, + "learning_rate": 5.629066808729385e-05, + "loss": 0.1429, + "step": 1361 + }, + { + "epoch": 0.6558719075422862, + "grad_norm": 0.8223943710327148, + "learning_rate": 5.6150423446275144e-05, + "loss": 0.4109, + "step": 1362 + }, + { + "epoch": 0.6563534581351953, + "grad_norm": 1.1468275785446167, + "learning_rate": 5.601028550076277e-05, + "loss": 0.3162, + "step": 1363 + }, + { + "epoch": 0.6568350087281045, + "grad_norm": 0.4989483058452606, + "learning_rate": 5.587025459174271e-05, + "loss": 0.2402, + "step": 1364 + }, + { + "epoch": 0.6573165593210136, + "grad_norm": 0.6205065846443176, + "learning_rate": 5.573033105994038e-05, + "loss": 0.2549, + "step": 1365 + }, + { + "epoch": 0.6577981099139228, + "grad_norm": 0.6344619989395142, + "learning_rate": 5.559051524582002e-05, + "loss": 0.7323, + "step": 1366 + }, + { + "epoch": 0.658279660506832, + "grad_norm": 0.528626024723053, + "learning_rate": 5.5450807489583777e-05, + "loss": 0.5712, + "step": 1367 + }, + { + "epoch": 0.6587612110997412, + "grad_norm": 0.47747474908828735, + "learning_rate": 5.531120813117085e-05, + "loss": 0.1822, + "step": 1368 + }, + { + "epoch": 0.6592427616926503, + "grad_norm": 0.5578919649124146, + "learning_rate": 5.517171751025667e-05, + "loss": 0.6269, + "step": 1369 + }, + { + "epoch": 0.6597243122855595, + "grad_norm": 0.5875039100646973, + "learning_rate": 5.5032335966252103e-05, + "loss": 0.2919, + "step": 1370 + }, + { + "epoch": 0.6602058628784687, + "grad_norm": 0.6543027758598328, + "learning_rate": 5.489306383830258e-05, + "loss": 0.3357, + "step": 1371 + }, + { + "epoch": 0.6606874134713778, + "grad_norm": 0.8339424133300781, + "learning_rate": 5.475390146528738e-05, + "loss": 0.3277, + "step": 1372 + }, + { + "epoch": 0.661168964064287, + "grad_norm": 0.7859684228897095, + "learning_rate": 5.461484918581858e-05, + "loss": 0.295, + "step": 1373 + }, + { + "epoch": 0.6616505146571962, + "grad_norm": 0.5152652263641357, + "learning_rate": 5.4475907338240494e-05, + "loss": 0.218, + "step": 1374 + }, + { + "epoch": 0.6621320652501054, + "grad_norm": 0.7320000529289246, + "learning_rate": 5.43370762606287e-05, + "loss": 0.5355, + "step": 1375 + }, + { + "epoch": 0.6626136158430145, + "grad_norm": 0.6813570261001587, + "learning_rate": 5.4198356290789276e-05, + "loss": 0.5312, + "step": 1376 + }, + { + "epoch": 0.6630951664359237, + "grad_norm": 1.7592676877975464, + "learning_rate": 5.405974776625785e-05, + "loss": 0.6173, + "step": 1377 + }, + { + "epoch": 0.6635767170288328, + "grad_norm": 0.4485854506492615, + "learning_rate": 5.392125102429899e-05, + "loss": 0.5352, + "step": 1378 + }, + { + "epoch": 0.664058267621742, + "grad_norm": 0.47546130418777466, + "learning_rate": 5.378286640190522e-05, + "loss": 0.4701, + "step": 1379 + }, + { + "epoch": 0.6645398182146511, + "grad_norm": 0.6833990216255188, + "learning_rate": 5.364459423579629e-05, + "loss": 0.2812, + "step": 1380 + }, + { + "epoch": 0.6650213688075604, + "grad_norm": 0.43509840965270996, + "learning_rate": 5.350643486241825e-05, + "loss": 0.2162, + "step": 1381 + }, + { + "epoch": 0.6655029194004695, + "grad_norm": 0.32101714611053467, + "learning_rate": 5.33683886179428e-05, + "loss": 0.2993, + "step": 1382 + }, + { + "epoch": 0.6659844699933787, + "grad_norm": 0.4640980660915375, + "learning_rate": 5.3230455838266266e-05, + "loss": 0.2373, + "step": 1383 + }, + { + "epoch": 0.6664660205862879, + "grad_norm": 0.7368037700653076, + "learning_rate": 5.309263685900898e-05, + "loss": 0.1837, + "step": 1384 + }, + { + "epoch": 0.666947571179197, + "grad_norm": 0.4618796110153198, + "learning_rate": 5.295493201551433e-05, + "loss": 0.1103, + "step": 1385 + }, + { + "epoch": 0.6674291217721062, + "grad_norm": 0.5317411422729492, + "learning_rate": 5.281734164284802e-05, + "loss": 0.5675, + "step": 1386 + }, + { + "epoch": 0.6679106723650153, + "grad_norm": 0.8421313166618347, + "learning_rate": 5.26798660757971e-05, + "loss": 0.8043, + "step": 1387 + }, + { + "epoch": 0.6683922229579246, + "grad_norm": 0.6979146003723145, + "learning_rate": 5.2542505648869434e-05, + "loss": 0.5956, + "step": 1388 + }, + { + "epoch": 0.6688737735508337, + "grad_norm": 0.9298615455627441, + "learning_rate": 5.240526069629265e-05, + "loss": 0.6499, + "step": 1389 + }, + { + "epoch": 0.6693553241437429, + "grad_norm": 0.9852579832077026, + "learning_rate": 5.22681315520134e-05, + "loss": 0.9238, + "step": 1390 + }, + { + "epoch": 0.669836874736652, + "grad_norm": 0.6682729125022888, + "learning_rate": 5.213111854969661e-05, + "loss": 0.3948, + "step": 1391 + }, + { + "epoch": 0.6703184253295612, + "grad_norm": 2.567547082901001, + "learning_rate": 5.199422202272448e-05, + "loss": 0.2861, + "step": 1392 + }, + { + "epoch": 0.6707999759224703, + "grad_norm": 0.8241885304450989, + "learning_rate": 5.185744230419589e-05, + "loss": 0.4432, + "step": 1393 + }, + { + "epoch": 0.6712815265153795, + "grad_norm": 0.4601392447948456, + "learning_rate": 5.172077972692553e-05, + "loss": 0.4492, + "step": 1394 + }, + { + "epoch": 0.6717630771082886, + "grad_norm": 0.5011596083641052, + "learning_rate": 5.1584234623442974e-05, + "loss": 0.4165, + "step": 1395 + }, + { + "epoch": 0.6722446277011979, + "grad_norm": 0.47886374592781067, + "learning_rate": 5.1447807325992025e-05, + "loss": 0.1681, + "step": 1396 + }, + { + "epoch": 0.6727261782941071, + "grad_norm": 0.5967696309089661, + "learning_rate": 5.13114981665298e-05, + "loss": 0.7393, + "step": 1397 + }, + { + "epoch": 0.6732077288870162, + "grad_norm": 0.3016923666000366, + "learning_rate": 5.117530747672603e-05, + "loss": 0.5512, + "step": 1398 + }, + { + "epoch": 0.6736892794799254, + "grad_norm": 1.5658845901489258, + "learning_rate": 5.103923558796203e-05, + "loss": 0.6599, + "step": 1399 + }, + { + "epoch": 0.6741708300728345, + "grad_norm": 0.9608722925186157, + "learning_rate": 5.090328283133019e-05, + "loss": 0.3163, + "step": 1400 + }, + { + "epoch": 0.6746523806657437, + "grad_norm": 0.9786484241485596, + "learning_rate": 5.0767449537632986e-05, + "loss": 0.5629, + "step": 1401 + }, + { + "epoch": 0.6751339312586528, + "grad_norm": 0.482991486787796, + "learning_rate": 5.06317360373822e-05, + "loss": 0.2949, + "step": 1402 + }, + { + "epoch": 0.6756154818515621, + "grad_norm": 0.43300551176071167, + "learning_rate": 5.049614266079813e-05, + "loss": 0.1639, + "step": 1403 + }, + { + "epoch": 0.6760970324444712, + "grad_norm": 0.6290706396102905, + "learning_rate": 5.036066973780882e-05, + "loss": 0.3752, + "step": 1404 + }, + { + "epoch": 0.6765785830373804, + "grad_norm": 0.8291651606559753, + "learning_rate": 5.022531759804918e-05, + "loss": 0.6369, + "step": 1405 + }, + { + "epoch": 0.6770601336302895, + "grad_norm": 2.136549472808838, + "learning_rate": 5.009008657086025e-05, + "loss": 0.7522, + "step": 1406 + }, + { + "epoch": 0.6775416842231987, + "grad_norm": 1.0618244409561157, + "learning_rate": 4.9954976985288395e-05, + "loss": 0.407, + "step": 1407 + }, + { + "epoch": 0.6780232348161078, + "grad_norm": 1.9885526895523071, + "learning_rate": 4.981998917008448e-05, + "loss": 0.3765, + "step": 1408 + }, + { + "epoch": 0.678504785409017, + "grad_norm": 0.5625970363616943, + "learning_rate": 4.9685123453703e-05, + "loss": 0.501, + "step": 1409 + }, + { + "epoch": 0.6789863360019261, + "grad_norm": 1.6479942798614502, + "learning_rate": 4.955038016430149e-05, + "loss": 0.2305, + "step": 1410 + }, + { + "epoch": 0.6794678865948354, + "grad_norm": 0.7730649709701538, + "learning_rate": 4.9415759629739455e-05, + "loss": 0.3156, + "step": 1411 + }, + { + "epoch": 0.6799494371877446, + "grad_norm": 1.4092258214950562, + "learning_rate": 4.928126217757782e-05, + "loss": 0.9064, + "step": 1412 + }, + { + "epoch": 0.6804309877806537, + "grad_norm": 0.9801766276359558, + "learning_rate": 4.914688813507797e-05, + "loss": 0.7185, + "step": 1413 + }, + { + "epoch": 0.6809125383735629, + "grad_norm": 0.7995579838752747, + "learning_rate": 4.901263782920105e-05, + "loss": 0.4971, + "step": 1414 + }, + { + "epoch": 0.681394088966472, + "grad_norm": 0.7919400930404663, + "learning_rate": 4.887851158660706e-05, + "loss": 0.2881, + "step": 1415 + }, + { + "epoch": 0.6818756395593812, + "grad_norm": 0.809403121471405, + "learning_rate": 4.8744509733654184e-05, + "loss": 0.6397, + "step": 1416 + }, + { + "epoch": 0.6823571901522903, + "grad_norm": 0.5449767112731934, + "learning_rate": 4.861063259639793e-05, + "loss": 0.5728, + "step": 1417 + }, + { + "epoch": 0.6828387407451996, + "grad_norm": 0.7562658190727234, + "learning_rate": 4.847688050059033e-05, + "loss": 0.334, + "step": 1418 + }, + { + "epoch": 0.6833202913381087, + "grad_norm": 0.6870490908622742, + "learning_rate": 4.8343253771679155e-05, + "loss": 0.4203, + "step": 1419 + }, + { + "epoch": 0.6838018419310179, + "grad_norm": 0.9385795593261719, + "learning_rate": 4.82097527348072e-05, + "loss": 0.7597, + "step": 1420 + }, + { + "epoch": 0.684283392523927, + "grad_norm": 2.248929500579834, + "learning_rate": 4.8076377714811284e-05, + "loss": 0.7038, + "step": 1421 + }, + { + "epoch": 0.6847649431168362, + "grad_norm": 0.3908019959926605, + "learning_rate": 4.7943129036221735e-05, + "loss": 0.3123, + "step": 1422 + }, + { + "epoch": 0.6852464937097453, + "grad_norm": 0.7321159243583679, + "learning_rate": 4.781000702326142e-05, + "loss": 0.2466, + "step": 1423 + }, + { + "epoch": 0.6857280443026545, + "grad_norm": 0.7561706900596619, + "learning_rate": 4.767701199984497e-05, + "loss": 0.4239, + "step": 1424 + }, + { + "epoch": 0.6862095948955638, + "grad_norm": 0.39677685499191284, + "learning_rate": 4.7544144289578066e-05, + "loss": 0.2374, + "step": 1425 + }, + { + "epoch": 0.6866911454884729, + "grad_norm": 0.7446651458740234, + "learning_rate": 4.7411404215756594e-05, + "loss": 0.3571, + "step": 1426 + }, + { + "epoch": 0.6871726960813821, + "grad_norm": 0.6086783409118652, + "learning_rate": 4.7278792101365866e-05, + "loss": 0.5634, + "step": 1427 + }, + { + "epoch": 0.6876542466742912, + "grad_norm": 0.7406178712844849, + "learning_rate": 4.714630826907985e-05, + "loss": 0.4682, + "step": 1428 + }, + { + "epoch": 0.6881357972672004, + "grad_norm": 0.7877326011657715, + "learning_rate": 4.701395304126038e-05, + "loss": 0.8032, + "step": 1429 + }, + { + "epoch": 0.6886173478601095, + "grad_norm": 0.7456687092781067, + "learning_rate": 4.6881726739956375e-05, + "loss": 0.3922, + "step": 1430 + }, + { + "epoch": 0.6890988984530187, + "grad_norm": 0.5746781826019287, + "learning_rate": 4.6749629686902984e-05, + "loss": 0.9062, + "step": 1431 + }, + { + "epoch": 0.6895804490459279, + "grad_norm": 0.4844946563243866, + "learning_rate": 4.661766220352097e-05, + "loss": 0.2085, + "step": 1432 + }, + { + "epoch": 0.6900619996388371, + "grad_norm": 0.7245216965675354, + "learning_rate": 4.64858246109157e-05, + "loss": 0.7562, + "step": 1433 + }, + { + "epoch": 0.6905435502317462, + "grad_norm": 0.5512113571166992, + "learning_rate": 4.63541172298766e-05, + "loss": 0.617, + "step": 1434 + }, + { + "epoch": 0.6910251008246554, + "grad_norm": 0.6752376556396484, + "learning_rate": 4.622254038087622e-05, + "loss": 0.3755, + "step": 1435 + }, + { + "epoch": 0.6915066514175645, + "grad_norm": 0.32432499527931213, + "learning_rate": 4.60910943840695e-05, + "loss": 0.1497, + "step": 1436 + }, + { + "epoch": 0.6919882020104737, + "grad_norm": 0.6702204346656799, + "learning_rate": 4.5959779559292985e-05, + "loss": 0.2568, + "step": 1437 + }, + { + "epoch": 0.6924697526033828, + "grad_norm": 0.45060908794403076, + "learning_rate": 4.582859622606406e-05, + "loss": 0.5037, + "step": 1438 + }, + { + "epoch": 0.6929513031962921, + "grad_norm": 0.6259094476699829, + "learning_rate": 4.569754470358014e-05, + "loss": 0.7697, + "step": 1439 + }, + { + "epoch": 0.6934328537892013, + "grad_norm": 0.5580172538757324, + "learning_rate": 4.556662531071796e-05, + "loss": 0.1985, + "step": 1440 + }, + { + "epoch": 0.6939144043821104, + "grad_norm": 0.7737247943878174, + "learning_rate": 4.54358383660327e-05, + "loss": 0.3384, + "step": 1441 + }, + { + "epoch": 0.6943959549750196, + "grad_norm": 1.8386163711547852, + "learning_rate": 4.530518418775733e-05, + "loss": 0.6139, + "step": 1442 + }, + { + "epoch": 0.6948775055679287, + "grad_norm": 1.2202191352844238, + "learning_rate": 4.5174663093801674e-05, + "loss": 0.8419, + "step": 1443 + }, + { + "epoch": 0.6953590561608379, + "grad_norm": 0.33283334970474243, + "learning_rate": 4.504427540175181e-05, + "loss": 0.3924, + "step": 1444 + }, + { + "epoch": 0.695840606753747, + "grad_norm": 0.3331388235092163, + "learning_rate": 4.491402142886922e-05, + "loss": 0.4334, + "step": 1445 + }, + { + "epoch": 0.6963221573466563, + "grad_norm": 0.7208126187324524, + "learning_rate": 4.4783901492089984e-05, + "loss": 0.5102, + "step": 1446 + }, + { + "epoch": 0.6968037079395654, + "grad_norm": 0.327195405960083, + "learning_rate": 4.465391590802407e-05, + "loss": 0.3288, + "step": 1447 + }, + { + "epoch": 0.6972852585324746, + "grad_norm": 1.1266981363296509, + "learning_rate": 4.4524064992954516e-05, + "loss": 0.6236, + "step": 1448 + }, + { + "epoch": 0.6977668091253837, + "grad_norm": 1.2946836948394775, + "learning_rate": 4.4394349062836736e-05, + "loss": 0.668, + "step": 1449 + }, + { + "epoch": 0.6982483597182929, + "grad_norm": 0.7417237758636475, + "learning_rate": 4.4264768433297565e-05, + "loss": 0.3679, + "step": 1450 + }, + { + "epoch": 0.698729910311202, + "grad_norm": 0.6367408037185669, + "learning_rate": 4.4135323419634766e-05, + "loss": 0.6145, + "step": 1451 + }, + { + "epoch": 0.6992114609041112, + "grad_norm": 0.8463262915611267, + "learning_rate": 4.4006014336816035e-05, + "loss": 0.9093, + "step": 1452 + }, + { + "epoch": 0.6996930114970205, + "grad_norm": 0.14879071712493896, + "learning_rate": 4.387684149947837e-05, + "loss": 0.1764, + "step": 1453 + }, + { + "epoch": 0.7001745620899296, + "grad_norm": 1.1316916942596436, + "learning_rate": 4.374780522192726e-05, + "loss": 0.7289, + "step": 1454 + }, + { + "epoch": 0.7006561126828388, + "grad_norm": 0.4988424479961395, + "learning_rate": 4.3618905818135805e-05, + "loss": 0.1411, + "step": 1455 + }, + { + "epoch": 0.7011376632757479, + "grad_norm": 0.5783279538154602, + "learning_rate": 4.349014360174417e-05, + "loss": 0.4581, + "step": 1456 + }, + { + "epoch": 0.7016192138686571, + "grad_norm": 0.5452697277069092, + "learning_rate": 4.336151888605871e-05, + "loss": 0.2324, + "step": 1457 + }, + { + "epoch": 0.7021007644615662, + "grad_norm": 0.8302545547485352, + "learning_rate": 4.323303198405117e-05, + "loss": 0.8533, + "step": 1458 + }, + { + "epoch": 0.7025823150544754, + "grad_norm": 1.5463045835494995, + "learning_rate": 4.310468320835796e-05, + "loss": 0.4431, + "step": 1459 + }, + { + "epoch": 0.7030638656473845, + "grad_norm": 0.2754375636577606, + "learning_rate": 4.297647287127946e-05, + "loss": 0.2402, + "step": 1460 + }, + { + "epoch": 0.7035454162402938, + "grad_norm": 1.202061653137207, + "learning_rate": 4.284840128477913e-05, + "loss": 0.5382, + "step": 1461 + }, + { + "epoch": 0.7040269668332029, + "grad_norm": 0.516628623008728, + "learning_rate": 4.2720468760482854e-05, + "loss": 0.5933, + "step": 1462 + }, + { + "epoch": 0.7045085174261121, + "grad_norm": 0.35636791586875916, + "learning_rate": 4.2592675609678135e-05, + "loss": 0.1753, + "step": 1463 + }, + { + "epoch": 0.7049900680190212, + "grad_norm": 0.5912004113197327, + "learning_rate": 4.24650221433134e-05, + "loss": 0.5556, + "step": 1464 + }, + { + "epoch": 0.7054716186119304, + "grad_norm": 0.3655238449573517, + "learning_rate": 4.2337508671997086e-05, + "loss": 0.2003, + "step": 1465 + }, + { + "epoch": 0.7059531692048396, + "grad_norm": 0.7989886999130249, + "learning_rate": 4.221013550599707e-05, + "loss": 0.3002, + "step": 1466 + }, + { + "epoch": 0.7064347197977487, + "grad_norm": 1.0552719831466675, + "learning_rate": 4.208290295523984e-05, + "loss": 0.7653, + "step": 1467 + }, + { + "epoch": 0.706916270390658, + "grad_norm": 1.0404950380325317, + "learning_rate": 4.1955811329309746e-05, + "loss": 0.961, + "step": 1468 + }, + { + "epoch": 0.7073978209835671, + "grad_norm": 0.6837364435195923, + "learning_rate": 4.182886093744813e-05, + "loss": 0.9657, + "step": 1469 + }, + { + "epoch": 0.7078793715764763, + "grad_norm": 1.162905216217041, + "learning_rate": 4.170205208855281e-05, + "loss": 0.5844, + "step": 1470 + }, + { + "epoch": 0.7083609221693854, + "grad_norm": 0.7432567477226257, + "learning_rate": 4.157538509117714e-05, + "loss": 0.6088, + "step": 1471 + }, + { + "epoch": 0.7088424727622946, + "grad_norm": 1.116654634475708, + "learning_rate": 4.144886025352934e-05, + "loss": 0.3983, + "step": 1472 + }, + { + "epoch": 0.7093240233552037, + "grad_norm": 0.8999528884887695, + "learning_rate": 4.13224778834717e-05, + "loss": 0.2352, + "step": 1473 + }, + { + "epoch": 0.7098055739481129, + "grad_norm": 0.38564515113830566, + "learning_rate": 4.1196238288519874e-05, + "loss": 0.2324, + "step": 1474 + }, + { + "epoch": 0.710287124541022, + "grad_norm": 1.3238146305084229, + "learning_rate": 4.107014177584211e-05, + "loss": 0.278, + "step": 1475 + }, + { + "epoch": 0.7107686751339313, + "grad_norm": 0.8142977952957153, + "learning_rate": 4.094418865225853e-05, + "loss": 0.4148, + "step": 1476 + }, + { + "epoch": 0.7112502257268404, + "grad_norm": 0.8902896642684937, + "learning_rate": 4.081837922424027e-05, + "loss": 0.4187, + "step": 1477 + }, + { + "epoch": 0.7117317763197496, + "grad_norm": 0.6116800308227539, + "learning_rate": 4.069271379790891e-05, + "loss": 0.1332, + "step": 1478 + }, + { + "epoch": 0.7122133269126587, + "grad_norm": 0.6485881209373474, + "learning_rate": 4.0567192679035636e-05, + "loss": 0.4729, + "step": 1479 + }, + { + "epoch": 0.7126948775055679, + "grad_norm": 0.5892083048820496, + "learning_rate": 4.044181617304048e-05, + "loss": 0.2822, + "step": 1480 + }, + { + "epoch": 0.7131764280984771, + "grad_norm": 0.5735858678817749, + "learning_rate": 4.03165845849916e-05, + "loss": 0.2439, + "step": 1481 + }, + { + "epoch": 0.7136579786913863, + "grad_norm": 0.5941423773765564, + "learning_rate": 4.019149821960455e-05, + "loss": 0.3502, + "step": 1482 + }, + { + "epoch": 0.7141395292842955, + "grad_norm": 1.1638046503067017, + "learning_rate": 4.006655738124152e-05, + "loss": 0.5438, + "step": 1483 + }, + { + "epoch": 0.7146210798772046, + "grad_norm": 0.4477090835571289, + "learning_rate": 3.9941762373910586e-05, + "loss": 0.3187, + "step": 1484 + }, + { + "epoch": 0.7151026304701138, + "grad_norm": 0.6522412896156311, + "learning_rate": 3.9817113501265016e-05, + "loss": 0.3332, + "step": 1485 + }, + { + "epoch": 0.7155841810630229, + "grad_norm": 0.8694091439247131, + "learning_rate": 3.9692611066602516e-05, + "loss": 0.5145, + "step": 1486 + }, + { + "epoch": 0.7160657316559321, + "grad_norm": 0.604214072227478, + "learning_rate": 3.956825537286436e-05, + "loss": 0.1848, + "step": 1487 + }, + { + "epoch": 0.7165472822488412, + "grad_norm": 0.5920756459236145, + "learning_rate": 3.944404672263494e-05, + "loss": 0.6276, + "step": 1488 + }, + { + "epoch": 0.7170288328417505, + "grad_norm": 0.5562595129013062, + "learning_rate": 3.931998541814069e-05, + "loss": 0.5277, + "step": 1489 + }, + { + "epoch": 0.7175103834346596, + "grad_norm": 0.7895872592926025, + "learning_rate": 3.919607176124966e-05, + "loss": 0.7741, + "step": 1490 + }, + { + "epoch": 0.7179919340275688, + "grad_norm": 0.38465070724487305, + "learning_rate": 3.9072306053470566e-05, + "loss": 0.3366, + "step": 1491 + }, + { + "epoch": 0.7184734846204779, + "grad_norm": 0.5632419586181641, + "learning_rate": 3.8948688595952164e-05, + "loss": 0.3859, + "step": 1492 + }, + { + "epoch": 0.7189550352133871, + "grad_norm": 1.333335041999817, + "learning_rate": 3.882521968948246e-05, + "loss": 0.8737, + "step": 1493 + }, + { + "epoch": 0.7194365858062963, + "grad_norm": 1.0718291997909546, + "learning_rate": 3.8701899634488014e-05, + "loss": 0.5208, + "step": 1494 + }, + { + "epoch": 0.7199181363992054, + "grad_norm": 0.3761247396469116, + "learning_rate": 3.857872873103322e-05, + "loss": 0.3304, + "step": 1495 + }, + { + "epoch": 0.7203996869921147, + "grad_norm": 0.9332675933837891, + "learning_rate": 3.8455707278819507e-05, + "loss": 0.2981, + "step": 1496 + }, + { + "epoch": 0.7208812375850238, + "grad_norm": 0.6915299892425537, + "learning_rate": 3.833283557718471e-05, + "loss": 0.2249, + "step": 1497 + }, + { + "epoch": 0.721362788177933, + "grad_norm": 1.1629884243011475, + "learning_rate": 3.821011392510228e-05, + "loss": 0.3798, + "step": 1498 + }, + { + "epoch": 0.7218443387708421, + "grad_norm": 0.3219270706176758, + "learning_rate": 3.808754262118046e-05, + "loss": 0.1493, + "step": 1499 + }, + { + "epoch": 0.7223258893637513, + "grad_norm": 0.566983699798584, + "learning_rate": 3.796512196366182e-05, + "loss": 0.6169, + "step": 1500 + }, + { + "epoch": 0.7228074399566604, + "grad_norm": 0.7285815477371216, + "learning_rate": 3.784285225042229e-05, + "loss": 0.746, + "step": 1501 + }, + { + "epoch": 0.7232889905495696, + "grad_norm": 2.9887161254882812, + "learning_rate": 3.772073377897052e-05, + "loss": 0.5847, + "step": 1502 + }, + { + "epoch": 0.7237705411424787, + "grad_norm": 1.7412497997283936, + "learning_rate": 3.7598766846447184e-05, + "loss": 0.2889, + "step": 1503 + }, + { + "epoch": 0.724252091735388, + "grad_norm": 0.7092990875244141, + "learning_rate": 3.747695174962423e-05, + "loss": 0.5911, + "step": 1504 + }, + { + "epoch": 0.7247336423282971, + "grad_norm": 1.5063307285308838, + "learning_rate": 3.7355288784904116e-05, + "loss": 0.5123, + "step": 1505 + }, + { + "epoch": 0.7252151929212063, + "grad_norm": 0.6491876244544983, + "learning_rate": 3.7233778248319176e-05, + "loss": 0.3482, + "step": 1506 + }, + { + "epoch": 0.7256967435141155, + "grad_norm": 0.29861709475517273, + "learning_rate": 3.7112420435530845e-05, + "loss": 0.3769, + "step": 1507 + }, + { + "epoch": 0.7261782941070246, + "grad_norm": 0.8814902901649475, + "learning_rate": 3.69912156418289e-05, + "loss": 0.1484, + "step": 1508 + }, + { + "epoch": 0.7266598446999338, + "grad_norm": 0.3816862106323242, + "learning_rate": 3.687016416213084e-05, + "loss": 0.295, + "step": 1509 + }, + { + "epoch": 0.7271413952928429, + "grad_norm": 0.3857170045375824, + "learning_rate": 3.674926629098113e-05, + "loss": 0.154, + "step": 1510 + }, + { + "epoch": 0.7276229458857522, + "grad_norm": 0.43591180443763733, + "learning_rate": 3.6628522322550394e-05, + "loss": 0.2302, + "step": 1511 + }, + { + "epoch": 0.7281044964786613, + "grad_norm": 0.8347920775413513, + "learning_rate": 3.6507932550634846e-05, + "loss": 0.5597, + "step": 1512 + }, + { + "epoch": 0.7285860470715705, + "grad_norm": 0.5086283683776855, + "learning_rate": 3.638749726865552e-05, + "loss": 0.2547, + "step": 1513 + }, + { + "epoch": 0.7290675976644796, + "grad_norm": 0.660691499710083, + "learning_rate": 3.6267216769657485e-05, + "loss": 0.4473, + "step": 1514 + }, + { + "epoch": 0.7295491482573888, + "grad_norm": 0.8285390138626099, + "learning_rate": 3.6147091346309224e-05, + "loss": 0.5904, + "step": 1515 + }, + { + "epoch": 0.7300306988502979, + "grad_norm": 0.541466236114502, + "learning_rate": 3.602712129090189e-05, + "loss": 0.1929, + "step": 1516 + }, + { + "epoch": 0.7305122494432071, + "grad_norm": 0.3030177652835846, + "learning_rate": 3.590730689534857e-05, + "loss": 0.2024, + "step": 1517 + }, + { + "epoch": 0.7309938000361162, + "grad_norm": 0.6796524524688721, + "learning_rate": 3.578764845118362e-05, + "loss": 0.3566, + "step": 1518 + }, + { + "epoch": 0.7314753506290255, + "grad_norm": 0.5066099762916565, + "learning_rate": 3.566814624956194e-05, + "loss": 0.3388, + "step": 1519 + }, + { + "epoch": 0.7319569012219346, + "grad_norm": 0.4785514771938324, + "learning_rate": 3.554880058125819e-05, + "loss": 0.351, + "step": 1520 + }, + { + "epoch": 0.7324384518148438, + "grad_norm": 1.521462321281433, + "learning_rate": 3.5429611736666235e-05, + "loss": 0.4318, + "step": 1521 + }, + { + "epoch": 0.732920002407753, + "grad_norm": 0.4550098180770874, + "learning_rate": 3.53105800057983e-05, + "loss": 0.196, + "step": 1522 + }, + { + "epoch": 0.7334015530006621, + "grad_norm": 0.36239540576934814, + "learning_rate": 3.519170567828435e-05, + "loss": 0.3128, + "step": 1523 + }, + { + "epoch": 0.7338831035935713, + "grad_norm": 0.8800835609436035, + "learning_rate": 3.507298904337134e-05, + "loss": 0.6886, + "step": 1524 + }, + { + "epoch": 0.7343646541864804, + "grad_norm": 0.6008750796318054, + "learning_rate": 3.495443038992253e-05, + "loss": 0.2997, + "step": 1525 + }, + { + "epoch": 0.7348462047793897, + "grad_norm": 0.6403049230575562, + "learning_rate": 3.4836030006416775e-05, + "loss": 0.3154, + "step": 1526 + }, + { + "epoch": 0.7353277553722988, + "grad_norm": 0.7455939650535583, + "learning_rate": 3.471778818094785e-05, + "loss": 0.5283, + "step": 1527 + }, + { + "epoch": 0.735809305965208, + "grad_norm": 0.8892818689346313, + "learning_rate": 3.459970520122364e-05, + "loss": 0.2674, + "step": 1528 + }, + { + "epoch": 0.7362908565581171, + "grad_norm": 0.34111905097961426, + "learning_rate": 3.44817813545656e-05, + "loss": 0.4165, + "step": 1529 + }, + { + "epoch": 0.7367724071510263, + "grad_norm": 1.0294443368911743, + "learning_rate": 3.4364016927907974e-05, + "loss": 0.3695, + "step": 1530 + }, + { + "epoch": 0.7372539577439354, + "grad_norm": 0.9741241931915283, + "learning_rate": 3.424641220779711e-05, + "loss": 0.7115, + "step": 1531 + }, + { + "epoch": 0.7377355083368446, + "grad_norm": 1.782731533050537, + "learning_rate": 3.412896748039067e-05, + "loss": 0.4554, + "step": 1532 + }, + { + "epoch": 0.7382170589297538, + "grad_norm": 0.3588056266307831, + "learning_rate": 3.401168303145713e-05, + "loss": 0.3425, + "step": 1533 + }, + { + "epoch": 0.738698609522663, + "grad_norm": 0.6079331636428833, + "learning_rate": 3.3894559146374924e-05, + "loss": 0.3928, + "step": 1534 + }, + { + "epoch": 0.7391801601155722, + "grad_norm": 0.8715238571166992, + "learning_rate": 3.37775961101318e-05, + "loss": 0.5777, + "step": 1535 + }, + { + "epoch": 0.7396617107084813, + "grad_norm": 0.9030262231826782, + "learning_rate": 3.366079420732413e-05, + "loss": 0.2604, + "step": 1536 + }, + { + "epoch": 0.7401432613013905, + "grad_norm": 0.8700314164161682, + "learning_rate": 3.3544153722156216e-05, + "loss": 0.3672, + "step": 1537 + }, + { + "epoch": 0.7406248118942996, + "grad_norm": 0.3231861889362335, + "learning_rate": 3.3427674938439594e-05, + "loss": 0.4551, + "step": 1538 + }, + { + "epoch": 0.7411063624872088, + "grad_norm": 0.4491250216960907, + "learning_rate": 3.3311358139592317e-05, + "loss": 0.2578, + "step": 1539 + }, + { + "epoch": 0.741587913080118, + "grad_norm": 0.4604054093360901, + "learning_rate": 3.319520360863837e-05, + "loss": 0.3493, + "step": 1540 + }, + { + "epoch": 0.7420694636730272, + "grad_norm": 0.80861496925354, + "learning_rate": 3.3079211628206854e-05, + "loss": 0.3235, + "step": 1541 + }, + { + "epoch": 0.7425510142659363, + "grad_norm": 0.43499043583869934, + "learning_rate": 3.296338248053129e-05, + "loss": 0.3353, + "step": 1542 + }, + { + "epoch": 0.7430325648588455, + "grad_norm": 0.6633376479148865, + "learning_rate": 3.2847716447449096e-05, + "loss": 0.6597, + "step": 1543 + }, + { + "epoch": 0.7435141154517546, + "grad_norm": 0.5152623057365417, + "learning_rate": 3.2732213810400745e-05, + "loss": 0.3735, + "step": 1544 + }, + { + "epoch": 0.7439956660446638, + "grad_norm": 0.5857446193695068, + "learning_rate": 3.261687485042915e-05, + "loss": 0.4942, + "step": 1545 + }, + { + "epoch": 0.7444772166375729, + "grad_norm": 0.7267798781394958, + "learning_rate": 3.250169984817897e-05, + "loss": 0.3593, + "step": 1546 + }, + { + "epoch": 0.7449587672304822, + "grad_norm": 0.8395870327949524, + "learning_rate": 3.238668908389586e-05, + "loss": 0.7682, + "step": 1547 + }, + { + "epoch": 0.7454403178233914, + "grad_norm": 1.0239869356155396, + "learning_rate": 3.227184283742591e-05, + "loss": 0.4636, + "step": 1548 + }, + { + "epoch": 0.7459218684163005, + "grad_norm": 0.7630646228790283, + "learning_rate": 3.215716138821488e-05, + "loss": 0.3733, + "step": 1549 + }, + { + "epoch": 0.7464034190092097, + "grad_norm": 0.7075035572052002, + "learning_rate": 3.204264501530756e-05, + "loss": 0.3556, + "step": 1550 + }, + { + "epoch": 0.7468849696021188, + "grad_norm": 1.2842384576797485, + "learning_rate": 3.192829399734706e-05, + "loss": 0.6017, + "step": 1551 + }, + { + "epoch": 0.747366520195028, + "grad_norm": 0.4235416054725647, + "learning_rate": 3.181410861257413e-05, + "loss": 0.6632, + "step": 1552 + }, + { + "epoch": 0.7478480707879371, + "grad_norm": 1.266859531402588, + "learning_rate": 3.170008913882656e-05, + "loss": 0.5082, + "step": 1553 + }, + { + "epoch": 0.7483296213808464, + "grad_norm": 0.3138764798641205, + "learning_rate": 3.1586235853538325e-05, + "loss": 0.4967, + "step": 1554 + }, + { + "epoch": 0.7488111719737555, + "grad_norm": 0.8375988006591797, + "learning_rate": 3.1472549033739126e-05, + "loss": 0.294, + "step": 1555 + }, + { + "epoch": 0.7492927225666647, + "grad_norm": 0.364234983921051, + "learning_rate": 3.1359028956053615e-05, + "loss": 0.203, + "step": 1556 + }, + { + "epoch": 0.7497742731595738, + "grad_norm": 0.9482399821281433, + "learning_rate": 3.1245675896700685e-05, + "loss": 0.8518, + "step": 1557 + }, + { + "epoch": 0.750255823752483, + "grad_norm": 0.49903255701065063, + "learning_rate": 3.113249013149284e-05, + "loss": 0.2111, + "step": 1558 + }, + { + "epoch": 0.7507373743453921, + "grad_norm": 1.1432161331176758, + "learning_rate": 3.101947193583557e-05, + "loss": 0.495, + "step": 1559 + }, + { + "epoch": 0.7512189249383013, + "grad_norm": 0.5520155429840088, + "learning_rate": 3.0906621584726546e-05, + "loss": 0.7413, + "step": 1560 + }, + { + "epoch": 0.7517004755312104, + "grad_norm": 1.4275282621383667, + "learning_rate": 3.079393935275513e-05, + "loss": 0.4253, + "step": 1561 + }, + { + "epoch": 0.7521820261241197, + "grad_norm": 0.9763407111167908, + "learning_rate": 3.068142551410155e-05, + "loss": 0.3456, + "step": 1562 + }, + { + "epoch": 0.7526635767170289, + "grad_norm": 0.4486462473869324, + "learning_rate": 3.0569080342536347e-05, + "loss": 0.352, + "step": 1563 + }, + { + "epoch": 0.753145127309938, + "grad_norm": 0.6054459810256958, + "learning_rate": 3.0456904111419572e-05, + "loss": 0.7228, + "step": 1564 + }, + { + "epoch": 0.7536266779028472, + "grad_norm": 0.3461880087852478, + "learning_rate": 3.034489709370033e-05, + "loss": 0.5269, + "step": 1565 + }, + { + "epoch": 0.7541082284957563, + "grad_norm": 0.7280091047286987, + "learning_rate": 3.0233059561915855e-05, + "loss": 0.4963, + "step": 1566 + }, + { + "epoch": 0.7545897790886655, + "grad_norm": 1.4291431903839111, + "learning_rate": 3.01213917881911e-05, + "loss": 0.8309, + "step": 1567 + }, + { + "epoch": 0.7550713296815746, + "grad_norm": 0.4737418591976166, + "learning_rate": 3.0009894044237907e-05, + "loss": 0.4143, + "step": 1568 + }, + { + "epoch": 0.7555528802744839, + "grad_norm": 0.7274969220161438, + "learning_rate": 2.9898566601354418e-05, + "loss": 0.5881, + "step": 1569 + }, + { + "epoch": 0.756034430867393, + "grad_norm": 0.6631147861480713, + "learning_rate": 2.9787409730424374e-05, + "loss": 0.3593, + "step": 1570 + }, + { + "epoch": 0.7565159814603022, + "grad_norm": 1.1353744268417358, + "learning_rate": 2.96764237019165e-05, + "loss": 0.5946, + "step": 1571 + }, + { + "epoch": 0.7569975320532113, + "grad_norm": 0.8798652291297913, + "learning_rate": 2.9565608785883815e-05, + "loss": 0.6565, + "step": 1572 + }, + { + "epoch": 0.7574790826461205, + "grad_norm": 1.0067402124404907, + "learning_rate": 2.9454965251962973e-05, + "loss": 0.8568, + "step": 1573 + }, + { + "epoch": 0.7579606332390296, + "grad_norm": 0.5103989839553833, + "learning_rate": 2.9344493369373637e-05, + "loss": 0.3789, + "step": 1574 + }, + { + "epoch": 0.7584421838319388, + "grad_norm": 1.0349833965301514, + "learning_rate": 2.9234193406917833e-05, + "loss": 0.509, + "step": 1575 + }, + { + "epoch": 0.7589237344248481, + "grad_norm": 1.722801685333252, + "learning_rate": 2.912406563297916e-05, + "loss": 0.5278, + "step": 1576 + }, + { + "epoch": 0.7594052850177572, + "grad_norm": 0.5121159553527832, + "learning_rate": 2.901411031552236e-05, + "loss": 0.2526, + "step": 1577 + }, + { + "epoch": 0.7598868356106664, + "grad_norm": 1.0947937965393066, + "learning_rate": 2.8904327722092495e-05, + "loss": 0.6184, + "step": 1578 + }, + { + "epoch": 0.7603683862035755, + "grad_norm": 0.9110963344573975, + "learning_rate": 2.879471811981437e-05, + "loss": 0.6719, + "step": 1579 + }, + { + "epoch": 0.7608499367964847, + "grad_norm": 1.661855936050415, + "learning_rate": 2.868528177539187e-05, + "loss": 0.7398, + "step": 1580 + }, + { + "epoch": 0.7613314873893938, + "grad_norm": 0.9211600422859192, + "learning_rate": 2.8576018955107285e-05, + "loss": 0.5384, + "step": 1581 + }, + { + "epoch": 0.761813037982303, + "grad_norm": 0.7545797824859619, + "learning_rate": 2.8466929924820705e-05, + "loss": 0.7929, + "step": 1582 + }, + { + "epoch": 0.7622945885752122, + "grad_norm": 0.6031236052513123, + "learning_rate": 2.8358014949969334e-05, + "loss": 0.5341, + "step": 1583 + }, + { + "epoch": 0.7627761391681214, + "grad_norm": 0.5444250106811523, + "learning_rate": 2.8249274295566864e-05, + "loss": 0.2411, + "step": 1584 + }, + { + "epoch": 0.7632576897610305, + "grad_norm": 0.605685830116272, + "learning_rate": 2.8140708226202884e-05, + "loss": 0.4822, + "step": 1585 + }, + { + "epoch": 0.7637392403539397, + "grad_norm": 0.38985756039619446, + "learning_rate": 2.803231700604204e-05, + "loss": 0.2867, + "step": 1586 + }, + { + "epoch": 0.7642207909468488, + "grad_norm": 0.9031787514686584, + "learning_rate": 2.7924100898823702e-05, + "loss": 0.4912, + "step": 1587 + }, + { + "epoch": 0.764702341539758, + "grad_norm": 0.3621733784675598, + "learning_rate": 2.7816060167861002e-05, + "loss": 0.532, + "step": 1588 + }, + { + "epoch": 0.7651838921326672, + "grad_norm": 0.5640154480934143, + "learning_rate": 2.7708195076040445e-05, + "loss": 0.235, + "step": 1589 + }, + { + "epoch": 0.7656654427255764, + "grad_norm": 0.5240148305892944, + "learning_rate": 2.760050588582114e-05, + "loss": 0.6267, + "step": 1590 + }, + { + "epoch": 0.7661469933184856, + "grad_norm": 0.7257108092308044, + "learning_rate": 2.749299285923417e-05, + "loss": 0.4834, + "step": 1591 + }, + { + "epoch": 0.7666285439113947, + "grad_norm": 0.5790515542030334, + "learning_rate": 2.7385656257881997e-05, + "loss": 0.4201, + "step": 1592 + }, + { + "epoch": 0.7671100945043039, + "grad_norm": 0.12647365033626556, + "learning_rate": 2.7278496342937788e-05, + "loss": 0.0339, + "step": 1593 + }, + { + "epoch": 0.767591645097213, + "grad_norm": 0.8625386953353882, + "learning_rate": 2.717151337514482e-05, + "loss": 0.4835, + "step": 1594 + }, + { + "epoch": 0.7680731956901222, + "grad_norm": 0.5701541304588318, + "learning_rate": 2.7064707614815776e-05, + "loss": 0.4525, + "step": 1595 + }, + { + "epoch": 0.7685547462830313, + "grad_norm": 0.5805161595344543, + "learning_rate": 2.6958079321832185e-05, + "loss": 0.2327, + "step": 1596 + }, + { + "epoch": 0.7690362968759406, + "grad_norm": 1.1971231698989868, + "learning_rate": 2.6851628755643776e-05, + "loss": 0.2861, + "step": 1597 + }, + { + "epoch": 0.7695178474688497, + "grad_norm": 0.5243621468544006, + "learning_rate": 2.6745356175267765e-05, + "loss": 0.3722, + "step": 1598 + }, + { + "epoch": 0.7699993980617589, + "grad_norm": 0.7715356945991516, + "learning_rate": 2.6639261839288343e-05, + "loss": 0.4157, + "step": 1599 + }, + { + "epoch": 0.770480948654668, + "grad_norm": 0.6639595031738281, + "learning_rate": 2.6533346005855987e-05, + "loss": 0.7268, + "step": 1600 + }, + { + "epoch": 0.7709624992475772, + "grad_norm": 0.8616638779640198, + "learning_rate": 2.6427608932686843e-05, + "loss": 0.3145, + "step": 1601 + }, + { + "epoch": 0.7714440498404863, + "grad_norm": 0.9900946617126465, + "learning_rate": 2.6322050877062064e-05, + "loss": 0.5062, + "step": 1602 + }, + { + "epoch": 0.7719256004333955, + "grad_norm": 0.5049934983253479, + "learning_rate": 2.6216672095827266e-05, + "loss": 0.5096, + "step": 1603 + }, + { + "epoch": 0.7724071510263048, + "grad_norm": 0.5385169982910156, + "learning_rate": 2.6111472845391827e-05, + "loss": 0.4366, + "step": 1604 + }, + { + "epoch": 0.7728887016192139, + "grad_norm": 0.2244795709848404, + "learning_rate": 2.6006453381728236e-05, + "loss": 0.1752, + "step": 1605 + }, + { + "epoch": 0.7733702522121231, + "grad_norm": 0.7594806551933289, + "learning_rate": 2.5901613960371585e-05, + "loss": 0.3422, + "step": 1606 + }, + { + "epoch": 0.7738518028050322, + "grad_norm": 1.265409231185913, + "learning_rate": 2.5796954836418884e-05, + "loss": 0.5028, + "step": 1607 + }, + { + "epoch": 0.7743333533979414, + "grad_norm": 1.2681912183761597, + "learning_rate": 2.569247626452842e-05, + "loss": 0.7162, + "step": 1608 + }, + { + "epoch": 0.7748149039908505, + "grad_norm": 1.189521312713623, + "learning_rate": 2.558817849891918e-05, + "loss": 0.4165, + "step": 1609 + }, + { + "epoch": 0.7752964545837597, + "grad_norm": 0.6538302302360535, + "learning_rate": 2.548406179337015e-05, + "loss": 0.3612, + "step": 1610 + }, + { + "epoch": 0.7757780051766688, + "grad_norm": 0.5399265289306641, + "learning_rate": 2.5380126401219807e-05, + "loss": 0.267, + "step": 1611 + }, + { + "epoch": 0.7762595557695781, + "grad_norm": 0.48032206296920776, + "learning_rate": 2.527637257536547e-05, + "loss": 0.2217, + "step": 1612 + }, + { + "epoch": 0.7767411063624872, + "grad_norm": 0.9604761600494385, + "learning_rate": 2.517280056826262e-05, + "loss": 0.4565, + "step": 1613 + }, + { + "epoch": 0.7772226569553964, + "grad_norm": 0.5232987999916077, + "learning_rate": 2.5069410631924385e-05, + "loss": 0.4922, + "step": 1614 + }, + { + "epoch": 0.7777042075483055, + "grad_norm": 0.5674952864646912, + "learning_rate": 2.4966203017920818e-05, + "loss": 0.2663, + "step": 1615 + }, + { + "epoch": 0.7781857581412147, + "grad_norm": 0.4071466326713562, + "learning_rate": 2.4863177977378392e-05, + "loss": 0.5091, + "step": 1616 + }, + { + "epoch": 0.7786673087341239, + "grad_norm": 0.31083059310913086, + "learning_rate": 2.4760335760979312e-05, + "loss": 0.4946, + "step": 1617 + }, + { + "epoch": 0.779148859327033, + "grad_norm": 1.048677682876587, + "learning_rate": 2.4657676618960944e-05, + "loss": 0.2767, + "step": 1618 + }, + { + "epoch": 0.7796304099199423, + "grad_norm": 0.5618645548820496, + "learning_rate": 2.455520080111522e-05, + "loss": 0.6579, + "step": 1619 + }, + { + "epoch": 0.7801119605128514, + "grad_norm": 1.3165363073349, + "learning_rate": 2.4452908556787912e-05, + "loss": 0.5388, + "step": 1620 + }, + { + "epoch": 0.7805935111057606, + "grad_norm": 0.5408737063407898, + "learning_rate": 2.4350800134878203e-05, + "loss": 0.4052, + "step": 1621 + }, + { + "epoch": 0.7810750616986697, + "grad_norm": 0.5187255144119263, + "learning_rate": 2.4248875783837987e-05, + "loss": 0.107, + "step": 1622 + }, + { + "epoch": 0.7815566122915789, + "grad_norm": 0.656693160533905, + "learning_rate": 2.414713575167129e-05, + "loss": 0.1242, + "step": 1623 + }, + { + "epoch": 0.782038162884488, + "grad_norm": 0.49979543685913086, + "learning_rate": 2.4045580285933557e-05, + "loss": 0.4696, + "step": 1624 + }, + { + "epoch": 0.7825197134773972, + "grad_norm": 1.2475125789642334, + "learning_rate": 2.3944209633731242e-05, + "loss": 0.5415, + "step": 1625 + }, + { + "epoch": 0.7830012640703063, + "grad_norm": 0.681904137134552, + "learning_rate": 2.3843024041721053e-05, + "loss": 0.2777, + "step": 1626 + }, + { + "epoch": 0.7834828146632156, + "grad_norm": 0.9484771490097046, + "learning_rate": 2.3742023756109456e-05, + "loss": 0.5585, + "step": 1627 + }, + { + "epoch": 0.7839643652561247, + "grad_norm": 1.0727744102478027, + "learning_rate": 2.3641209022651976e-05, + "loss": 0.5435, + "step": 1628 + }, + { + "epoch": 0.7844459158490339, + "grad_norm": 0.5711432099342346, + "learning_rate": 2.3540580086652675e-05, + "loss": 0.336, + "step": 1629 + }, + { + "epoch": 0.7849274664419431, + "grad_norm": 0.8557513952255249, + "learning_rate": 2.344013719296353e-05, + "loss": 0.3167, + "step": 1630 + }, + { + "epoch": 0.7854090170348522, + "grad_norm": 0.7779350280761719, + "learning_rate": 2.3339880585983842e-05, + "loss": 0.4713, + "step": 1631 + }, + { + "epoch": 0.7858905676277614, + "grad_norm": 0.38228800892829895, + "learning_rate": 2.3239810509659597e-05, + "loss": 0.5798, + "step": 1632 + }, + { + "epoch": 0.7863721182206705, + "grad_norm": 0.6690656542778015, + "learning_rate": 2.313992720748295e-05, + "loss": 0.412, + "step": 1633 + }, + { + "epoch": 0.7868536688135798, + "grad_norm": 0.7614961862564087, + "learning_rate": 2.304023092249159e-05, + "loss": 0.5188, + "step": 1634 + }, + { + "epoch": 0.7873352194064889, + "grad_norm": 1.6270629167556763, + "learning_rate": 2.2940721897268136e-05, + "loss": 0.7047, + "step": 1635 + }, + { + "epoch": 0.7878167699993981, + "grad_norm": 1.626470685005188, + "learning_rate": 2.2841400373939592e-05, + "loss": 0.5219, + "step": 1636 + }, + { + "epoch": 0.7882983205923072, + "grad_norm": 1.0363044738769531, + "learning_rate": 2.274226659417671e-05, + "loss": 0.8806, + "step": 1637 + }, + { + "epoch": 0.7887798711852164, + "grad_norm": 0.7885532975196838, + "learning_rate": 2.2643320799193402e-05, + "loss": 0.1965, + "step": 1638 + }, + { + "epoch": 0.7892614217781255, + "grad_norm": 0.5819542407989502, + "learning_rate": 2.2544563229746218e-05, + "loss": 0.5719, + "step": 1639 + }, + { + "epoch": 0.7897429723710347, + "grad_norm": 0.9245462417602539, + "learning_rate": 2.2445994126133708e-05, + "loss": 0.9072, + "step": 1640 + }, + { + "epoch": 0.7902245229639439, + "grad_norm": 1.100356936454773, + "learning_rate": 2.234761372819577e-05, + "loss": 0.347, + "step": 1641 + }, + { + "epoch": 0.7907060735568531, + "grad_norm": 0.5505186319351196, + "learning_rate": 2.2249422275313214e-05, + "loss": 0.354, + "step": 1642 + }, + { + "epoch": 0.7911876241497622, + "grad_norm": 0.6048828959465027, + "learning_rate": 2.215142000640714e-05, + "loss": 0.2226, + "step": 1643 + }, + { + "epoch": 0.7916691747426714, + "grad_norm": 0.8607200384140015, + "learning_rate": 2.2053607159938195e-05, + "loss": 0.5365, + "step": 1644 + }, + { + "epoch": 0.7921507253355806, + "grad_norm": 0.9155515432357788, + "learning_rate": 2.1955983973906236e-05, + "loss": 0.3519, + "step": 1645 + }, + { + "epoch": 0.7926322759284897, + "grad_norm": 1.0758610963821411, + "learning_rate": 2.1858550685849578e-05, + "loss": 0.7433, + "step": 1646 + }, + { + "epoch": 0.793113826521399, + "grad_norm": 0.6789677143096924, + "learning_rate": 2.17613075328445e-05, + "loss": 0.6452, + "step": 1647 + }, + { + "epoch": 0.793595377114308, + "grad_norm": 0.5732631087303162, + "learning_rate": 2.1664254751504642e-05, + "loss": 0.3202, + "step": 1648 + }, + { + "epoch": 0.7940769277072173, + "grad_norm": 0.21045275032520294, + "learning_rate": 2.1567392577980393e-05, + "loss": 0.1431, + "step": 1649 + }, + { + "epoch": 0.7945584783001264, + "grad_norm": 2.5806925296783447, + "learning_rate": 2.1470721247958404e-05, + "loss": 1.0845, + "step": 1650 + }, + { + "epoch": 0.7950400288930356, + "grad_norm": 1.121893048286438, + "learning_rate": 2.137424099666091e-05, + "loss": 0.5774, + "step": 1651 + }, + { + "epoch": 0.7955215794859447, + "grad_norm": 0.4453411102294922, + "learning_rate": 2.1277952058845284e-05, + "loss": 0.3872, + "step": 1652 + }, + { + "epoch": 0.7960031300788539, + "grad_norm": 0.30813685059547424, + "learning_rate": 2.118185466880327e-05, + "loss": 0.5091, + "step": 1653 + }, + { + "epoch": 0.796484680671763, + "grad_norm": 0.4971233010292053, + "learning_rate": 2.1085949060360654e-05, + "loss": 0.6345, + "step": 1654 + }, + { + "epoch": 0.7969662312646723, + "grad_norm": 0.992877185344696, + "learning_rate": 2.0990235466876517e-05, + "loss": 0.5261, + "step": 1655 + }, + { + "epoch": 0.7974477818575814, + "grad_norm": 0.7967448234558105, + "learning_rate": 2.089471412124274e-05, + "loss": 0.4042, + "step": 1656 + }, + { + "epoch": 0.7979293324504906, + "grad_norm": 0.3339238464832306, + "learning_rate": 2.079938525588342e-05, + "loss": 0.3408, + "step": 1657 + }, + { + "epoch": 0.7984108830433998, + "grad_norm": 0.8930954933166504, + "learning_rate": 2.0704249102754324e-05, + "loss": 0.6303, + "step": 1658 + }, + { + "epoch": 0.7988924336363089, + "grad_norm": 0.4596443474292755, + "learning_rate": 2.0609305893342278e-05, + "loss": 0.3944, + "step": 1659 + }, + { + "epoch": 0.7993739842292181, + "grad_norm": 0.9429139494895935, + "learning_rate": 2.0514555858664663e-05, + "loss": 0.68, + "step": 1660 + }, + { + "epoch": 0.7998555348221272, + "grad_norm": 0.6751180291175842, + "learning_rate": 2.0419999229268805e-05, + "loss": 0.3599, + "step": 1661 + }, + { + "epoch": 0.8003370854150365, + "grad_norm": 0.8911116719245911, + "learning_rate": 2.032563623523147e-05, + "loss": 0.7117, + "step": 1662 + }, + { + "epoch": 0.8008186360079456, + "grad_norm": 0.26947221159935, + "learning_rate": 2.0231467106158186e-05, + "loss": 0.1362, + "step": 1663 + }, + { + "epoch": 0.8013001866008548, + "grad_norm": 0.5611744523048401, + "learning_rate": 2.0137492071182863e-05, + "loss": 0.3276, + "step": 1664 + }, + { + "epoch": 0.8017817371937639, + "grad_norm": 1.128632664680481, + "learning_rate": 2.0043711358967043e-05, + "loss": 1.097, + "step": 1665 + }, + { + "epoch": 0.8022632877866731, + "grad_norm": 0.772879421710968, + "learning_rate": 1.9950125197699508e-05, + "loss": 0.4209, + "step": 1666 + }, + { + "epoch": 0.8027448383795822, + "grad_norm": 0.915276050567627, + "learning_rate": 1.985673381509565e-05, + "loss": 0.322, + "step": 1667 + }, + { + "epoch": 0.8032263889724914, + "grad_norm": 0.5723214745521545, + "learning_rate": 1.9763537438396894e-05, + "loss": 0.4794, + "step": 1668 + }, + { + "epoch": 0.8037079395654005, + "grad_norm": 0.3487998843193054, + "learning_rate": 1.96705362943702e-05, + "loss": 0.4236, + "step": 1669 + }, + { + "epoch": 0.8041894901583098, + "grad_norm": 0.5374249219894409, + "learning_rate": 1.9577730609307454e-05, + "loss": 0.3196, + "step": 1670 + }, + { + "epoch": 0.804671040751219, + "grad_norm": 0.5426716208457947, + "learning_rate": 1.9485120609024975e-05, + "loss": 0.2474, + "step": 1671 + }, + { + "epoch": 0.8051525913441281, + "grad_norm": 1.0175453424453735, + "learning_rate": 1.9392706518862935e-05, + "loss": 0.6101, + "step": 1672 + }, + { + "epoch": 0.8056341419370373, + "grad_norm": 0.348254919052124, + "learning_rate": 1.9300488563684804e-05, + "loss": 0.1878, + "step": 1673 + }, + { + "epoch": 0.8061156925299464, + "grad_norm": 0.4428093731403351, + "learning_rate": 1.920846696787684e-05, + "loss": 0.6291, + "step": 1674 + }, + { + "epoch": 0.8065972431228556, + "grad_norm": 0.5906351208686829, + "learning_rate": 1.9116641955347446e-05, + "loss": 0.3328, + "step": 1675 + }, + { + "epoch": 0.8070787937157647, + "grad_norm": 0.3822533190250397, + "learning_rate": 1.9025013749526767e-05, + "loss": 0.2441, + "step": 1676 + }, + { + "epoch": 0.807560344308674, + "grad_norm": 0.6170566082000732, + "learning_rate": 1.8933582573366036e-05, + "loss": 0.5727, + "step": 1677 + }, + { + "epoch": 0.8080418949015831, + "grad_norm": 1.0449187755584717, + "learning_rate": 1.8842348649337116e-05, + "loss": 0.6781, + "step": 1678 + }, + { + "epoch": 0.8085234454944923, + "grad_norm": 2.128098964691162, + "learning_rate": 1.875131219943187e-05, + "loss": 0.7506, + "step": 1679 + }, + { + "epoch": 0.8090049960874014, + "grad_norm": 1.1354010105133057, + "learning_rate": 1.8660473445161663e-05, + "loss": 0.4331, + "step": 1680 + }, + { + "epoch": 0.8094865466803106, + "grad_norm": 0.49953970313072205, + "learning_rate": 1.856983260755686e-05, + "loss": 0.5888, + "step": 1681 + }, + { + "epoch": 0.8099680972732197, + "grad_norm": 1.0825657844543457, + "learning_rate": 1.8479389907166223e-05, + "loss": 0.3173, + "step": 1682 + }, + { + "epoch": 0.8104496478661289, + "grad_norm": 0.9465007781982422, + "learning_rate": 1.8389145564056387e-05, + "loss": 0.3769, + "step": 1683 + }, + { + "epoch": 0.810931198459038, + "grad_norm": 0.3221222460269928, + "learning_rate": 1.829909979781137e-05, + "loss": 0.1618, + "step": 1684 + }, + { + "epoch": 0.8114127490519473, + "grad_norm": 0.5152098536491394, + "learning_rate": 1.820925282753201e-05, + "loss": 0.4083, + "step": 1685 + }, + { + "epoch": 0.8118942996448565, + "grad_norm": 0.5000860095024109, + "learning_rate": 1.8119604871835437e-05, + "loss": 0.4264, + "step": 1686 + }, + { + "epoch": 0.8123758502377656, + "grad_norm": 0.6200466752052307, + "learning_rate": 1.8030156148854492e-05, + "loss": 0.5876, + "step": 1687 + }, + { + "epoch": 0.8128574008306748, + "grad_norm": 0.4836140275001526, + "learning_rate": 1.7940906876237284e-05, + "loss": 0.1737, + "step": 1688 + }, + { + "epoch": 0.8133389514235839, + "grad_norm": 0.7143861651420593, + "learning_rate": 1.78518572711466e-05, + "loss": 0.4566, + "step": 1689 + }, + { + "epoch": 0.8138205020164931, + "grad_norm": 0.617621898651123, + "learning_rate": 1.776300755025939e-05, + "loss": 0.7797, + "step": 1690 + }, + { + "epoch": 0.8143020526094022, + "grad_norm": 0.5795495510101318, + "learning_rate": 1.767435792976626e-05, + "loss": 0.5482, + "step": 1691 + }, + { + "epoch": 0.8147836032023115, + "grad_norm": 0.4850772023200989, + "learning_rate": 1.7585908625370905e-05, + "loss": 0.4406, + "step": 1692 + }, + { + "epoch": 0.8152651537952206, + "grad_norm": 0.6762451529502869, + "learning_rate": 1.749765985228963e-05, + "loss": 0.7253, + "step": 1693 + }, + { + "epoch": 0.8157467043881298, + "grad_norm": 0.44287633895874023, + "learning_rate": 1.740961182525077e-05, + "loss": 0.2367, + "step": 1694 + }, + { + "epoch": 0.8162282549810389, + "grad_norm": 0.7483050227165222, + "learning_rate": 1.7321764758494252e-05, + "loss": 0.7309, + "step": 1695 + }, + { + "epoch": 0.8167098055739481, + "grad_norm": 0.7007144093513489, + "learning_rate": 1.7234118865770987e-05, + "loss": 0.6392, + "step": 1696 + }, + { + "epoch": 0.8171913561668572, + "grad_norm": 1.0513980388641357, + "learning_rate": 1.7146674360342373e-05, + "loss": 0.5053, + "step": 1697 + }, + { + "epoch": 0.8176729067597664, + "grad_norm": 0.5712372064590454, + "learning_rate": 1.7059431454979824e-05, + "loss": 0.6333, + "step": 1698 + }, + { + "epoch": 0.8181544573526757, + "grad_norm": 1.5801523923873901, + "learning_rate": 1.6972390361964195e-05, + "loss": 0.9628, + "step": 1699 + }, + { + "epoch": 0.8186360079455848, + "grad_norm": 0.40049970149993896, + "learning_rate": 1.688555129308531e-05, + "loss": 0.3031, + "step": 1700 + }, + { + "epoch": 0.819117558538494, + "grad_norm": 0.9495404362678528, + "learning_rate": 1.6798914459641434e-05, + "loss": 0.4211, + "step": 1701 + }, + { + "epoch": 0.8195991091314031, + "grad_norm": 0.5109540224075317, + "learning_rate": 1.6712480072438662e-05, + "loss": 0.6788, + "step": 1702 + }, + { + "epoch": 0.8200806597243123, + "grad_norm": 1.5771692991256714, + "learning_rate": 1.6626248341790596e-05, + "loss": 0.7708, + "step": 1703 + }, + { + "epoch": 0.8205622103172214, + "grad_norm": 0.5159006118774414, + "learning_rate": 1.6540219477517684e-05, + "loss": 0.1863, + "step": 1704 + }, + { + "epoch": 0.8210437609101306, + "grad_norm": 0.773666501045227, + "learning_rate": 1.6454393688946767e-05, + "loss": 0.4441, + "step": 1705 + }, + { + "epoch": 0.8215253115030398, + "grad_norm": 0.5018094778060913, + "learning_rate": 1.6368771184910557e-05, + "loss": 0.3367, + "step": 1706 + }, + { + "epoch": 0.822006862095949, + "grad_norm": 0.8685817122459412, + "learning_rate": 1.6283352173747145e-05, + "loss": 0.3977, + "step": 1707 + }, + { + "epoch": 0.8224884126888581, + "grad_norm": 0.6278987526893616, + "learning_rate": 1.619813686329946e-05, + "loss": 0.744, + "step": 1708 + }, + { + "epoch": 0.8229699632817673, + "grad_norm": 0.9344161152839661, + "learning_rate": 1.611312546091476e-05, + "loss": 0.727, + "step": 1709 + }, + { + "epoch": 0.8234515138746764, + "grad_norm": 1.014560341835022, + "learning_rate": 1.6028318173444202e-05, + "loss": 0.6555, + "step": 1710 + }, + { + "epoch": 0.8239330644675856, + "grad_norm": 0.5681801438331604, + "learning_rate": 1.594371520724226e-05, + "loss": 0.4587, + "step": 1711 + }, + { + "epoch": 0.8244146150604948, + "grad_norm": 1.0397344827651978, + "learning_rate": 1.5859316768166244e-05, + "loss": 0.4093, + "step": 1712 + }, + { + "epoch": 0.824896165653404, + "grad_norm": 0.8482609987258911, + "learning_rate": 1.5775123061575836e-05, + "loss": 0.7608, + "step": 1713 + }, + { + "epoch": 0.8253777162463132, + "grad_norm": 1.0026599168777466, + "learning_rate": 1.569113429233252e-05, + "loss": 0.4691, + "step": 1714 + }, + { + "epoch": 0.8258592668392223, + "grad_norm": 0.4746100902557373, + "learning_rate": 1.5607350664799157e-05, + "loss": 0.5994, + "step": 1715 + }, + { + "epoch": 0.8263408174321315, + "grad_norm": 0.5753427743911743, + "learning_rate": 1.552377238283943e-05, + "loss": 0.3074, + "step": 1716 + }, + { + "epoch": 0.8268223680250406, + "grad_norm": 0.49973657727241516, + "learning_rate": 1.5440399649817385e-05, + "loss": 0.24, + "step": 1717 + }, + { + "epoch": 0.8273039186179498, + "grad_norm": 0.751624345779419, + "learning_rate": 1.5357232668596933e-05, + "loss": 0.6863, + "step": 1718 + }, + { + "epoch": 0.8277854692108589, + "grad_norm": 0.7189420461654663, + "learning_rate": 1.5274271641541295e-05, + "loss": 0.3238, + "step": 1719 + }, + { + "epoch": 0.8282670198037682, + "grad_norm": 0.7325754761695862, + "learning_rate": 1.5191516770512649e-05, + "loss": 0.5745, + "step": 1720 + }, + { + "epoch": 0.8287485703966773, + "grad_norm": 1.2671748399734497, + "learning_rate": 1.5108968256871437e-05, + "loss": 0.424, + "step": 1721 + }, + { + "epoch": 0.8292301209895865, + "grad_norm": 0.6884575486183167, + "learning_rate": 1.5026626301476087e-05, + "loss": 0.5716, + "step": 1722 + }, + { + "epoch": 0.8297116715824956, + "grad_norm": 1.9384143352508545, + "learning_rate": 1.4944491104682379e-05, + "loss": 0.6953, + "step": 1723 + }, + { + "epoch": 0.8301932221754048, + "grad_norm": 0.7088760733604431, + "learning_rate": 1.4862562866343034e-05, + "loss": 0.48, + "step": 1724 + }, + { + "epoch": 0.8306747727683139, + "grad_norm": 1.2581175565719604, + "learning_rate": 1.4780841785807164e-05, + "loss": 0.6884, + "step": 1725 + }, + { + "epoch": 0.8311563233612231, + "grad_norm": 0.607943058013916, + "learning_rate": 1.4699328061919848e-05, + "loss": 0.4432, + "step": 1726 + }, + { + "epoch": 0.8316378739541324, + "grad_norm": 0.41834723949432373, + "learning_rate": 1.4618021893021605e-05, + "loss": 0.1638, + "step": 1727 + }, + { + "epoch": 0.8321194245470415, + "grad_norm": 0.3146800398826599, + "learning_rate": 1.453692347694794e-05, + "loss": 0.1871, + "step": 1728 + }, + { + "epoch": 0.8326009751399507, + "grad_norm": 0.6401658654212952, + "learning_rate": 1.4456033011028835e-05, + "loss": 0.2788, + "step": 1729 + }, + { + "epoch": 0.8330825257328598, + "grad_norm": 0.47132959961891174, + "learning_rate": 1.437535069208833e-05, + "loss": 0.1804, + "step": 1730 + }, + { + "epoch": 0.833564076325769, + "grad_norm": 0.8911796808242798, + "learning_rate": 1.4294876716443906e-05, + "loss": 0.2055, + "step": 1731 + }, + { + "epoch": 0.8340456269186781, + "grad_norm": 0.9256196618080139, + "learning_rate": 1.4214611279906187e-05, + "loss": 0.2823, + "step": 1732 + }, + { + "epoch": 0.8345271775115873, + "grad_norm": 0.958602249622345, + "learning_rate": 1.4134554577778337e-05, + "loss": 0.2037, + "step": 1733 + }, + { + "epoch": 0.8350087281044964, + "grad_norm": 0.7972809672355652, + "learning_rate": 1.4054706804855634e-05, + "loss": 0.772, + "step": 1734 + }, + { + "epoch": 0.8354902786974057, + "grad_norm": 1.3465790748596191, + "learning_rate": 1.3975068155424976e-05, + "loss": 0.9341, + "step": 1735 + }, + { + "epoch": 0.8359718292903148, + "grad_norm": 1.1047698259353638, + "learning_rate": 1.3895638823264446e-05, + "loss": 0.3291, + "step": 1736 + }, + { + "epoch": 0.836453379883224, + "grad_norm": 0.6534242033958435, + "learning_rate": 1.3816419001642777e-05, + "loss": 0.3871, + "step": 1737 + }, + { + "epoch": 0.8369349304761331, + "grad_norm": 1.1815705299377441, + "learning_rate": 1.3737408883318948e-05, + "loss": 0.3144, + "step": 1738 + }, + { + "epoch": 0.8374164810690423, + "grad_norm": 0.9447285532951355, + "learning_rate": 1.365860866054165e-05, + "loss": 0.3193, + "step": 1739 + }, + { + "epoch": 0.8378980316619515, + "grad_norm": 0.44447964429855347, + "learning_rate": 1.358001852504891e-05, + "loss": 0.2149, + "step": 1740 + }, + { + "epoch": 0.8383795822548606, + "grad_norm": 2.03056001663208, + "learning_rate": 1.3501638668067485e-05, + "loss": 0.6462, + "step": 1741 + }, + { + "epoch": 0.8388611328477699, + "grad_norm": 2.4036343097686768, + "learning_rate": 1.3423469280312562e-05, + "loss": 0.4178, + "step": 1742 + }, + { + "epoch": 0.839342683440679, + "grad_norm": 0.40062493085861206, + "learning_rate": 1.3345510551987128e-05, + "loss": 0.1285, + "step": 1743 + }, + { + "epoch": 0.8398242340335882, + "grad_norm": 0.8005915880203247, + "learning_rate": 1.326776267278167e-05, + "loss": 0.2389, + "step": 1744 + }, + { + "epoch": 0.8403057846264973, + "grad_norm": 0.8333378434181213, + "learning_rate": 1.3190225831873581e-05, + "loss": 0.3824, + "step": 1745 + }, + { + "epoch": 0.8407873352194065, + "grad_norm": 0.5255361795425415, + "learning_rate": 1.3112900217926782e-05, + "loss": 0.5677, + "step": 1746 + }, + { + "epoch": 0.8412688858123156, + "grad_norm": 1.1877179145812988, + "learning_rate": 1.3035786019091223e-05, + "loss": 0.6215, + "step": 1747 + }, + { + "epoch": 0.8417504364052248, + "grad_norm": 1.4006471633911133, + "learning_rate": 1.2958883423002422e-05, + "loss": 0.9087, + "step": 1748 + }, + { + "epoch": 0.842231986998134, + "grad_norm": 1.2297186851501465, + "learning_rate": 1.288219261678103e-05, + "loss": 0.4453, + "step": 1749 + }, + { + "epoch": 0.8427135375910432, + "grad_norm": 0.6854021549224854, + "learning_rate": 1.2805713787032381e-05, + "loss": 0.3185, + "step": 1750 + }, + { + "epoch": 0.8431950881839523, + "grad_norm": 0.47747164964675903, + "learning_rate": 1.2729447119846016e-05, + "loss": 0.2544, + "step": 1751 + }, + { + "epoch": 0.8436766387768615, + "grad_norm": 0.9931098818778992, + "learning_rate": 1.265339280079525e-05, + "loss": 0.5462, + "step": 1752 + }, + { + "epoch": 0.8441581893697706, + "grad_norm": 0.7005502581596375, + "learning_rate": 1.257755101493665e-05, + "loss": 0.387, + "step": 1753 + }, + { + "epoch": 0.8446397399626798, + "grad_norm": 1.0729082822799683, + "learning_rate": 1.2501921946809714e-05, + "loss": 0.2227, + "step": 1754 + }, + { + "epoch": 0.845121290555589, + "grad_norm": 0.8140488862991333, + "learning_rate": 1.2426505780436326e-05, + "loss": 0.3288, + "step": 1755 + }, + { + "epoch": 0.8456028411484982, + "grad_norm": 0.8334264159202576, + "learning_rate": 1.2351302699320332e-05, + "loss": 0.5944, + "step": 1756 + }, + { + "epoch": 0.8460843917414074, + "grad_norm": 1.14798104763031, + "learning_rate": 1.2276312886447106e-05, + "loss": 0.4334, + "step": 1757 + }, + { + "epoch": 0.8465659423343165, + "grad_norm": 0.7713477611541748, + "learning_rate": 1.2201536524283074e-05, + "loss": 0.3076, + "step": 1758 + }, + { + "epoch": 0.8470474929272257, + "grad_norm": 0.1603490561246872, + "learning_rate": 1.2126973794775343e-05, + "loss": 0.2664, + "step": 1759 + }, + { + "epoch": 0.8475290435201348, + "grad_norm": 0.7709634900093079, + "learning_rate": 1.2052624879351104e-05, + "loss": 0.2441, + "step": 1760 + }, + { + "epoch": 0.848010594113044, + "grad_norm": 1.0762697458267212, + "learning_rate": 1.1978489958917382e-05, + "loss": 0.588, + "step": 1761 + }, + { + "epoch": 0.8484921447059531, + "grad_norm": 0.5720497965812683, + "learning_rate": 1.1904569213860472e-05, + "loss": 0.7051, + "step": 1762 + }, + { + "epoch": 0.8489736952988624, + "grad_norm": 0.7364173531532288, + "learning_rate": 1.1830862824045552e-05, + "loss": 0.6339, + "step": 1763 + }, + { + "epoch": 0.8494552458917715, + "grad_norm": 0.6003912687301636, + "learning_rate": 1.1757370968816217e-05, + "loss": 0.4115, + "step": 1764 + }, + { + "epoch": 0.8499367964846807, + "grad_norm": 0.6767928600311279, + "learning_rate": 1.1684093826994024e-05, + "loss": 0.2144, + "step": 1765 + }, + { + "epoch": 0.8504183470775898, + "grad_norm": 0.44587114453315735, + "learning_rate": 1.1611031576878117e-05, + "loss": 0.3204, + "step": 1766 + }, + { + "epoch": 0.850899897670499, + "grad_norm": 0.822036623954773, + "learning_rate": 1.1538184396244778e-05, + "loss": 0.3795, + "step": 1767 + }, + { + "epoch": 0.8513814482634082, + "grad_norm": 0.5842288732528687, + "learning_rate": 1.146555246234694e-05, + "loss": 0.2728, + "step": 1768 + }, + { + "epoch": 0.8518629988563173, + "grad_norm": 1.0580365657806396, + "learning_rate": 1.1393135951913824e-05, + "loss": 0.403, + "step": 1769 + }, + { + "epoch": 0.8523445494492266, + "grad_norm": 0.622951328754425, + "learning_rate": 1.132093504115046e-05, + "loss": 0.3162, + "step": 1770 + }, + { + "epoch": 0.8528261000421357, + "grad_norm": 0.5782291293144226, + "learning_rate": 1.1248949905737283e-05, + "loss": 0.1503, + "step": 1771 + }, + { + "epoch": 0.8533076506350449, + "grad_norm": 0.24924737215042114, + "learning_rate": 1.1177180720829694e-05, + "loss": 0.1229, + "step": 1772 + }, + { + "epoch": 0.853789201227954, + "grad_norm": 1.2123310565948486, + "learning_rate": 1.1105627661057671e-05, + "loss": 0.3468, + "step": 1773 + }, + { + "epoch": 0.8542707518208632, + "grad_norm": 1.210551381111145, + "learning_rate": 1.103429090052528e-05, + "loss": 0.8837, + "step": 1774 + }, + { + "epoch": 0.8547523024137723, + "grad_norm": 0.6423014998435974, + "learning_rate": 1.096317061281027e-05, + "loss": 0.4842, + "step": 1775 + }, + { + "epoch": 0.8552338530066815, + "grad_norm": 0.5587584972381592, + "learning_rate": 1.0892266970963704e-05, + "loss": 0.5458, + "step": 1776 + }, + { + "epoch": 0.8557154035995906, + "grad_norm": 0.895846426486969, + "learning_rate": 1.082158014750948e-05, + "loss": 0.965, + "step": 1777 + }, + { + "epoch": 0.8561969541924999, + "grad_norm": 0.7418329119682312, + "learning_rate": 1.0751110314443958e-05, + "loss": 0.5443, + "step": 1778 + }, + { + "epoch": 0.856678504785409, + "grad_norm": 0.6139920949935913, + "learning_rate": 1.0680857643235431e-05, + "loss": 0.4587, + "step": 1779 + }, + { + "epoch": 0.8571600553783182, + "grad_norm": 0.5721187591552734, + "learning_rate": 1.0610822304823887e-05, + "loss": 0.3514, + "step": 1780 + }, + { + "epoch": 0.8576416059712274, + "grad_norm": 0.5844043493270874, + "learning_rate": 1.0541004469620452e-05, + "loss": 0.6186, + "step": 1781 + }, + { + "epoch": 0.8581231565641365, + "grad_norm": 1.2044621706008911, + "learning_rate": 1.0471404307507016e-05, + "loss": 1.0049, + "step": 1782 + }, + { + "epoch": 0.8586047071570457, + "grad_norm": 1.6948399543762207, + "learning_rate": 1.0402021987835831e-05, + "loss": 0.5595, + "step": 1783 + }, + { + "epoch": 0.8590862577499548, + "grad_norm": 0.6645305752754211, + "learning_rate": 1.0332857679429098e-05, + "loss": 0.4045, + "step": 1784 + }, + { + "epoch": 0.8595678083428641, + "grad_norm": 1.1216586828231812, + "learning_rate": 1.0263911550578531e-05, + "loss": 0.4346, + "step": 1785 + }, + { + "epoch": 0.8600493589357732, + "grad_norm": 0.849108099937439, + "learning_rate": 1.0195183769045013e-05, + "loss": 0.2343, + "step": 1786 + }, + { + "epoch": 0.8605309095286824, + "grad_norm": 0.5021809339523315, + "learning_rate": 1.0126674502058054e-05, + "loss": 0.73, + "step": 1787 + }, + { + "epoch": 0.8610124601215915, + "grad_norm": 1.3319389820098877, + "learning_rate": 1.005838391631555e-05, + "loss": 0.4096, + "step": 1788 + }, + { + "epoch": 0.8614940107145007, + "grad_norm": 0.4985010027885437, + "learning_rate": 9.990312177983263e-06, + "loss": 0.4735, + "step": 1789 + }, + { + "epoch": 0.8619755613074098, + "grad_norm": 1.0153026580810547, + "learning_rate": 9.922459452694466e-06, + "loss": 0.6298, + "step": 1790 + }, + { + "epoch": 0.862457111900319, + "grad_norm": 0.3122901916503906, + "learning_rate": 9.854825905549503e-06, + "loss": 0.4888, + "step": 1791 + }, + { + "epoch": 0.8629386624932281, + "grad_norm": 0.5830950736999512, + "learning_rate": 9.787411701115456e-06, + "loss": 0.1771, + "step": 1792 + }, + { + "epoch": 0.8634202130861374, + "grad_norm": 0.8445716500282288, + "learning_rate": 9.720217003425647e-06, + "loss": 0.6008, + "step": 1793 + }, + { + "epoch": 0.8639017636790465, + "grad_norm": 0.5693627595901489, + "learning_rate": 9.65324197597931e-06, + "loss": 0.2022, + "step": 1794 + }, + { + "epoch": 0.8643833142719557, + "grad_norm": 0.5486375689506531, + "learning_rate": 9.58648678174121e-06, + "loss": 0.3731, + "step": 1795 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.6075655221939087, + "learning_rate": 9.51995158314113e-06, + "loss": 0.491, + "step": 1796 + }, + { + "epoch": 0.865346415457774, + "grad_norm": 3.1324307918548584, + "learning_rate": 9.45363654207363e-06, + "loss": 0.5391, + "step": 1797 + }, + { + "epoch": 0.8658279660506832, + "grad_norm": 0.23801574110984802, + "learning_rate": 9.387541819897549e-06, + "loss": 0.1798, + "step": 1798 + }, + { + "epoch": 0.8663095166435923, + "grad_norm": 0.5884119868278503, + "learning_rate": 9.321667577435634e-06, + "loss": 0.4136, + "step": 1799 + }, + { + "epoch": 0.8667910672365016, + "grad_norm": 0.7041465044021606, + "learning_rate": 9.256013974974175e-06, + "loss": 0.4429, + "step": 1800 + }, + { + "epoch": 0.8672726178294107, + "grad_norm": 1.2660185098648071, + "learning_rate": 9.19058117226258e-06, + "loss": 0.6472, + "step": 1801 + }, + { + "epoch": 0.8677541684223199, + "grad_norm": 0.41669484972953796, + "learning_rate": 9.125369328513034e-06, + "loss": 0.5933, + "step": 1802 + }, + { + "epoch": 0.868235719015229, + "grad_norm": 1.2108508348464966, + "learning_rate": 9.060378602400054e-06, + "loss": 0.3878, + "step": 1803 + }, + { + "epoch": 0.8687172696081382, + "grad_norm": 0.5257241129875183, + "learning_rate": 8.995609152060136e-06, + "loss": 0.5695, + "step": 1804 + }, + { + "epoch": 0.8691988202010473, + "grad_norm": 0.4414861798286438, + "learning_rate": 8.931061135091357e-06, + "loss": 0.498, + "step": 1805 + }, + { + "epoch": 0.8696803707939565, + "grad_norm": 0.5597625970840454, + "learning_rate": 8.866734708553015e-06, + "loss": 0.7974, + "step": 1806 + }, + { + "epoch": 0.8701619213868657, + "grad_norm": 0.9208630919456482, + "learning_rate": 8.802630028965242e-06, + "loss": 0.3137, + "step": 1807 + }, + { + "epoch": 0.8706434719797749, + "grad_norm": 0.8461756110191345, + "learning_rate": 8.738747252308555e-06, + "loss": 0.2183, + "step": 1808 + }, + { + "epoch": 0.8711250225726841, + "grad_norm": 0.26984959840774536, + "learning_rate": 8.675086534023591e-06, + "loss": 0.2796, + "step": 1809 + }, + { + "epoch": 0.8716065731655932, + "grad_norm": 0.6333009004592896, + "learning_rate": 8.611648029010643e-06, + "loss": 0.2461, + "step": 1810 + }, + { + "epoch": 0.8720881237585024, + "grad_norm": 0.40087515115737915, + "learning_rate": 8.548431891629316e-06, + "loss": 0.2758, + "step": 1811 + }, + { + "epoch": 0.8725696743514115, + "grad_norm": 2.099525213241577, + "learning_rate": 8.485438275698154e-06, + "loss": 0.3495, + "step": 1812 + }, + { + "epoch": 0.8730512249443207, + "grad_norm": 1.6879249811172485, + "learning_rate": 8.422667334494249e-06, + "loss": 0.6519, + "step": 1813 + }, + { + "epoch": 0.8735327755372299, + "grad_norm": 1.2098280191421509, + "learning_rate": 8.360119220752893e-06, + "loss": 0.2281, + "step": 1814 + }, + { + "epoch": 0.8740143261301391, + "grad_norm": 1.2406582832336426, + "learning_rate": 8.297794086667165e-06, + "loss": 0.7037, + "step": 1815 + }, + { + "epoch": 0.8744958767230482, + "grad_norm": 1.2510970830917358, + "learning_rate": 8.235692083887613e-06, + "loss": 0.3557, + "step": 1816 + }, + { + "epoch": 0.8749774273159574, + "grad_norm": 0.9780548214912415, + "learning_rate": 8.173813363521843e-06, + "loss": 0.9645, + "step": 1817 + }, + { + "epoch": 0.8754589779088665, + "grad_norm": 0.6426886320114136, + "learning_rate": 8.112158076134157e-06, + "loss": 0.5133, + "step": 1818 + }, + { + "epoch": 0.8759405285017757, + "grad_norm": 0.45545464754104614, + "learning_rate": 8.05072637174522e-06, + "loss": 0.5361, + "step": 1819 + }, + { + "epoch": 0.8764220790946848, + "grad_norm": 0.5086826682090759, + "learning_rate": 7.989518399831641e-06, + "loss": 0.4464, + "step": 1820 + }, + { + "epoch": 0.876903629687594, + "grad_norm": 0.5799944400787354, + "learning_rate": 7.928534309325675e-06, + "loss": 0.5032, + "step": 1821 + }, + { + "epoch": 0.8773851802805033, + "grad_norm": 0.3758942782878876, + "learning_rate": 7.8677742486148e-06, + "loss": 0.3161, + "step": 1822 + }, + { + "epoch": 0.8778667308734124, + "grad_norm": 0.4136337637901306, + "learning_rate": 7.807238365541391e-06, + "loss": 0.1697, + "step": 1823 + }, + { + "epoch": 0.8783482814663216, + "grad_norm": 0.48314744234085083, + "learning_rate": 7.746926807402344e-06, + "loss": 0.362, + "step": 1824 + }, + { + "epoch": 0.8788298320592307, + "grad_norm": 1.0465835332870483, + "learning_rate": 7.686839720948736e-06, + "loss": 0.8487, + "step": 1825 + }, + { + "epoch": 0.8793113826521399, + "grad_norm": 0.528516948223114, + "learning_rate": 7.6269772523854365e-06, + "loss": 0.1934, + "step": 1826 + }, + { + "epoch": 0.879792933245049, + "grad_norm": 0.8582021594047546, + "learning_rate": 7.567339547370789e-06, + "loss": 0.414, + "step": 1827 + }, + { + "epoch": 0.8802744838379583, + "grad_norm": 0.6423068046569824, + "learning_rate": 7.507926751016248e-06, + "loss": 0.4208, + "step": 1828 + }, + { + "epoch": 0.8807560344308674, + "grad_norm": 0.6383021473884583, + "learning_rate": 7.4487390078859855e-06, + "loss": 0.4822, + "step": 1829 + }, + { + "epoch": 0.8812375850237766, + "grad_norm": 0.41269204020500183, + "learning_rate": 7.389776461996578e-06, + "loss": 0.4108, + "step": 1830 + }, + { + "epoch": 0.8817191356166857, + "grad_norm": 0.9470385909080505, + "learning_rate": 7.331039256816663e-06, + "loss": 0.7163, + "step": 1831 + }, + { + "epoch": 0.8822006862095949, + "grad_norm": 0.9979108572006226, + "learning_rate": 7.27252753526656e-06, + "loss": 0.6167, + "step": 1832 + }, + { + "epoch": 0.882682236802504, + "grad_norm": 1.4892247915267944, + "learning_rate": 7.214241439717962e-06, + "loss": 0.4468, + "step": 1833 + }, + { + "epoch": 0.8831637873954132, + "grad_norm": 0.48063895106315613, + "learning_rate": 7.1561811119935425e-06, + "loss": 0.2955, + "step": 1834 + }, + { + "epoch": 0.8836453379883223, + "grad_norm": 0.7973853945732117, + "learning_rate": 7.098346693366642e-06, + "loss": 0.4488, + "step": 1835 + }, + { + "epoch": 0.8841268885812316, + "grad_norm": 0.48340585827827454, + "learning_rate": 7.0407383245609136e-06, + "loss": 0.1981, + "step": 1836 + }, + { + "epoch": 0.8846084391741408, + "grad_norm": 0.35652226209640503, + "learning_rate": 6.983356145749975e-06, + "loss": 0.224, + "step": 1837 + }, + { + "epoch": 0.8850899897670499, + "grad_norm": 0.7322549223899841, + "learning_rate": 6.9262002965570835e-06, + "loss": 0.4638, + "step": 1838 + }, + { + "epoch": 0.8855715403599591, + "grad_norm": 0.850190281867981, + "learning_rate": 6.869270916054782e-06, + "loss": 0.4978, + "step": 1839 + }, + { + "epoch": 0.8860530909528682, + "grad_norm": 0.9290736317634583, + "learning_rate": 6.812568142764575e-06, + "loss": 0.3012, + "step": 1840 + }, + { + "epoch": 0.8865346415457774, + "grad_norm": 1.7840392589569092, + "learning_rate": 6.756092114656587e-06, + "loss": 0.5285, + "step": 1841 + }, + { + "epoch": 0.8870161921386865, + "grad_norm": 0.8821878433227539, + "learning_rate": 6.699842969149195e-06, + "loss": 0.3903, + "step": 1842 + }, + { + "epoch": 0.8874977427315958, + "grad_norm": 0.676434338092804, + "learning_rate": 6.64382084310875e-06, + "loss": 0.4447, + "step": 1843 + }, + { + "epoch": 0.8879792933245049, + "grad_norm": 0.6128348112106323, + "learning_rate": 6.5880258728491905e-06, + "loss": 0.255, + "step": 1844 + }, + { + "epoch": 0.8884608439174141, + "grad_norm": 0.8936730027198792, + "learning_rate": 6.532458194131763e-06, + "loss": 0.6179, + "step": 1845 + }, + { + "epoch": 0.8889423945103232, + "grad_norm": 0.9091479778289795, + "learning_rate": 6.477117942164657e-06, + "loss": 0.2722, + "step": 1846 + }, + { + "epoch": 0.8894239451032324, + "grad_norm": 0.29884129762649536, + "learning_rate": 6.422005251602658e-06, + "loss": 0.2622, + "step": 1847 + }, + { + "epoch": 0.8899054956961415, + "grad_norm": 0.5838947892189026, + "learning_rate": 6.367120256546888e-06, + "loss": 0.2134, + "step": 1848 + }, + { + "epoch": 0.8903870462890507, + "grad_norm": 0.4674443304538727, + "learning_rate": 6.312463090544396e-06, + "loss": 0.3128, + "step": 1849 + }, + { + "epoch": 0.89086859688196, + "grad_norm": 0.4946456849575043, + "learning_rate": 6.258033886587911e-06, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.8913501474748691, + "grad_norm": 0.7695437669754028, + "learning_rate": 6.2038327771154485e-06, + "loss": 0.2637, + "step": 1851 + }, + { + "epoch": 0.8918316980677783, + "grad_norm": 0.6477566361427307, + "learning_rate": 6.1498598940100346e-06, + "loss": 0.2117, + "step": 1852 + }, + { + "epoch": 0.8923132486606874, + "grad_norm": 0.4521937072277069, + "learning_rate": 6.0961153685993646e-06, + "loss": 0.2862, + "step": 1853 + }, + { + "epoch": 0.8927947992535966, + "grad_norm": 0.6747299432754517, + "learning_rate": 6.0425993316554965e-06, + "loss": 0.1795, + "step": 1854 + }, + { + "epoch": 0.8932763498465057, + "grad_norm": 0.3627295196056366, + "learning_rate": 5.989311913394546e-06, + "loss": 0.2202, + "step": 1855 + }, + { + "epoch": 0.8937579004394149, + "grad_norm": 0.7173798680305481, + "learning_rate": 5.93625324347632e-06, + "loss": 0.2639, + "step": 1856 + }, + { + "epoch": 0.894239451032324, + "grad_norm": 1.4216406345367432, + "learning_rate": 5.8834234510040335e-06, + "loss": 0.6489, + "step": 1857 + }, + { + "epoch": 0.8947210016252333, + "grad_norm": 1.0960854291915894, + "learning_rate": 5.830822664523994e-06, + "loss": 0.1812, + "step": 1858 + }, + { + "epoch": 0.8952025522181424, + "grad_norm": 1.004361867904663, + "learning_rate": 5.77845101202531e-06, + "loss": 0.6176, + "step": 1859 + }, + { + "epoch": 0.8956841028110516, + "grad_norm": 1.0470167398452759, + "learning_rate": 5.726308620939536e-06, + "loss": 0.3436, + "step": 1860 + }, + { + "epoch": 0.8961656534039607, + "grad_norm": 0.4212130606174469, + "learning_rate": 5.674395618140393e-06, + "loss": 0.1362, + "step": 1861 + }, + { + "epoch": 0.8966472039968699, + "grad_norm": 0.5821614265441895, + "learning_rate": 5.622712129943453e-06, + "loss": 0.1701, + "step": 1862 + }, + { + "epoch": 0.8971287545897791, + "grad_norm": 0.48322948813438416, + "learning_rate": 5.571258282105829e-06, + "loss": 0.6155, + "step": 1863 + }, + { + "epoch": 0.8976103051826883, + "grad_norm": 0.8195532560348511, + "learning_rate": 5.520034199825841e-06, + "loss": 0.8582, + "step": 1864 + }, + { + "epoch": 0.8980918557755975, + "grad_norm": 0.6533116698265076, + "learning_rate": 5.469040007742776e-06, + "loss": 0.3581, + "step": 1865 + }, + { + "epoch": 0.8985734063685066, + "grad_norm": 1.3745543956756592, + "learning_rate": 5.418275829936537e-06, + "loss": 0.87, + "step": 1866 + }, + { + "epoch": 0.8990549569614158, + "grad_norm": 0.5306475758552551, + "learning_rate": 5.36774178992735e-06, + "loss": 0.3363, + "step": 1867 + }, + { + "epoch": 0.8995365075543249, + "grad_norm": 0.37541911005973816, + "learning_rate": 5.317438010675469e-06, + "loss": 0.2695, + "step": 1868 + }, + { + "epoch": 0.9000180581472341, + "grad_norm": 1.9978303909301758, + "learning_rate": 5.267364614580861e-06, + "loss": 0.1739, + "step": 1869 + }, + { + "epoch": 0.9004996087401432, + "grad_norm": 0.6680489778518677, + "learning_rate": 5.217521723482943e-06, + "loss": 0.2017, + "step": 1870 + }, + { + "epoch": 0.9009811593330525, + "grad_norm": 1.0578739643096924, + "learning_rate": 5.167909458660258e-06, + "loss": 0.6478, + "step": 1871 + }, + { + "epoch": 0.9014627099259616, + "grad_norm": 0.6430098414421082, + "learning_rate": 5.118527940830165e-06, + "loss": 0.4319, + "step": 1872 + }, + { + "epoch": 0.9019442605188708, + "grad_norm": 1.0397404432296753, + "learning_rate": 5.069377290148602e-06, + "loss": 0.2738, + "step": 1873 + }, + { + "epoch": 0.9024258111117799, + "grad_norm": 0.6406477689743042, + "learning_rate": 5.020457626209707e-06, + "loss": 0.2009, + "step": 1874 + }, + { + "epoch": 0.9029073617046891, + "grad_norm": 0.5656185150146484, + "learning_rate": 4.971769068045628e-06, + "loss": 0.8286, + "step": 1875 + }, + { + "epoch": 0.9033889122975982, + "grad_norm": 0.6659497618675232, + "learning_rate": 4.923311734126135e-06, + "loss": 0.3415, + "step": 1876 + }, + { + "epoch": 0.9038704628905074, + "grad_norm": 0.3788612484931946, + "learning_rate": 4.875085742358432e-06, + "loss": 0.3788, + "step": 1877 + }, + { + "epoch": 0.9043520134834167, + "grad_norm": 0.4597952365875244, + "learning_rate": 4.827091210086776e-06, + "loss": 0.4112, + "step": 1878 + }, + { + "epoch": 0.9048335640763258, + "grad_norm": 0.4560735821723938, + "learning_rate": 4.779328254092252e-06, + "loss": 0.8542, + "step": 1879 + }, + { + "epoch": 0.905315114669235, + "grad_norm": 0.4813135266304016, + "learning_rate": 4.731796990592452e-06, + "loss": 0.3249, + "step": 1880 + }, + { + "epoch": 0.9057966652621441, + "grad_norm": 0.463824063539505, + "learning_rate": 4.68449753524125e-06, + "loss": 0.6352, + "step": 1881 + }, + { + "epoch": 0.9062782158550533, + "grad_norm": 0.5444586873054504, + "learning_rate": 4.637430003128429e-06, + "loss": 0.3455, + "step": 1882 + }, + { + "epoch": 0.9067597664479624, + "grad_norm": 1.5401670932769775, + "learning_rate": 4.5905945087794996e-06, + "loss": 0.409, + "step": 1883 + }, + { + "epoch": 0.9072413170408716, + "grad_norm": 0.5584079027175903, + "learning_rate": 4.543991166155337e-06, + "loss": 0.8034, + "step": 1884 + }, + { + "epoch": 0.9077228676337807, + "grad_norm": 1.1037728786468506, + "learning_rate": 4.497620088651966e-06, + "loss": 0.373, + "step": 1885 + }, + { + "epoch": 0.90820441822669, + "grad_norm": 0.3681987524032593, + "learning_rate": 4.451481389100232e-06, + "loss": 0.2953, + "step": 1886 + }, + { + "epoch": 0.9086859688195991, + "grad_norm": 0.8980375528335571, + "learning_rate": 4.405575179765586e-06, + "loss": 0.3407, + "step": 1887 + }, + { + "epoch": 0.9091675194125083, + "grad_norm": 0.8619612455368042, + "learning_rate": 4.359901572347758e-06, + "loss": 0.5184, + "step": 1888 + }, + { + "epoch": 0.9096490700054174, + "grad_norm": 1.0720115900039673, + "learning_rate": 4.314460677980537e-06, + "loss": 0.4286, + "step": 1889 + }, + { + "epoch": 0.9101306205983266, + "grad_norm": 0.4389863908290863, + "learning_rate": 4.269252607231422e-06, + "loss": 0.3191, + "step": 1890 + }, + { + "epoch": 0.9106121711912358, + "grad_norm": 1.1555299758911133, + "learning_rate": 4.224277470101445e-06, + "loss": 0.5738, + "step": 1891 + }, + { + "epoch": 0.9110937217841449, + "grad_norm": 0.8084120750427246, + "learning_rate": 4.179535376024857e-06, + "loss": 0.5808, + "step": 1892 + }, + { + "epoch": 0.9115752723770542, + "grad_norm": 1.1835227012634277, + "learning_rate": 4.135026433868827e-06, + "loss": 0.7092, + "step": 1893 + }, + { + "epoch": 0.9120568229699633, + "grad_norm": 0.6801626086235046, + "learning_rate": 4.090750751933248e-06, + "loss": 0.8846, + "step": 1894 + }, + { + "epoch": 0.9125383735628725, + "grad_norm": 1.4253311157226562, + "learning_rate": 4.046708437950464e-06, + "loss": 0.3153, + "step": 1895 + }, + { + "epoch": 0.9130199241557816, + "grad_norm": 0.9780870079994202, + "learning_rate": 4.0028995990849084e-06, + "loss": 0.4197, + "step": 1896 + }, + { + "epoch": 0.9135014747486908, + "grad_norm": 0.8115230202674866, + "learning_rate": 3.95932434193299e-06, + "loss": 0.3357, + "step": 1897 + }, + { + "epoch": 0.9139830253415999, + "grad_norm": 1.1243044137954712, + "learning_rate": 3.915982772522719e-06, + "loss": 0.4661, + "step": 1898 + }, + { + "epoch": 0.9144645759345091, + "grad_norm": 0.6497891545295715, + "learning_rate": 3.872874996313513e-06, + "loss": 0.6243, + "step": 1899 + }, + { + "epoch": 0.9149461265274182, + "grad_norm": 0.4858989715576172, + "learning_rate": 3.830001118195936e-06, + "loss": 0.3582, + "step": 1900 + }, + { + "epoch": 0.9154276771203275, + "grad_norm": 0.6466355323791504, + "learning_rate": 3.787361242491394e-06, + "loss": 0.2838, + "step": 1901 + }, + { + "epoch": 0.9159092277132366, + "grad_norm": 0.3234995901584625, + "learning_rate": 3.744955472951928e-06, + "loss": 0.1623, + "step": 1902 + }, + { + "epoch": 0.9163907783061458, + "grad_norm": 0.8426408171653748, + "learning_rate": 3.702783912759955e-06, + "loss": 0.3511, + "step": 1903 + }, + { + "epoch": 0.916872328899055, + "grad_norm": 0.4413106143474579, + "learning_rate": 3.660846664528006e-06, + "loss": 0.3376, + "step": 1904 + }, + { + "epoch": 0.9173538794919641, + "grad_norm": 0.6131982207298279, + "learning_rate": 3.6191438302984772e-06, + "loss": 0.4348, + "step": 1905 + }, + { + "epoch": 0.9178354300848733, + "grad_norm": 0.5771300196647644, + "learning_rate": 3.577675511543388e-06, + "loss": 0.4118, + "step": 1906 + }, + { + "epoch": 0.9183169806777824, + "grad_norm": 0.8268334269523621, + "learning_rate": 3.5364418091641373e-06, + "loss": 0.7767, + "step": 1907 + }, + { + "epoch": 0.9187985312706917, + "grad_norm": 0.46120405197143555, + "learning_rate": 3.495442823491224e-06, + "loss": 0.3307, + "step": 1908 + }, + { + "epoch": 0.9192800818636008, + "grad_norm": 1.0451537370681763, + "learning_rate": 3.4546786542840605e-06, + "loss": 0.404, + "step": 1909 + }, + { + "epoch": 0.91976163245651, + "grad_norm": 0.5279560685157776, + "learning_rate": 3.4141494007306816e-06, + "loss": 0.264, + "step": 1910 + }, + { + "epoch": 0.9202431830494191, + "grad_norm": 0.7413983345031738, + "learning_rate": 3.373855161447548e-06, + "loss": 0.846, + "step": 1911 + }, + { + "epoch": 0.9207247336423283, + "grad_norm": 0.7785257697105408, + "learning_rate": 3.333796034479242e-06, + "loss": 0.3309, + "step": 1912 + }, + { + "epoch": 0.9212062842352374, + "grad_norm": 0.3095743656158447, + "learning_rate": 3.293972117298294e-06, + "loss": 0.3359, + "step": 1913 + }, + { + "epoch": 0.9216878348281466, + "grad_norm": 0.7517334222793579, + "learning_rate": 3.2543835068049255e-06, + "loss": 0.3582, + "step": 1914 + }, + { + "epoch": 0.9221693854210558, + "grad_norm": 0.7982280850410461, + "learning_rate": 3.21503029932676e-06, + "loss": 0.3456, + "step": 1915 + }, + { + "epoch": 0.922650936013965, + "grad_norm": 0.4703887403011322, + "learning_rate": 3.1759125906186793e-06, + "loss": 0.2288, + "step": 1916 + }, + { + "epoch": 0.9231324866068741, + "grad_norm": 0.740643322467804, + "learning_rate": 3.137030475862535e-06, + "loss": 0.5485, + "step": 1917 + }, + { + "epoch": 0.9236140371997833, + "grad_norm": 0.40739530324935913, + "learning_rate": 3.098384049666925e-06, + "loss": 0.2052, + "step": 1918 + }, + { + "epoch": 0.9240955877926925, + "grad_norm": 0.3366871178150177, + "learning_rate": 3.059973406066963e-06, + "loss": 0.222, + "step": 1919 + }, + { + "epoch": 0.9245771383856016, + "grad_norm": 1.84107506275177, + "learning_rate": 3.0217986385240537e-06, + "loss": 0.2745, + "step": 1920 + }, + { + "epoch": 0.9250586889785108, + "grad_norm": 0.5755985379219055, + "learning_rate": 2.983859839925662e-06, + "loss": 0.43, + "step": 1921 + }, + { + "epoch": 0.92554023957142, + "grad_norm": 0.5986522436141968, + "learning_rate": 2.94615710258509e-06, + "loss": 0.5719, + "step": 1922 + }, + { + "epoch": 0.9260217901643292, + "grad_norm": 0.8219228982925415, + "learning_rate": 2.908690518241275e-06, + "loss": 0.4841, + "step": 1923 + }, + { + "epoch": 0.9265033407572383, + "grad_norm": 0.6398215889930725, + "learning_rate": 2.8714601780584937e-06, + "loss": 0.2369, + "step": 1924 + }, + { + "epoch": 0.9269848913501475, + "grad_norm": 0.31903043389320374, + "learning_rate": 2.834466172626238e-06, + "loss": 0.152, + "step": 1925 + }, + { + "epoch": 0.9274664419430566, + "grad_norm": 0.7458855509757996, + "learning_rate": 2.7977085919589254e-06, + "loss": 0.3954, + "step": 1926 + }, + { + "epoch": 0.9279479925359658, + "grad_norm": 0.3328702747821808, + "learning_rate": 2.76118752549569e-06, + "loss": 0.1089, + "step": 1927 + }, + { + "epoch": 0.9284295431288749, + "grad_norm": 0.7288302779197693, + "learning_rate": 2.7249030621001924e-06, + "loss": 0.7062, + "step": 1928 + }, + { + "epoch": 0.9289110937217842, + "grad_norm": 0.9394994974136353, + "learning_rate": 2.688855290060399e-06, + "loss": 0.5547, + "step": 1929 + }, + { + "epoch": 0.9293926443146933, + "grad_norm": 0.6995114684104919, + "learning_rate": 2.653044297088314e-06, + "loss": 0.5769, + "step": 1930 + }, + { + "epoch": 0.9298741949076025, + "grad_norm": 0.5161604881286621, + "learning_rate": 2.6174701703198468e-06, + "loss": 0.2498, + "step": 1931 + }, + { + "epoch": 0.9303557455005117, + "grad_norm": 0.6070406436920166, + "learning_rate": 2.5821329963145347e-06, + "loss": 0.4596, + "step": 1932 + }, + { + "epoch": 0.9308372960934208, + "grad_norm": 0.48303160071372986, + "learning_rate": 2.547032861055376e-06, + "loss": 0.2382, + "step": 1933 + }, + { + "epoch": 0.93131884668633, + "grad_norm": 0.4752054214477539, + "learning_rate": 2.5121698499485757e-06, + "loss": 0.5855, + "step": 1934 + }, + { + "epoch": 0.9318003972792391, + "grad_norm": 0.5160010457038879, + "learning_rate": 2.4775440478233993e-06, + "loss": 0.5004, + "step": 1935 + }, + { + "epoch": 0.9322819478721484, + "grad_norm": 0.6929358839988708, + "learning_rate": 2.4431555389319074e-06, + "loss": 0.4168, + "step": 1936 + }, + { + "epoch": 0.9327634984650575, + "grad_norm": 0.6429789662361145, + "learning_rate": 2.4090044069487784e-06, + "loss": 0.7282, + "step": 1937 + }, + { + "epoch": 0.9332450490579667, + "grad_norm": 0.5752772092819214, + "learning_rate": 2.3750907349711084e-06, + "loss": 0.1861, + "step": 1938 + }, + { + "epoch": 0.9337265996508758, + "grad_norm": 0.8524832725524902, + "learning_rate": 2.3414146055182106e-06, + "loss": 0.6503, + "step": 1939 + }, + { + "epoch": 0.934208150243785, + "grad_norm": 0.7602763772010803, + "learning_rate": 2.307976100531384e-06, + "loss": 0.2954, + "step": 1940 + }, + { + "epoch": 0.9346897008366941, + "grad_norm": 0.5198546648025513, + "learning_rate": 2.274775301373744e-06, + "loss": 0.482, + "step": 1941 + }, + { + "epoch": 0.9351712514296033, + "grad_norm": 0.5872153639793396, + "learning_rate": 2.241812288830003e-06, + "loss": 0.6061, + "step": 1942 + }, + { + "epoch": 0.9356528020225124, + "grad_norm": 0.7004355192184448, + "learning_rate": 2.2090871431063253e-06, + "loss": 0.2804, + "step": 1943 + }, + { + "epoch": 0.9361343526154217, + "grad_norm": 1.5988649129867554, + "learning_rate": 2.176599943830071e-06, + "loss": 0.468, + "step": 1944 + }, + { + "epoch": 0.9366159032083309, + "grad_norm": 0.49101322889328003, + "learning_rate": 2.144350770049597e-06, + "loss": 0.1687, + "step": 1945 + }, + { + "epoch": 0.93709745380124, + "grad_norm": 0.48457643389701843, + "learning_rate": 2.112339700234156e-06, + "loss": 0.4094, + "step": 1946 + }, + { + "epoch": 0.9375790043941492, + "grad_norm": 0.8701443076133728, + "learning_rate": 2.0805668122735767e-06, + "loss": 0.3141, + "step": 1947 + }, + { + "epoch": 0.9380605549870583, + "grad_norm": 0.5563820004463196, + "learning_rate": 2.0490321834781833e-06, + "loss": 0.417, + "step": 1948 + }, + { + "epoch": 0.9385421055799675, + "grad_norm": 0.48042789101600647, + "learning_rate": 2.0177358905785537e-06, + "loss": 0.4803, + "step": 1949 + }, + { + "epoch": 0.9390236561728766, + "grad_norm": 0.5688298940658569, + "learning_rate": 1.986678009725329e-06, + "loss": 0.5181, + "step": 1950 + }, + { + "epoch": 0.9395052067657859, + "grad_norm": 0.595424473285675, + "learning_rate": 1.955858616489059e-06, + "loss": 0.8045, + "step": 1951 + }, + { + "epoch": 0.939986757358695, + "grad_norm": 0.9292742609977722, + "learning_rate": 1.9252777858599915e-06, + "loss": 0.6183, + "step": 1952 + }, + { + "epoch": 0.9404683079516042, + "grad_norm": 0.6292850971221924, + "learning_rate": 1.8949355922479151e-06, + "loss": 0.2678, + "step": 1953 + }, + { + "epoch": 0.9409498585445133, + "grad_norm": 0.8005472421646118, + "learning_rate": 1.8648321094819287e-06, + "loss": 0.5892, + "step": 1954 + }, + { + "epoch": 0.9414314091374225, + "grad_norm": 0.6208492517471313, + "learning_rate": 1.8349674108103288e-06, + "loss": 0.5731, + "step": 1955 + }, + { + "epoch": 0.9419129597303316, + "grad_norm": 0.5404171943664551, + "learning_rate": 1.8053415689003872e-06, + "loss": 0.4163, + "step": 1956 + }, + { + "epoch": 0.9423945103232408, + "grad_norm": 0.6060441732406616, + "learning_rate": 1.7759546558381967e-06, + "loss": 0.3149, + "step": 1957 + }, + { + "epoch": 0.94287606091615, + "grad_norm": 0.8346067070960999, + "learning_rate": 1.7468067431284707e-06, + "loss": 0.1014, + "step": 1958 + }, + { + "epoch": 0.9433576115090592, + "grad_norm": 0.6417046189308167, + "learning_rate": 1.7178979016943764e-06, + "loss": 0.3301, + "step": 1959 + }, + { + "epoch": 0.9438391621019684, + "grad_norm": 1.0229589939117432, + "learning_rate": 1.6892282018773908e-06, + "loss": 0.2667, + "step": 1960 + }, + { + "epoch": 0.9443207126948775, + "grad_norm": 0.6257058382034302, + "learning_rate": 1.6607977134370789e-06, + "loss": 0.583, + "step": 1961 + }, + { + "epoch": 0.9448022632877867, + "grad_norm": 1.0860248804092407, + "learning_rate": 1.6326065055510043e-06, + "loss": 0.7689, + "step": 1962 + }, + { + "epoch": 0.9452838138806958, + "grad_norm": 0.5820235013961792, + "learning_rate": 1.6046546468144407e-06, + "loss": 0.5898, + "step": 1963 + }, + { + "epoch": 0.945765364473605, + "grad_norm": 0.7211606502532959, + "learning_rate": 1.576942205240317e-06, + "loss": 0.4067, + "step": 1964 + }, + { + "epoch": 0.9462469150665141, + "grad_norm": 1.3478176593780518, + "learning_rate": 1.5494692482590057e-06, + "loss": 0.3179, + "step": 1965 + }, + { + "epoch": 0.9467284656594234, + "grad_norm": 0.4095667004585266, + "learning_rate": 1.522235842718156e-06, + "loss": 0.6086, + "step": 1966 + }, + { + "epoch": 0.9472100162523325, + "grad_norm": 0.5819193720817566, + "learning_rate": 1.4952420548825285e-06, + "loss": 0.1797, + "step": 1967 + }, + { + "epoch": 0.9476915668452417, + "grad_norm": 1.0790724754333496, + "learning_rate": 1.468487950433839e-06, + "loss": 0.5037, + "step": 1968 + }, + { + "epoch": 0.9481731174381508, + "grad_norm": 0.7441384792327881, + "learning_rate": 1.441973594470636e-06, + "loss": 0.4661, + "step": 1969 + }, + { + "epoch": 0.94865466803106, + "grad_norm": 0.6784555912017822, + "learning_rate": 1.415699051508068e-06, + "loss": 0.2276, + "step": 1970 + }, + { + "epoch": 0.9491362186239691, + "grad_norm": 0.5277613401412964, + "learning_rate": 1.3896643854777847e-06, + "loss": 0.4516, + "step": 1971 + }, + { + "epoch": 0.9496177692168783, + "grad_norm": 1.063583493232727, + "learning_rate": 1.3638696597277679e-06, + "loss": 0.1294, + "step": 1972 + }, + { + "epoch": 0.9500993198097876, + "grad_norm": 0.5436953902244568, + "learning_rate": 1.3383149370221449e-06, + "loss": 0.3495, + "step": 1973 + }, + { + "epoch": 0.9505808704026967, + "grad_norm": 1.130716323852539, + "learning_rate": 1.313000279541121e-06, + "loss": 0.3169, + "step": 1974 + }, + { + "epoch": 0.9510624209956059, + "grad_norm": 0.40417709946632385, + "learning_rate": 1.287925748880703e-06, + "loss": 0.3036, + "step": 1975 + }, + { + "epoch": 0.951543971588515, + "grad_norm": 1.2000254392623901, + "learning_rate": 1.2630914060526522e-06, + "loss": 0.6481, + "step": 1976 + }, + { + "epoch": 0.9520255221814242, + "grad_norm": 0.7881247997283936, + "learning_rate": 1.2384973114843101e-06, + "loss": 0.4756, + "step": 1977 + }, + { + "epoch": 0.9525070727743333, + "grad_norm": 0.2699849009513855, + "learning_rate": 1.2141435250184185e-06, + "loss": 0.0586, + "step": 1978 + }, + { + "epoch": 0.9529886233672425, + "grad_norm": 1.3528825044631958, + "learning_rate": 1.1900301059130093e-06, + "loss": 0.5126, + "step": 1979 + }, + { + "epoch": 0.9534701739601517, + "grad_norm": 1.0127407312393188, + "learning_rate": 1.1661571128412596e-06, + "loss": 0.3071, + "step": 1980 + }, + { + "epoch": 0.9539517245530609, + "grad_norm": 0.7234674692153931, + "learning_rate": 1.142524603891315e-06, + "loss": 0.7738, + "step": 1981 + }, + { + "epoch": 0.95443327514597, + "grad_norm": 0.5042656660079956, + "learning_rate": 1.1191326365661892e-06, + "loss": 0.5241, + "step": 1982 + }, + { + "epoch": 0.9549148257388792, + "grad_norm": 0.4042523503303528, + "learning_rate": 1.0959812677835968e-06, + "loss": 0.2122, + "step": 1983 + }, + { + "epoch": 0.9553963763317883, + "grad_norm": 1.0588494539260864, + "learning_rate": 1.0730705538758322e-06, + "loss": 0.7485, + "step": 1984 + }, + { + "epoch": 0.9558779269246975, + "grad_norm": 0.6784031391143799, + "learning_rate": 1.0504005505896141e-06, + "loss": 0.289, + "step": 1985 + }, + { + "epoch": 0.9563594775176067, + "grad_norm": 2.0912065505981445, + "learning_rate": 1.0279713130859514e-06, + "loss": 0.6496, + "step": 1986 + }, + { + "epoch": 0.9568410281105159, + "grad_norm": 0.6427431106567383, + "learning_rate": 1.005782895940055e-06, + "loss": 0.9881, + "step": 1987 + }, + { + "epoch": 0.9573225787034251, + "grad_norm": 0.6821915507316589, + "learning_rate": 9.838353531411272e-07, + "loss": 0.4672, + "step": 1988 + }, + { + "epoch": 0.9578041292963342, + "grad_norm": 0.43331241607666016, + "learning_rate": 9.62128738092294e-07, + "loss": 0.5128, + "step": 1989 + }, + { + "epoch": 0.9582856798892434, + "grad_norm": 0.7938480377197266, + "learning_rate": 9.406631036104508e-07, + "loss": 0.4462, + "step": 1990 + }, + { + "epoch": 0.9587672304821525, + "grad_norm": 0.9452693462371826, + "learning_rate": 9.194385019261287e-07, + "loss": 0.5285, + "step": 1991 + }, + { + "epoch": 0.9592487810750617, + "grad_norm": 0.4936063289642334, + "learning_rate": 8.984549846833612e-07, + "loss": 0.2313, + "step": 1992 + }, + { + "epoch": 0.9597303316679708, + "grad_norm": 0.42992129921913147, + "learning_rate": 8.777126029396065e-07, + "loss": 0.2099, + "step": 1993 + }, + { + "epoch": 0.9602118822608801, + "grad_norm": 0.397684782743454, + "learning_rate": 8.572114071655479e-07, + "loss": 0.2392, + "step": 1994 + }, + { + "epoch": 0.9606934328537892, + "grad_norm": 1.0377631187438965, + "learning_rate": 8.369514472450379e-07, + "loss": 0.4342, + "step": 1995 + }, + { + "epoch": 0.9611749834466984, + "grad_norm": 0.4852606952190399, + "learning_rate": 8.169327724749543e-07, + "loss": 0.4412, + "step": 1996 + }, + { + "epoch": 0.9616565340396075, + "grad_norm": 1.388045310974121, + "learning_rate": 7.971554315650442e-07, + "loss": 0.4204, + "step": 1997 + }, + { + "epoch": 0.9621380846325167, + "grad_norm": 0.5457773208618164, + "learning_rate": 7.776194726378583e-07, + "loss": 0.4762, + "step": 1998 + }, + { + "epoch": 0.9626196352254258, + "grad_norm": 0.8520801067352295, + "learning_rate": 7.583249432286277e-07, + "loss": 0.5644, + "step": 1999 + }, + { + "epoch": 0.963101185818335, + "grad_norm": 1.1888178586959839, + "learning_rate": 7.392718902850981e-07, + "loss": 0.4423, + "step": 2000 + }, + { + "epoch": 0.9635827364112443, + "grad_norm": 0.5341615080833435, + "learning_rate": 7.204603601674853e-07, + "loss": 0.3201, + "step": 2001 + }, + { + "epoch": 0.9640642870041534, + "grad_norm": 1.1105878353118896, + "learning_rate": 7.018903986483083e-07, + "loss": 0.4972, + "step": 2002 + }, + { + "epoch": 0.9645458375970626, + "grad_norm": 0.8204899430274963, + "learning_rate": 6.835620509122897e-07, + "loss": 0.3859, + "step": 2003 + }, + { + "epoch": 0.9650273881899717, + "grad_norm": 0.7533660531044006, + "learning_rate": 6.65475361556267e-07, + "loss": 0.4297, + "step": 2004 + }, + { + "epoch": 0.9655089387828809, + "grad_norm": 0.9280441403388977, + "learning_rate": 6.47630374589081e-07, + "loss": 0.289, + "step": 2005 + }, + { + "epoch": 0.96599048937579, + "grad_norm": 0.5193145275115967, + "learning_rate": 6.300271334314434e-07, + "loss": 0.2499, + "step": 2006 + }, + { + "epoch": 0.9664720399686992, + "grad_norm": 0.6323145031929016, + "learning_rate": 6.126656809158359e-07, + "loss": 0.706, + "step": 2007 + }, + { + "epoch": 0.9669535905616083, + "grad_norm": 0.3609849214553833, + "learning_rate": 5.955460592864337e-07, + "loss": 0.4897, + "step": 2008 + }, + { + "epoch": 0.9674351411545176, + "grad_norm": 1.041792631149292, + "learning_rate": 5.78668310198982e-07, + "loss": 0.5796, + "step": 2009 + }, + { + "epoch": 0.9679166917474267, + "grad_norm": 0.41407084465026855, + "learning_rate": 5.620324747207084e-07, + "loss": 0.1837, + "step": 2010 + }, + { + "epoch": 0.9683982423403359, + "grad_norm": 0.5978147387504578, + "learning_rate": 5.456385933301777e-07, + "loss": 0.2899, + "step": 2011 + }, + { + "epoch": 0.968879792933245, + "grad_norm": 0.8195249438285828, + "learning_rate": 5.294867059172592e-07, + "loss": 0.2686, + "step": 2012 + }, + { + "epoch": 0.9693613435261542, + "grad_norm": 0.37067919969558716, + "learning_rate": 5.135768517829819e-07, + "loss": 0.3171, + "step": 2013 + }, + { + "epoch": 0.9698428941190634, + "grad_norm": 0.5091788172721863, + "learning_rate": 4.979090696394795e-07, + "loss": 0.3966, + "step": 2014 + }, + { + "epoch": 0.9703244447119725, + "grad_norm": 0.6499751210212708, + "learning_rate": 4.824833976098453e-07, + "loss": 0.2391, + "step": 2015 + }, + { + "epoch": 0.9708059953048818, + "grad_norm": 0.7504374384880066, + "learning_rate": 4.6729987322807757e-07, + "loss": 0.5995, + "step": 2016 + }, + { + "epoch": 0.9712875458977909, + "grad_norm": 0.5955843925476074, + "learning_rate": 4.523585334389679e-07, + "loss": 0.663, + "step": 2017 + }, + { + "epoch": 0.9717690964907001, + "grad_norm": 0.7871188521385193, + "learning_rate": 4.3765941459804614e-07, + "loss": 0.5966, + "step": 2018 + }, + { + "epoch": 0.9722506470836092, + "grad_norm": 1.3839247226715088, + "learning_rate": 4.232025524714356e-07, + "loss": 0.3596, + "step": 2019 + }, + { + "epoch": 0.9727321976765184, + "grad_norm": 0.976439893245697, + "learning_rate": 4.0898798223582e-07, + "loss": 0.4091, + "step": 2020 + }, + { + "epoch": 0.9732137482694275, + "grad_norm": 0.42140138149261475, + "learning_rate": 3.950157384783104e-07, + "loss": 0.6924, + "step": 2021 + }, + { + "epoch": 0.9736952988623367, + "grad_norm": 0.4503850042819977, + "learning_rate": 3.8128585519640046e-07, + "loss": 0.3682, + "step": 2022 + }, + { + "epoch": 0.9741768494552459, + "grad_norm": 0.628343939781189, + "learning_rate": 3.677983657978779e-07, + "loss": 0.4995, + "step": 2023 + }, + { + "epoch": 0.9746584000481551, + "grad_norm": 1.997673511505127, + "learning_rate": 3.545533031007131e-07, + "loss": 0.541, + "step": 2024 + }, + { + "epoch": 0.9751399506410642, + "grad_norm": 0.5482580065727234, + "learning_rate": 3.415506993330153e-07, + "loss": 0.2417, + "step": 2025 + }, + { + "epoch": 0.9756215012339734, + "grad_norm": 1.0305861234664917, + "learning_rate": 3.2879058613292105e-07, + "loss": 0.336, + "step": 2026 + }, + { + "epoch": 0.9761030518268826, + "grad_norm": 0.6240251064300537, + "learning_rate": 3.1627299454856095e-07, + "loss": 0.2639, + "step": 2027 + }, + { + "epoch": 0.9765846024197917, + "grad_norm": 1.3697896003723145, + "learning_rate": 3.0399795503793793e-07, + "loss": 0.5073, + "step": 2028 + }, + { + "epoch": 0.9770661530127009, + "grad_norm": 0.6504653096199036, + "learning_rate": 2.9196549746888235e-07, + "loss": 0.2701, + "step": 2029 + }, + { + "epoch": 0.97754770360561, + "grad_norm": 0.5833175182342529, + "learning_rate": 2.801756511189524e-07, + "loss": 0.1475, + "step": 2030 + }, + { + "epoch": 0.9780292541985193, + "grad_norm": 1.1188310384750366, + "learning_rate": 2.686284446754006e-07, + "loss": 0.2546, + "step": 2031 + }, + { + "epoch": 0.9785108047914284, + "grad_norm": 0.6936346888542175, + "learning_rate": 2.573239062350963e-07, + "loss": 0.7926, + "step": 2032 + }, + { + "epoch": 0.9789923553843376, + "grad_norm": 0.772508442401886, + "learning_rate": 2.4626206330440326e-07, + "loss": 0.8223, + "step": 2033 + }, + { + "epoch": 0.9794739059772467, + "grad_norm": 0.6184590458869934, + "learning_rate": 2.3544294279918e-07, + "loss": 0.4059, + "step": 2034 + }, + { + "epoch": 0.9799554565701559, + "grad_norm": 0.43711236119270325, + "learning_rate": 2.2486657104471286e-07, + "loss": 0.6158, + "step": 2035 + }, + { + "epoch": 0.980437007163065, + "grad_norm": 0.5568966269493103, + "learning_rate": 2.1453297377557191e-07, + "loss": 0.3057, + "step": 2036 + }, + { + "epoch": 0.9809185577559743, + "grad_norm": 4.560033798217773, + "learning_rate": 2.044421761356552e-07, + "loss": 0.4007, + "step": 2037 + }, + { + "epoch": 0.9814001083488834, + "grad_norm": 0.5342201590538025, + "learning_rate": 1.9459420267804452e-07, + "loss": 0.7108, + "step": 2038 + }, + { + "epoch": 0.9818816589417926, + "grad_norm": 1.072467565536499, + "learning_rate": 1.8498907736499426e-07, + "loss": 0.5877, + "step": 2039 + }, + { + "epoch": 0.9823632095347017, + "grad_norm": 0.8219976425170898, + "learning_rate": 1.7562682356786487e-07, + "loss": 0.6299, + "step": 2040 + }, + { + "epoch": 0.9828447601276109, + "grad_norm": 0.9927340745925903, + "learning_rate": 1.665074640670228e-07, + "loss": 0.6474, + "step": 2041 + }, + { + "epoch": 0.9833263107205201, + "grad_norm": 0.4996251165866852, + "learning_rate": 1.576310210518517e-07, + "loss": 0.2832, + "step": 2042 + }, + { + "epoch": 0.9838078613134292, + "grad_norm": 1.55043625831604, + "learning_rate": 1.489975161206636e-07, + "loss": 0.4885, + "step": 2043 + }, + { + "epoch": 0.9842894119063385, + "grad_norm": 0.4588821530342102, + "learning_rate": 1.406069702806323e-07, + "loss": 0.3025, + "step": 2044 + }, + { + "epoch": 0.9847709624992476, + "grad_norm": 0.966001570224762, + "learning_rate": 1.324594039477822e-07, + "loss": 0.7202, + "step": 2045 + }, + { + "epoch": 0.9852525130921568, + "grad_norm": 0.9288619160652161, + "learning_rate": 1.2455483694689962e-07, + "loss": 0.7488, + "step": 2046 + }, + { + "epoch": 0.9857340636850659, + "grad_norm": 0.6214163303375244, + "learning_rate": 1.1689328851151038e-07, + "loss": 0.6782, + "step": 2047 + }, + { + "epoch": 0.9862156142779751, + "grad_norm": 0.5284736752510071, + "learning_rate": 1.0947477728381339e-07, + "loss": 0.7494, + "step": 2048 + }, + { + "epoch": 0.9866971648708842, + "grad_norm": 1.5116268396377563, + "learning_rate": 1.0229932131465836e-07, + "loss": 0.5377, + "step": 2049 + }, + { + "epoch": 0.9871787154637934, + "grad_norm": 1.3248600959777832, + "learning_rate": 9.536693806347919e-08, + "loss": 0.9369, + "step": 2050 + }, + { + "epoch": 0.9876602660567025, + "grad_norm": 0.8132151365280151, + "learning_rate": 8.867764439826065e-08, + "loss": 0.3722, + "step": 2051 + }, + { + "epoch": 0.9881418166496118, + "grad_norm": 0.3936309218406677, + "learning_rate": 8.223145659550513e-08, + "loss": 0.6441, + "step": 2052 + }, + { + "epoch": 0.9886233672425209, + "grad_norm": 0.7746490836143494, + "learning_rate": 7.602839034017706e-08, + "loss": 0.4787, + "step": 2053 + }, + { + "epoch": 0.9891049178354301, + "grad_norm": 0.9102803468704224, + "learning_rate": 7.006846072568074e-08, + "loss": 0.249, + "step": 2054 + }, + { + "epoch": 0.9895864684283393, + "grad_norm": 0.7294951677322388, + "learning_rate": 6.435168225381594e-08, + "loss": 0.2534, + "step": 2055 + }, + { + "epoch": 0.9900680190212484, + "grad_norm": 0.3461167812347412, + "learning_rate": 5.887806883474456e-08, + "loss": 0.2111, + "step": 2056 + }, + { + "epoch": 0.9905495696141576, + "grad_norm": 0.5097566246986389, + "learning_rate": 5.364763378694626e-08, + "loss": 0.3375, + "step": 2057 + }, + { + "epoch": 0.9910311202070667, + "grad_norm": 0.808583676815033, + "learning_rate": 4.8660389837207334e-08, + "loss": 0.6052, + "step": 2058 + }, + { + "epoch": 0.991512670799976, + "grad_norm": 0.4117794930934906, + "learning_rate": 4.391634912056519e-08, + "loss": 0.6487, + "step": 2059 + }, + { + "epoch": 0.9919942213928851, + "grad_norm": 0.8955495357513428, + "learning_rate": 3.9415523180297286e-08, + "loss": 0.3302, + "step": 2060 + }, + { + "epoch": 0.9924757719857943, + "grad_norm": 0.40873655676841736, + "learning_rate": 3.515792296789888e-08, + "loss": 0.0989, + "step": 2061 + }, + { + "epoch": 0.9929573225787034, + "grad_norm": 2.9391791820526123, + "learning_rate": 3.114355884301645e-08, + "loss": 0.5477, + "step": 2062 + }, + { + "epoch": 0.9934388731716126, + "grad_norm": 1.1457031965255737, + "learning_rate": 2.7372440573469883e-08, + "loss": 0.4869, + "step": 2063 + }, + { + "epoch": 0.9939204237645217, + "grad_norm": 0.5706020593643188, + "learning_rate": 2.384457733520806e-08, + "loss": 0.2756, + "step": 2064 + }, + { + "epoch": 0.9944019743574309, + "grad_norm": 1.2542824745178223, + "learning_rate": 2.0559977712297785e-08, + "loss": 0.4387, + "step": 2065 + }, + { + "epoch": 0.99488352495034, + "grad_norm": 0.7238984107971191, + "learning_rate": 1.7518649696857126e-08, + "loss": 0.5792, + "step": 2066 + }, + { + "epoch": 0.9953650755432493, + "grad_norm": 1.1104798316955566, + "learning_rate": 1.4720600689110963e-08, + "loss": 0.6109, + "step": 2067 + }, + { + "epoch": 0.9958466261361584, + "grad_norm": 0.6000664830207825, + "learning_rate": 1.216583749731326e-08, + "loss": 0.2634, + "step": 2068 + }, + { + "epoch": 0.9963281767290676, + "grad_norm": 0.6709604263305664, + "learning_rate": 9.854366337758159e-09, + "loss": 0.7701, + "step": 2069 + }, + { + "epoch": 0.9968097273219768, + "grad_norm": 0.7684647440910339, + "learning_rate": 7.786192834746686e-09, + "loss": 0.8132, + "step": 2070 + }, + { + "epoch": 0.9972912779148859, + "grad_norm": 0.5636360049247742, + "learning_rate": 5.961322020608951e-09, + "loss": 0.2347, + "step": 2071 + }, + { + "epoch": 0.9977728285077951, + "grad_norm": 0.16398949921131134, + "learning_rate": 4.3797583356264275e-09, + "loss": 0.0772, + "step": 2072 + }, + { + "epoch": 0.9982543791007042, + "grad_norm": 0.594808042049408, + "learning_rate": 3.0415056281096755e-09, + "loss": 0.2629, + "step": 2073 + }, + { + "epoch": 0.9987359296936135, + "grad_norm": 1.039145827293396, + "learning_rate": 1.9465671543095197e-09, + "loss": 1.0005, + "step": 2074 + }, + { + "epoch": 0.9992174802865226, + "grad_norm": 0.5652502775192261, + "learning_rate": 1.094945578439255e-09, + "loss": 0.5995, + "step": 2075 + }, + { + "epoch": 0.9996990308794318, + "grad_norm": 0.9868450164794922, + "learning_rate": 4.866429726857469e-10, + "loss": 0.6929, + "step": 2076 + }, + { + "epoch": 1.0, + "grad_norm": 0.5426987409591675, + "learning_rate": 1.2166081717612797e-10, + "loss": 0.5192, + "step": 2077 + }, + { + "epoch": 1.0, + "step": 2077, + "total_flos": 6.718855893014938e+17, + "train_loss": 0.5155493662435674, + "train_runtime": 5780.5031, + "train_samples_per_second": 5.748, + "train_steps_per_second": 0.359 + } + ], + "logging_steps": 1, + "max_steps": 2077, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2400000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.718855893014938e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}