diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.993342210386152, + "eval_steps": 500, + "global_step": 1125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002663115845539281, + "grad_norm": 59.669442519158444, + "learning_rate": 4.424778761061947e-07, + "loss": 11.0815, + "step": 1 + }, + { + "epoch": 0.005326231691078562, + "grad_norm": 59.77300379138749, + "learning_rate": 8.849557522123894e-07, + "loss": 11.0703, + "step": 2 + }, + { + "epoch": 0.007989347536617843, + "grad_norm": 59.37811338851668, + "learning_rate": 1.3274336283185841e-06, + "loss": 11.1149, + "step": 3 + }, + { + "epoch": 0.010652463382157125, + "grad_norm": 59.714257927262075, + "learning_rate": 1.7699115044247788e-06, + "loss": 11.1, + "step": 4 + }, + { + "epoch": 0.013315579227696404, + "grad_norm": 62.19325541849273, + "learning_rate": 2.2123893805309734e-06, + "loss": 10.9008, + "step": 5 + }, + { + "epoch": 0.015978695073235686, + "grad_norm": 64.3469313247898, + "learning_rate": 2.6548672566371683e-06, + "loss": 10.7897, + "step": 6 + }, + { + "epoch": 0.018641810918774968, + "grad_norm": 64.70693307946331, + "learning_rate": 3.097345132743363e-06, + "loss": 10.6244, + "step": 7 + }, + { + "epoch": 0.02130492676431425, + "grad_norm": 100.07904925734698, + "learning_rate": 3.5398230088495575e-06, + "loss": 9.3505, + "step": 8 + }, + { + "epoch": 0.023968042609853527, + "grad_norm": 121.42213770896274, + "learning_rate": 3.982300884955752e-06, + "loss": 8.5961, + "step": 9 + }, + { + "epoch": 0.02663115845539281, + "grad_norm": 64.96997432704501, + "learning_rate": 4.424778761061947e-06, + "loss": 3.5386, + "step": 10 + }, + { + "epoch": 0.02929427430093209, + "grad_norm": 53.5067123571589, + "learning_rate": 4.867256637168142e-06, + "loss": 3.1169, + "step": 11 + }, + { + "epoch": 0.03195739014647137, + "grad_norm": 34.28454533456946, + "learning_rate": 5.3097345132743365e-06, + "loss": 2.3171, + "step": 12 + }, + { + "epoch": 0.03462050599201065, + "grad_norm": 28.02284592011359, + "learning_rate": 5.752212389380531e-06, + "loss": 2.1704, + "step": 13 + }, + { + "epoch": 0.037283621837549935, + "grad_norm": 6.230233716943746, + "learning_rate": 6.194690265486726e-06, + "loss": 1.3702, + "step": 14 + }, + { + "epoch": 0.03994673768308921, + "grad_norm": 4.8265444090252325, + "learning_rate": 6.6371681415929215e-06, + "loss": 1.2994, + "step": 15 + }, + { + "epoch": 0.0426098535286285, + "grad_norm": 3.4989649353882544, + "learning_rate": 7.079646017699115e-06, + "loss": 1.1939, + "step": 16 + }, + { + "epoch": 0.045272969374167776, + "grad_norm": 2.548022240081304, + "learning_rate": 7.52212389380531e-06, + "loss": 1.1113, + "step": 17 + }, + { + "epoch": 0.047936085219707054, + "grad_norm": 1.7785073197319812, + "learning_rate": 7.964601769911505e-06, + "loss": 1.0099, + "step": 18 + }, + { + "epoch": 0.05059920106524634, + "grad_norm": 52.43472197468591, + "learning_rate": 8.407079646017701e-06, + "loss": 1.0002, + "step": 19 + }, + { + "epoch": 0.05326231691078562, + "grad_norm": 18.71256882921437, + "learning_rate": 8.849557522123894e-06, + "loss": 0.9335, + "step": 20 + }, + { + "epoch": 0.0559254327563249, + "grad_norm": 1.6748381666125123, + "learning_rate": 9.29203539823009e-06, + "loss": 0.8897, + "step": 21 + }, + { + "epoch": 0.05858854860186418, + "grad_norm": 1.2119772296620004, + "learning_rate": 9.734513274336284e-06, + "loss": 0.8728, + "step": 22 + }, + { + "epoch": 0.06125166444740346, + "grad_norm": 0.9292233025769583, + "learning_rate": 1.0176991150442479e-05, + "loss": 0.8443, + "step": 23 + }, + { + "epoch": 0.06391478029294274, + "grad_norm": 0.8058222924733704, + "learning_rate": 1.0619469026548673e-05, + "loss": 0.8065, + "step": 24 + }, + { + "epoch": 0.06657789613848203, + "grad_norm": 0.7676888976773729, + "learning_rate": 1.1061946902654869e-05, + "loss": 0.744, + "step": 25 + }, + { + "epoch": 0.0692410119840213, + "grad_norm": 1.1442962246712427, + "learning_rate": 1.1504424778761062e-05, + "loss": 0.7962, + "step": 26 + }, + { + "epoch": 0.07190412782956059, + "grad_norm": 0.8086732801653846, + "learning_rate": 1.1946902654867258e-05, + "loss": 0.7546, + "step": 27 + }, + { + "epoch": 0.07456724367509987, + "grad_norm": 0.6032687314644429, + "learning_rate": 1.2389380530973452e-05, + "loss": 0.6961, + "step": 28 + }, + { + "epoch": 0.07723035952063914, + "grad_norm": 0.8050008569135423, + "learning_rate": 1.2831858407079647e-05, + "loss": 0.7181, + "step": 29 + }, + { + "epoch": 0.07989347536617843, + "grad_norm": 0.7760170053857292, + "learning_rate": 1.3274336283185843e-05, + "loss": 0.7011, + "step": 30 + }, + { + "epoch": 0.08255659121171771, + "grad_norm": 0.6911853454916363, + "learning_rate": 1.3716814159292036e-05, + "loss": 0.6767, + "step": 31 + }, + { + "epoch": 0.085219707057257, + "grad_norm": 0.5690990372888421, + "learning_rate": 1.415929203539823e-05, + "loss": 0.6657, + "step": 32 + }, + { + "epoch": 0.08788282290279627, + "grad_norm": 0.46539236587043925, + "learning_rate": 1.4601769911504426e-05, + "loss": 0.6585, + "step": 33 + }, + { + "epoch": 0.09054593874833555, + "grad_norm": 0.6011651474231043, + "learning_rate": 1.504424778761062e-05, + "loss": 0.6571, + "step": 34 + }, + { + "epoch": 0.09320905459387484, + "grad_norm": 0.6055438783984222, + "learning_rate": 1.5486725663716813e-05, + "loss": 0.6307, + "step": 35 + }, + { + "epoch": 0.09587217043941411, + "grad_norm": 0.4930140407791457, + "learning_rate": 1.592920353982301e-05, + "loss": 0.638, + "step": 36 + }, + { + "epoch": 0.0985352862849534, + "grad_norm": 0.38727032176053555, + "learning_rate": 1.6371681415929206e-05, + "loss": 0.6189, + "step": 37 + }, + { + "epoch": 0.10119840213049268, + "grad_norm": 0.46992360907642716, + "learning_rate": 1.6814159292035402e-05, + "loss": 0.6242, + "step": 38 + }, + { + "epoch": 0.10386151797603196, + "grad_norm": 0.5002104790615647, + "learning_rate": 1.7256637168141594e-05, + "loss": 0.6087, + "step": 39 + }, + { + "epoch": 0.10652463382157124, + "grad_norm": 0.4378982855259104, + "learning_rate": 1.7699115044247787e-05, + "loss": 0.6112, + "step": 40 + }, + { + "epoch": 0.10918774966711052, + "grad_norm": 0.343549106950523, + "learning_rate": 1.8141592920353983e-05, + "loss": 0.6251, + "step": 41 + }, + { + "epoch": 0.1118508655126498, + "grad_norm": 0.43140422077824325, + "learning_rate": 1.858407079646018e-05, + "loss": 0.625, + "step": 42 + }, + { + "epoch": 0.11451398135818908, + "grad_norm": 0.44945895418028914, + "learning_rate": 1.9026548672566372e-05, + "loss": 0.576, + "step": 43 + }, + { + "epoch": 0.11717709720372836, + "grad_norm": 0.33640715838659224, + "learning_rate": 1.946902654867257e-05, + "loss": 0.602, + "step": 44 + }, + { + "epoch": 0.11984021304926765, + "grad_norm": 0.3602083165810118, + "learning_rate": 1.991150442477876e-05, + "loss": 0.5707, + "step": 45 + }, + { + "epoch": 0.12250332889480692, + "grad_norm": 1.7341245223857158, + "learning_rate": 2.0353982300884957e-05, + "loss": 0.5662, + "step": 46 + }, + { + "epoch": 0.12516644474034622, + "grad_norm": 0.42320706053839496, + "learning_rate": 2.079646017699115e-05, + "loss": 0.5718, + "step": 47 + }, + { + "epoch": 0.1278295605858855, + "grad_norm": 0.34356067841011745, + "learning_rate": 2.1238938053097346e-05, + "loss": 0.5652, + "step": 48 + }, + { + "epoch": 0.13049267643142476, + "grad_norm": 0.37607875054105366, + "learning_rate": 2.1681415929203542e-05, + "loss": 0.6079, + "step": 49 + }, + { + "epoch": 0.13315579227696406, + "grad_norm": 0.355877489349339, + "learning_rate": 2.2123893805309738e-05, + "loss": 0.5414, + "step": 50 + }, + { + "epoch": 0.13581890812250333, + "grad_norm": 0.3531413648567738, + "learning_rate": 2.2566371681415928e-05, + "loss": 0.5383, + "step": 51 + }, + { + "epoch": 0.1384820239680426, + "grad_norm": 0.3900867327584249, + "learning_rate": 2.3008849557522124e-05, + "loss": 0.5607, + "step": 52 + }, + { + "epoch": 0.1411451398135819, + "grad_norm": 0.29096561379999103, + "learning_rate": 2.345132743362832e-05, + "loss": 0.5428, + "step": 53 + }, + { + "epoch": 0.14380825565912117, + "grad_norm": 0.34882597172967983, + "learning_rate": 2.3893805309734516e-05, + "loss": 0.5597, + "step": 54 + }, + { + "epoch": 0.14647137150466044, + "grad_norm": 0.31745047102841745, + "learning_rate": 2.433628318584071e-05, + "loss": 0.5427, + "step": 55 + }, + { + "epoch": 0.14913448735019974, + "grad_norm": 0.3429464925874952, + "learning_rate": 2.4778761061946905e-05, + "loss": 0.5418, + "step": 56 + }, + { + "epoch": 0.151797603195739, + "grad_norm": 0.28154789184935636, + "learning_rate": 2.5221238938053098e-05, + "loss": 0.5701, + "step": 57 + }, + { + "epoch": 0.15446071904127828, + "grad_norm": 0.3141148216942468, + "learning_rate": 2.5663716814159294e-05, + "loss": 0.5279, + "step": 58 + }, + { + "epoch": 0.15712383488681758, + "grad_norm": 0.3077683025338142, + "learning_rate": 2.610619469026549e-05, + "loss": 0.5443, + "step": 59 + }, + { + "epoch": 0.15978695073235685, + "grad_norm": 0.35329472069062134, + "learning_rate": 2.6548672566371686e-05, + "loss": 0.5657, + "step": 60 + }, + { + "epoch": 0.16245006657789615, + "grad_norm": 0.30082869981695665, + "learning_rate": 2.6991150442477875e-05, + "loss": 0.5386, + "step": 61 + }, + { + "epoch": 0.16511318242343542, + "grad_norm": 0.3705381333041911, + "learning_rate": 2.743362831858407e-05, + "loss": 0.5417, + "step": 62 + }, + { + "epoch": 0.1677762982689747, + "grad_norm": 0.3424625742113855, + "learning_rate": 2.7876106194690264e-05, + "loss": 0.5334, + "step": 63 + }, + { + "epoch": 0.170439414114514, + "grad_norm": 0.2904098798351202, + "learning_rate": 2.831858407079646e-05, + "loss": 0.5424, + "step": 64 + }, + { + "epoch": 0.17310252996005326, + "grad_norm": 0.32851572085926894, + "learning_rate": 2.8761061946902656e-05, + "loss": 0.5231, + "step": 65 + }, + { + "epoch": 0.17576564580559254, + "grad_norm": 0.29034784648982725, + "learning_rate": 2.9203539823008852e-05, + "loss": 0.5394, + "step": 66 + }, + { + "epoch": 0.17842876165113183, + "grad_norm": 0.33213549417249844, + "learning_rate": 2.964601769911505e-05, + "loss": 0.54, + "step": 67 + }, + { + "epoch": 0.1810918774966711, + "grad_norm": 0.2751631826164567, + "learning_rate": 3.008849557522124e-05, + "loss": 0.5254, + "step": 68 + }, + { + "epoch": 0.18375499334221038, + "grad_norm": 0.3037009657021324, + "learning_rate": 3.0530973451327434e-05, + "loss": 0.5216, + "step": 69 + }, + { + "epoch": 0.18641810918774968, + "grad_norm": 0.30105360826964594, + "learning_rate": 3.097345132743363e-05, + "loss": 0.5111, + "step": 70 + }, + { + "epoch": 0.18908122503328895, + "grad_norm": 0.3202863693523833, + "learning_rate": 3.1415929203539826e-05, + "loss": 0.537, + "step": 71 + }, + { + "epoch": 0.19174434087882822, + "grad_norm": 0.3294366280935238, + "learning_rate": 3.185840707964602e-05, + "loss": 0.5215, + "step": 72 + }, + { + "epoch": 0.19440745672436752, + "grad_norm": 0.32228297514585236, + "learning_rate": 3.230088495575221e-05, + "loss": 0.536, + "step": 73 + }, + { + "epoch": 0.1970705725699068, + "grad_norm": 0.31224977631197853, + "learning_rate": 3.274336283185841e-05, + "loss": 0.5133, + "step": 74 + }, + { + "epoch": 0.19973368841544606, + "grad_norm": 0.34249789697496347, + "learning_rate": 3.3185840707964604e-05, + "loss": 0.5187, + "step": 75 + }, + { + "epoch": 0.20239680426098536, + "grad_norm": 0.3014674455677291, + "learning_rate": 3.3628318584070804e-05, + "loss": 0.5173, + "step": 76 + }, + { + "epoch": 0.20505992010652463, + "grad_norm": 0.31181209074311145, + "learning_rate": 3.407079646017699e-05, + "loss": 0.4938, + "step": 77 + }, + { + "epoch": 0.20772303595206393, + "grad_norm": 0.3421599429123891, + "learning_rate": 3.451327433628319e-05, + "loss": 0.5178, + "step": 78 + }, + { + "epoch": 0.2103861517976032, + "grad_norm": 0.32144698779599035, + "learning_rate": 3.495575221238938e-05, + "loss": 0.529, + "step": 79 + }, + { + "epoch": 0.21304926764314247, + "grad_norm": 0.30829102288383803, + "learning_rate": 3.5398230088495574e-05, + "loss": 0.5045, + "step": 80 + }, + { + "epoch": 0.21571238348868177, + "grad_norm": 0.3320673147021741, + "learning_rate": 3.5840707964601774e-05, + "loss": 0.5193, + "step": 81 + }, + { + "epoch": 0.21837549933422104, + "grad_norm": 0.3257493459194373, + "learning_rate": 3.628318584070797e-05, + "loss": 0.5161, + "step": 82 + }, + { + "epoch": 0.2210386151797603, + "grad_norm": 0.3451069209364067, + "learning_rate": 3.672566371681416e-05, + "loss": 0.4902, + "step": 83 + }, + { + "epoch": 0.2237017310252996, + "grad_norm": 0.38062902785170477, + "learning_rate": 3.716814159292036e-05, + "loss": 0.5106, + "step": 84 + }, + { + "epoch": 0.22636484687083888, + "grad_norm": 0.3437845837066077, + "learning_rate": 3.7610619469026545e-05, + "loss": 0.5072, + "step": 85 + }, + { + "epoch": 0.22902796271637815, + "grad_norm": 0.4369801740657791, + "learning_rate": 3.8053097345132744e-05, + "loss": 0.5016, + "step": 86 + }, + { + "epoch": 0.23169107856191745, + "grad_norm": 0.39323367167161793, + "learning_rate": 3.849557522123894e-05, + "loss": 0.5126, + "step": 87 + }, + { + "epoch": 0.23435419440745672, + "grad_norm": 0.3804923058106557, + "learning_rate": 3.893805309734514e-05, + "loss": 0.5169, + "step": 88 + }, + { + "epoch": 0.237017310252996, + "grad_norm": 0.3991475997522414, + "learning_rate": 3.938053097345133e-05, + "loss": 0.5206, + "step": 89 + }, + { + "epoch": 0.2396804260985353, + "grad_norm": 0.3345983998430803, + "learning_rate": 3.982300884955752e-05, + "loss": 0.5126, + "step": 90 + }, + { + "epoch": 0.24234354194407456, + "grad_norm": 0.37605023011424904, + "learning_rate": 4.026548672566372e-05, + "loss": 0.517, + "step": 91 + }, + { + "epoch": 0.24500665778961384, + "grad_norm": 0.30015095297467786, + "learning_rate": 4.0707964601769914e-05, + "loss": 0.5146, + "step": 92 + }, + { + "epoch": 0.24766977363515313, + "grad_norm": 0.37615535541775885, + "learning_rate": 4.115044247787611e-05, + "loss": 0.4897, + "step": 93 + }, + { + "epoch": 0.25033288948069243, + "grad_norm": 0.32506469165922075, + "learning_rate": 4.15929203539823e-05, + "loss": 0.5033, + "step": 94 + }, + { + "epoch": 0.2529960053262317, + "grad_norm": 0.3955130401533768, + "learning_rate": 4.20353982300885e-05, + "loss": 0.517, + "step": 95 + }, + { + "epoch": 0.255659121171771, + "grad_norm": 0.38256193351931217, + "learning_rate": 4.247787610619469e-05, + "loss": 0.4903, + "step": 96 + }, + { + "epoch": 0.2583222370173103, + "grad_norm": 0.3757931359073768, + "learning_rate": 4.2920353982300885e-05, + "loss": 0.4881, + "step": 97 + }, + { + "epoch": 0.2609853528628495, + "grad_norm": 0.4073525724085135, + "learning_rate": 4.3362831858407084e-05, + "loss": 0.4981, + "step": 98 + }, + { + "epoch": 0.2636484687083888, + "grad_norm": 0.42226304140119747, + "learning_rate": 4.380530973451328e-05, + "loss": 0.4777, + "step": 99 + }, + { + "epoch": 0.2663115845539281, + "grad_norm": 0.47546631243940135, + "learning_rate": 4.4247787610619477e-05, + "loss": 0.5012, + "step": 100 + }, + { + "epoch": 0.26897470039946736, + "grad_norm": 0.38067024978966585, + "learning_rate": 4.469026548672566e-05, + "loss": 0.5038, + "step": 101 + }, + { + "epoch": 0.27163781624500666, + "grad_norm": 0.3549335612107799, + "learning_rate": 4.5132743362831855e-05, + "loss": 0.5046, + "step": 102 + }, + { + "epoch": 0.27430093209054596, + "grad_norm": 0.4081532806299182, + "learning_rate": 4.5575221238938055e-05, + "loss": 0.4816, + "step": 103 + }, + { + "epoch": 0.2769640479360852, + "grad_norm": 0.35702973975911423, + "learning_rate": 4.601769911504425e-05, + "loss": 0.4969, + "step": 104 + }, + { + "epoch": 0.2796271637816245, + "grad_norm": 0.3750952303695297, + "learning_rate": 4.646017699115045e-05, + "loss": 0.5129, + "step": 105 + }, + { + "epoch": 0.2822902796271638, + "grad_norm": 0.3713537523929101, + "learning_rate": 4.690265486725664e-05, + "loss": 0.4871, + "step": 106 + }, + { + "epoch": 0.28495339547270304, + "grad_norm": 0.47534354342607993, + "learning_rate": 4.734513274336283e-05, + "loss": 0.4971, + "step": 107 + }, + { + "epoch": 0.28761651131824234, + "grad_norm": 0.41826478296211245, + "learning_rate": 4.778761061946903e-05, + "loss": 0.4943, + "step": 108 + }, + { + "epoch": 0.29027962716378164, + "grad_norm": 0.39759514237849775, + "learning_rate": 4.823008849557522e-05, + "loss": 0.5014, + "step": 109 + }, + { + "epoch": 0.2929427430093209, + "grad_norm": 0.4548008624547614, + "learning_rate": 4.867256637168142e-05, + "loss": 0.5067, + "step": 110 + }, + { + "epoch": 0.2956058588548602, + "grad_norm": 0.4618812739465874, + "learning_rate": 4.911504424778761e-05, + "loss": 0.487, + "step": 111 + }, + { + "epoch": 0.2982689747003995, + "grad_norm": 0.31165613667101594, + "learning_rate": 4.955752212389381e-05, + "loss": 0.4908, + "step": 112 + }, + { + "epoch": 0.3009320905459387, + "grad_norm": 0.45735168765249185, + "learning_rate": 5e-05, + "loss": 0.4924, + "step": 113 + }, + { + "epoch": 0.303595206391478, + "grad_norm": 0.4659242945372524, + "learning_rate": 4.9950592885375493e-05, + "loss": 0.49, + "step": 114 + }, + { + "epoch": 0.3062583222370173, + "grad_norm": 0.3422222311667708, + "learning_rate": 4.990118577075099e-05, + "loss": 0.4902, + "step": 115 + }, + { + "epoch": 0.30892143808255657, + "grad_norm": 0.5702864889691999, + "learning_rate": 4.985177865612648e-05, + "loss": 0.4712, + "step": 116 + }, + { + "epoch": 0.31158455392809586, + "grad_norm": 0.31000398399919754, + "learning_rate": 4.980237154150198e-05, + "loss": 0.4729, + "step": 117 + }, + { + "epoch": 0.31424766977363516, + "grad_norm": 0.5329093367544124, + "learning_rate": 4.975296442687747e-05, + "loss": 0.4979, + "step": 118 + }, + { + "epoch": 0.3169107856191744, + "grad_norm": 0.41581595613618844, + "learning_rate": 4.970355731225297e-05, + "loss": 0.4979, + "step": 119 + }, + { + "epoch": 0.3195739014647137, + "grad_norm": 0.5898871183617019, + "learning_rate": 4.965415019762846e-05, + "loss": 0.4841, + "step": 120 + }, + { + "epoch": 0.322237017310253, + "grad_norm": 0.5277745967026336, + "learning_rate": 4.960474308300396e-05, + "loss": 0.494, + "step": 121 + }, + { + "epoch": 0.3249001331557923, + "grad_norm": 0.6707049603761084, + "learning_rate": 4.955533596837945e-05, + "loss": 0.4816, + "step": 122 + }, + { + "epoch": 0.32756324900133155, + "grad_norm": 0.39379278723705347, + "learning_rate": 4.950592885375494e-05, + "loss": 0.4708, + "step": 123 + }, + { + "epoch": 0.33022636484687085, + "grad_norm": 0.5682660745624962, + "learning_rate": 4.945652173913044e-05, + "loss": 0.4844, + "step": 124 + }, + { + "epoch": 0.33288948069241014, + "grad_norm": 0.4164160620027728, + "learning_rate": 4.940711462450593e-05, + "loss": 0.4577, + "step": 125 + }, + { + "epoch": 0.3355525965379494, + "grad_norm": 0.5359420179155978, + "learning_rate": 4.9357707509881426e-05, + "loss": 0.4723, + "step": 126 + }, + { + "epoch": 0.3382157123834887, + "grad_norm": 0.5026386563312899, + "learning_rate": 4.930830039525692e-05, + "loss": 0.4706, + "step": 127 + }, + { + "epoch": 0.340878828229028, + "grad_norm": 0.5189502106027113, + "learning_rate": 4.9258893280632415e-05, + "loss": 0.4814, + "step": 128 + }, + { + "epoch": 0.34354194407456723, + "grad_norm": 0.46462849504368775, + "learning_rate": 4.9209486166007906e-05, + "loss": 0.4735, + "step": 129 + }, + { + "epoch": 0.34620505992010653, + "grad_norm": 0.5495458064144569, + "learning_rate": 4.9160079051383404e-05, + "loss": 0.4964, + "step": 130 + }, + { + "epoch": 0.3488681757656458, + "grad_norm": 0.4136354389486864, + "learning_rate": 4.9110671936758895e-05, + "loss": 0.4937, + "step": 131 + }, + { + "epoch": 0.35153129161118507, + "grad_norm": 0.49819742888588847, + "learning_rate": 4.906126482213439e-05, + "loss": 0.4929, + "step": 132 + }, + { + "epoch": 0.35419440745672437, + "grad_norm": 0.5211986557669676, + "learning_rate": 4.901185770750988e-05, + "loss": 0.4722, + "step": 133 + }, + { + "epoch": 0.35685752330226367, + "grad_norm": 0.3743611868649684, + "learning_rate": 4.896245059288538e-05, + "loss": 0.4852, + "step": 134 + }, + { + "epoch": 0.3595206391478029, + "grad_norm": 0.47244102498767254, + "learning_rate": 4.891304347826087e-05, + "loss": 0.4846, + "step": 135 + }, + { + "epoch": 0.3621837549933422, + "grad_norm": 0.39536123377896054, + "learning_rate": 4.886363636363637e-05, + "loss": 0.4812, + "step": 136 + }, + { + "epoch": 0.3648468708388815, + "grad_norm": 0.39389579963168014, + "learning_rate": 4.881422924901186e-05, + "loss": 0.4814, + "step": 137 + }, + { + "epoch": 0.36750998668442075, + "grad_norm": 0.5517767967854046, + "learning_rate": 4.876482213438736e-05, + "loss": 0.4605, + "step": 138 + }, + { + "epoch": 0.37017310252996005, + "grad_norm": 0.3371092349408584, + "learning_rate": 4.871541501976285e-05, + "loss": 0.4919, + "step": 139 + }, + { + "epoch": 0.37283621837549935, + "grad_norm": 0.5454997328166629, + "learning_rate": 4.866600790513835e-05, + "loss": 0.478, + "step": 140 + }, + { + "epoch": 0.3754993342210386, + "grad_norm": 0.38191662974594565, + "learning_rate": 4.861660079051384e-05, + "loss": 0.4675, + "step": 141 + }, + { + "epoch": 0.3781624500665779, + "grad_norm": 0.44622867680541506, + "learning_rate": 4.8567193675889336e-05, + "loss": 0.4767, + "step": 142 + }, + { + "epoch": 0.3808255659121172, + "grad_norm": 0.40615171610446554, + "learning_rate": 4.851778656126482e-05, + "loss": 0.4796, + "step": 143 + }, + { + "epoch": 0.38348868175765644, + "grad_norm": 0.4067512139515564, + "learning_rate": 4.846837944664032e-05, + "loss": 0.4921, + "step": 144 + }, + { + "epoch": 0.38615179760319573, + "grad_norm": 0.3764557796844728, + "learning_rate": 4.841897233201581e-05, + "loss": 0.4859, + "step": 145 + }, + { + "epoch": 0.38881491344873503, + "grad_norm": 0.4154794205261891, + "learning_rate": 4.836956521739131e-05, + "loss": 0.4673, + "step": 146 + }, + { + "epoch": 0.3914780292942743, + "grad_norm": 0.4269745611686079, + "learning_rate": 4.83201581027668e-05, + "loss": 0.4551, + "step": 147 + }, + { + "epoch": 0.3941411451398136, + "grad_norm": 0.38377387438781274, + "learning_rate": 4.8270750988142296e-05, + "loss": 0.487, + "step": 148 + }, + { + "epoch": 0.3968042609853529, + "grad_norm": 0.5603533831020405, + "learning_rate": 4.822134387351779e-05, + "loss": 0.4849, + "step": 149 + }, + { + "epoch": 0.3994673768308921, + "grad_norm": 0.3973953941114295, + "learning_rate": 4.8171936758893284e-05, + "loss": 0.4776, + "step": 150 + }, + { + "epoch": 0.4021304926764314, + "grad_norm": 0.4956339650363368, + "learning_rate": 4.8122529644268775e-05, + "loss": 0.4588, + "step": 151 + }, + { + "epoch": 0.4047936085219707, + "grad_norm": 0.38460346615021695, + "learning_rate": 4.807312252964427e-05, + "loss": 0.4737, + "step": 152 + }, + { + "epoch": 0.40745672436750996, + "grad_norm": 0.5226991882164052, + "learning_rate": 4.8023715415019764e-05, + "loss": 0.4827, + "step": 153 + }, + { + "epoch": 0.41011984021304926, + "grad_norm": 0.3418933085513387, + "learning_rate": 4.797430830039526e-05, + "loss": 0.4594, + "step": 154 + }, + { + "epoch": 0.41278295605858856, + "grad_norm": 0.41779277140490917, + "learning_rate": 4.792490118577075e-05, + "loss": 0.4738, + "step": 155 + }, + { + "epoch": 0.41544607190412786, + "grad_norm": 0.40524225841023903, + "learning_rate": 4.787549407114625e-05, + "loss": 0.4725, + "step": 156 + }, + { + "epoch": 0.4181091877496671, + "grad_norm": 0.37804713363928255, + "learning_rate": 4.782608695652174e-05, + "loss": 0.476, + "step": 157 + }, + { + "epoch": 0.4207723035952064, + "grad_norm": 0.32987544007452513, + "learning_rate": 4.777667984189724e-05, + "loss": 0.4606, + "step": 158 + }, + { + "epoch": 0.4234354194407457, + "grad_norm": 0.32638522089295396, + "learning_rate": 4.772727272727273e-05, + "loss": 0.4796, + "step": 159 + }, + { + "epoch": 0.42609853528628494, + "grad_norm": 0.3653611962183669, + "learning_rate": 4.767786561264823e-05, + "loss": 0.4703, + "step": 160 + }, + { + "epoch": 0.42876165113182424, + "grad_norm": 0.39387144328442575, + "learning_rate": 4.762845849802372e-05, + "loss": 0.4821, + "step": 161 + }, + { + "epoch": 0.43142476697736354, + "grad_norm": 0.473795283228247, + "learning_rate": 4.757905138339921e-05, + "loss": 0.4638, + "step": 162 + }, + { + "epoch": 0.4340878828229028, + "grad_norm": 0.33040966306125785, + "learning_rate": 4.75296442687747e-05, + "loss": 0.4734, + "step": 163 + }, + { + "epoch": 0.4367509986684421, + "grad_norm": 0.42723446550700767, + "learning_rate": 4.74802371541502e-05, + "loss": 0.4809, + "step": 164 + }, + { + "epoch": 0.4394141145139814, + "grad_norm": 0.3675475725903659, + "learning_rate": 4.743083003952569e-05, + "loss": 0.4586, + "step": 165 + }, + { + "epoch": 0.4420772303595206, + "grad_norm": 0.4219979464151687, + "learning_rate": 4.738142292490119e-05, + "loss": 0.4678, + "step": 166 + }, + { + "epoch": 0.4447403462050599, + "grad_norm": 0.3857740050906692, + "learning_rate": 4.733201581027668e-05, + "loss": 0.4633, + "step": 167 + }, + { + "epoch": 0.4474034620505992, + "grad_norm": 0.365686963876862, + "learning_rate": 4.7282608695652177e-05, + "loss": 0.4712, + "step": 168 + }, + { + "epoch": 0.45006657789613846, + "grad_norm": 0.43242439287350204, + "learning_rate": 4.723320158102767e-05, + "loss": 0.4751, + "step": 169 + }, + { + "epoch": 0.45272969374167776, + "grad_norm": 0.3908982963736634, + "learning_rate": 4.7183794466403165e-05, + "loss": 0.4723, + "step": 170 + }, + { + "epoch": 0.45539280958721706, + "grad_norm": 0.4693769425526856, + "learning_rate": 4.7134387351778656e-05, + "loss": 0.4511, + "step": 171 + }, + { + "epoch": 0.4580559254327563, + "grad_norm": 0.3437754359793867, + "learning_rate": 4.7084980237154154e-05, + "loss": 0.4634, + "step": 172 + }, + { + "epoch": 0.4607190412782956, + "grad_norm": 0.5270401669346302, + "learning_rate": 4.7035573122529645e-05, + "loss": 0.4621, + "step": 173 + }, + { + "epoch": 0.4633821571238349, + "grad_norm": 0.4696714456346351, + "learning_rate": 4.698616600790514e-05, + "loss": 0.4544, + "step": 174 + }, + { + "epoch": 0.46604527296937415, + "grad_norm": 0.5068508932227126, + "learning_rate": 4.6936758893280634e-05, + "loss": 0.4506, + "step": 175 + }, + { + "epoch": 0.46870838881491345, + "grad_norm": 0.503240500645686, + "learning_rate": 4.688735177865613e-05, + "loss": 0.4653, + "step": 176 + }, + { + "epoch": 0.47137150466045274, + "grad_norm": 0.4373004531246149, + "learning_rate": 4.683794466403162e-05, + "loss": 0.4711, + "step": 177 + }, + { + "epoch": 0.474034620505992, + "grad_norm": 0.3777218592654747, + "learning_rate": 4.678853754940712e-05, + "loss": 0.466, + "step": 178 + }, + { + "epoch": 0.4766977363515313, + "grad_norm": 0.5064461910000716, + "learning_rate": 4.673913043478261e-05, + "loss": 0.4516, + "step": 179 + }, + { + "epoch": 0.4793608521970706, + "grad_norm": 0.37515242222191797, + "learning_rate": 4.668972332015811e-05, + "loss": 0.4708, + "step": 180 + }, + { + "epoch": 0.48202396804260983, + "grad_norm": 0.44905049367290634, + "learning_rate": 4.66403162055336e-05, + "loss": 0.4462, + "step": 181 + }, + { + "epoch": 0.48468708388814913, + "grad_norm": 0.37911463481430624, + "learning_rate": 4.659090909090909e-05, + "loss": 0.4451, + "step": 182 + }, + { + "epoch": 0.4873501997336884, + "grad_norm": 0.3830462171805543, + "learning_rate": 4.654150197628458e-05, + "loss": 0.4682, + "step": 183 + }, + { + "epoch": 0.49001331557922767, + "grad_norm": 0.41200778908045926, + "learning_rate": 4.649209486166008e-05, + "loss": 0.4497, + "step": 184 + }, + { + "epoch": 0.49267643142476697, + "grad_norm": 0.4315187398326425, + "learning_rate": 4.644268774703557e-05, + "loss": 0.4752, + "step": 185 + }, + { + "epoch": 0.49533954727030627, + "grad_norm": 0.4519541174810682, + "learning_rate": 4.639328063241107e-05, + "loss": 0.4764, + "step": 186 + }, + { + "epoch": 0.4980026631158455, + "grad_norm": 0.4089102997614078, + "learning_rate": 4.634387351778656e-05, + "loss": 0.4663, + "step": 187 + }, + { + "epoch": 0.5006657789613849, + "grad_norm": 0.352791614063271, + "learning_rate": 4.629446640316206e-05, + "loss": 0.4671, + "step": 188 + }, + { + "epoch": 0.5033288948069241, + "grad_norm": 0.3866144187741864, + "learning_rate": 4.624505928853755e-05, + "loss": 0.4746, + "step": 189 + }, + { + "epoch": 0.5059920106524634, + "grad_norm": 0.4028526989391047, + "learning_rate": 4.6195652173913046e-05, + "loss": 0.4811, + "step": 190 + }, + { + "epoch": 0.5086551264980027, + "grad_norm": 0.4580432915919317, + "learning_rate": 4.614624505928854e-05, + "loss": 0.4678, + "step": 191 + }, + { + "epoch": 0.511318242343542, + "grad_norm": 0.47798645545842755, + "learning_rate": 4.6096837944664035e-05, + "loss": 0.4514, + "step": 192 + }, + { + "epoch": 0.5139813581890812, + "grad_norm": 0.40636636658954495, + "learning_rate": 4.6047430830039526e-05, + "loss": 0.4356, + "step": 193 + }, + { + "epoch": 0.5166444740346205, + "grad_norm": 0.4206946394322433, + "learning_rate": 4.5998023715415024e-05, + "loss": 0.4637, + "step": 194 + }, + { + "epoch": 0.5193075898801598, + "grad_norm": 0.4977083130622833, + "learning_rate": 4.5948616600790515e-05, + "loss": 0.4525, + "step": 195 + }, + { + "epoch": 0.521970705725699, + "grad_norm": 0.3826090231131446, + "learning_rate": 4.589920948616601e-05, + "loss": 0.4647, + "step": 196 + }, + { + "epoch": 0.5246338215712384, + "grad_norm": 0.443905698975846, + "learning_rate": 4.5849802371541504e-05, + "loss": 0.466, + "step": 197 + }, + { + "epoch": 0.5272969374167776, + "grad_norm": 0.34058976392880835, + "learning_rate": 4.5800395256917e-05, + "loss": 0.4462, + "step": 198 + }, + { + "epoch": 0.5299600532623169, + "grad_norm": 0.3708303032984336, + "learning_rate": 4.575098814229249e-05, + "loss": 0.4638, + "step": 199 + }, + { + "epoch": 0.5326231691078562, + "grad_norm": 0.4046635861089521, + "learning_rate": 4.570158102766799e-05, + "loss": 0.4702, + "step": 200 + }, + { + "epoch": 0.5352862849533955, + "grad_norm": 0.390485621135718, + "learning_rate": 4.565217391304348e-05, + "loss": 0.467, + "step": 201 + }, + { + "epoch": 0.5379494007989347, + "grad_norm": 0.36389394329456204, + "learning_rate": 4.560276679841897e-05, + "loss": 0.4676, + "step": 202 + }, + { + "epoch": 0.5406125166444741, + "grad_norm": 0.36415110756708385, + "learning_rate": 4.555335968379447e-05, + "loss": 0.4508, + "step": 203 + }, + { + "epoch": 0.5432756324900133, + "grad_norm": 0.5185630368770853, + "learning_rate": 4.550395256916996e-05, + "loss": 0.4835, + "step": 204 + }, + { + "epoch": 0.5459387483355526, + "grad_norm": 0.3004205195451817, + "learning_rate": 4.545454545454546e-05, + "loss": 0.4655, + "step": 205 + }, + { + "epoch": 0.5486018641810919, + "grad_norm": 0.40992528241944887, + "learning_rate": 4.540513833992095e-05, + "loss": 0.4516, + "step": 206 + }, + { + "epoch": 0.5512649800266312, + "grad_norm": 0.3462175317121373, + "learning_rate": 4.535573122529644e-05, + "loss": 0.4471, + "step": 207 + }, + { + "epoch": 0.5539280958721704, + "grad_norm": 0.4220985656684442, + "learning_rate": 4.530632411067194e-05, + "loss": 0.4483, + "step": 208 + }, + { + "epoch": 0.5565912117177098, + "grad_norm": 0.2992081906139443, + "learning_rate": 4.525691699604743e-05, + "loss": 0.4659, + "step": 209 + }, + { + "epoch": 0.559254327563249, + "grad_norm": 0.34958390386904065, + "learning_rate": 4.520750988142293e-05, + "loss": 0.4594, + "step": 210 + }, + { + "epoch": 0.5619174434087882, + "grad_norm": 0.36711080919022626, + "learning_rate": 4.515810276679842e-05, + "loss": 0.4329, + "step": 211 + }, + { + "epoch": 0.5645805592543276, + "grad_norm": 0.32211416124144243, + "learning_rate": 4.5108695652173916e-05, + "loss": 0.4487, + "step": 212 + }, + { + "epoch": 0.5672436750998668, + "grad_norm": 0.38626649006957514, + "learning_rate": 4.505928853754941e-05, + "loss": 0.4544, + "step": 213 + }, + { + "epoch": 0.5699067909454061, + "grad_norm": 0.4022394284778984, + "learning_rate": 4.5009881422924905e-05, + "loss": 0.4505, + "step": 214 + }, + { + "epoch": 0.5725699067909454, + "grad_norm": 0.3174185878452103, + "learning_rate": 4.4960474308300396e-05, + "loss": 0.4652, + "step": 215 + }, + { + "epoch": 0.5752330226364847, + "grad_norm": 0.3872997977647099, + "learning_rate": 4.4911067193675893e-05, + "loss": 0.4771, + "step": 216 + }, + { + "epoch": 0.5778961384820239, + "grad_norm": 0.2832157450180407, + "learning_rate": 4.4861660079051384e-05, + "loss": 0.4535, + "step": 217 + }, + { + "epoch": 0.5805592543275633, + "grad_norm": 0.3394496956003534, + "learning_rate": 4.481225296442688e-05, + "loss": 0.4401, + "step": 218 + }, + { + "epoch": 0.5832223701731025, + "grad_norm": 0.29084562762850125, + "learning_rate": 4.476284584980237e-05, + "loss": 0.445, + "step": 219 + }, + { + "epoch": 0.5858854860186418, + "grad_norm": 0.30783953367051076, + "learning_rate": 4.471343873517787e-05, + "loss": 0.437, + "step": 220 + }, + { + "epoch": 0.5885486018641811, + "grad_norm": 0.3183591003829617, + "learning_rate": 4.466403162055336e-05, + "loss": 0.4549, + "step": 221 + }, + { + "epoch": 0.5912117177097204, + "grad_norm": 0.30102542208170724, + "learning_rate": 4.461462450592885e-05, + "loss": 0.4455, + "step": 222 + }, + { + "epoch": 0.5938748335552596, + "grad_norm": 0.36209246659651434, + "learning_rate": 4.456521739130435e-05, + "loss": 0.4401, + "step": 223 + }, + { + "epoch": 0.596537949400799, + "grad_norm": 0.3264752372953629, + "learning_rate": 4.451581027667984e-05, + "loss": 0.4379, + "step": 224 + }, + { + "epoch": 0.5992010652463382, + "grad_norm": 0.38508783562543825, + "learning_rate": 4.446640316205534e-05, + "loss": 0.4617, + "step": 225 + }, + { + "epoch": 0.6018641810918774, + "grad_norm": 0.3397449828204806, + "learning_rate": 4.441699604743083e-05, + "loss": 0.4516, + "step": 226 + }, + { + "epoch": 0.6045272969374168, + "grad_norm": 0.3587152523608094, + "learning_rate": 4.436758893280633e-05, + "loss": 0.4627, + "step": 227 + }, + { + "epoch": 0.607190412782956, + "grad_norm": 0.3533298903513862, + "learning_rate": 4.431818181818182e-05, + "loss": 0.4539, + "step": 228 + }, + { + "epoch": 0.6098535286284953, + "grad_norm": 0.4031621223527615, + "learning_rate": 4.426877470355732e-05, + "loss": 0.4475, + "step": 229 + }, + { + "epoch": 0.6125166444740346, + "grad_norm": 0.31598897434214096, + "learning_rate": 4.421936758893281e-05, + "loss": 0.4594, + "step": 230 + }, + { + "epoch": 0.6151797603195739, + "grad_norm": 0.39490506767356415, + "learning_rate": 4.4169960474308306e-05, + "loss": 0.4481, + "step": 231 + }, + { + "epoch": 0.6178428761651131, + "grad_norm": 0.34551286464789904, + "learning_rate": 4.41205533596838e-05, + "loss": 0.4417, + "step": 232 + }, + { + "epoch": 0.6205059920106525, + "grad_norm": 0.3471665108105545, + "learning_rate": 4.4071146245059295e-05, + "loss": 0.444, + "step": 233 + }, + { + "epoch": 0.6231691078561917, + "grad_norm": 0.3236727871934815, + "learning_rate": 4.4021739130434786e-05, + "loss": 0.4465, + "step": 234 + }, + { + "epoch": 0.625832223701731, + "grad_norm": 0.3951638876292987, + "learning_rate": 4.397233201581028e-05, + "loss": 0.4476, + "step": 235 + }, + { + "epoch": 0.6284953395472703, + "grad_norm": 0.3186324774552031, + "learning_rate": 4.3922924901185774e-05, + "loss": 0.4359, + "step": 236 + }, + { + "epoch": 0.6311584553928096, + "grad_norm": 0.3446758582788272, + "learning_rate": 4.387351778656127e-05, + "loss": 0.4425, + "step": 237 + }, + { + "epoch": 0.6338215712383488, + "grad_norm": 0.3712178318421026, + "learning_rate": 4.382411067193676e-05, + "loss": 0.4479, + "step": 238 + }, + { + "epoch": 0.6364846870838882, + "grad_norm": 0.2869593917948936, + "learning_rate": 4.377470355731226e-05, + "loss": 0.4487, + "step": 239 + }, + { + "epoch": 0.6391478029294274, + "grad_norm": 0.35621809137402505, + "learning_rate": 4.3725296442687745e-05, + "loss": 0.459, + "step": 240 + }, + { + "epoch": 0.6418109187749668, + "grad_norm": 0.3219598029099912, + "learning_rate": 4.367588932806324e-05, + "loss": 0.4486, + "step": 241 + }, + { + "epoch": 0.644474034620506, + "grad_norm": 0.345671883817814, + "learning_rate": 4.3626482213438734e-05, + "loss": 0.4494, + "step": 242 + }, + { + "epoch": 0.6471371504660453, + "grad_norm": 0.3326228424406132, + "learning_rate": 4.357707509881423e-05, + "loss": 0.467, + "step": 243 + }, + { + "epoch": 0.6498002663115846, + "grad_norm": 0.42093399894851624, + "learning_rate": 4.352766798418972e-05, + "loss": 0.4361, + "step": 244 + }, + { + "epoch": 0.6524633821571239, + "grad_norm": 0.4162222276319394, + "learning_rate": 4.347826086956522e-05, + "loss": 0.4606, + "step": 245 + }, + { + "epoch": 0.6551264980026631, + "grad_norm": 0.36750359997980137, + "learning_rate": 4.342885375494071e-05, + "loss": 0.4429, + "step": 246 + }, + { + "epoch": 0.6577896138482024, + "grad_norm": 0.5483612794064252, + "learning_rate": 4.337944664031621e-05, + "loss": 0.4533, + "step": 247 + }, + { + "epoch": 0.6604527296937417, + "grad_norm": 0.3506444877775761, + "learning_rate": 4.33300395256917e-05, + "loss": 0.4469, + "step": 248 + }, + { + "epoch": 0.6631158455392809, + "grad_norm": 0.49614493451666597, + "learning_rate": 4.32806324110672e-05, + "loss": 0.4511, + "step": 249 + }, + { + "epoch": 0.6657789613848203, + "grad_norm": 0.38209500350480796, + "learning_rate": 4.323122529644269e-05, + "loss": 0.4556, + "step": 250 + }, + { + "epoch": 0.6684420772303595, + "grad_norm": 0.3909575859613948, + "learning_rate": 4.318181818181819e-05, + "loss": 0.4573, + "step": 251 + }, + { + "epoch": 0.6711051930758988, + "grad_norm": 0.41081105341671875, + "learning_rate": 4.313241106719368e-05, + "loss": 0.4319, + "step": 252 + }, + { + "epoch": 0.6737683089214381, + "grad_norm": 0.3263282193938601, + "learning_rate": 4.3083003952569175e-05, + "loss": 0.4477, + "step": 253 + }, + { + "epoch": 0.6764314247669774, + "grad_norm": 0.30906206450856727, + "learning_rate": 4.3033596837944666e-05, + "loss": 0.449, + "step": 254 + }, + { + "epoch": 0.6790945406125166, + "grad_norm": 0.4519613203178409, + "learning_rate": 4.2984189723320164e-05, + "loss": 0.4411, + "step": 255 + }, + { + "epoch": 0.681757656458056, + "grad_norm": 0.4018486844337667, + "learning_rate": 4.2934782608695655e-05, + "loss": 0.4402, + "step": 256 + }, + { + "epoch": 0.6844207723035952, + "grad_norm": 0.41908409625079107, + "learning_rate": 4.288537549407115e-05, + "loss": 0.4531, + "step": 257 + }, + { + "epoch": 0.6870838881491345, + "grad_norm": 0.34694110159483726, + "learning_rate": 4.2835968379446644e-05, + "loss": 0.4533, + "step": 258 + }, + { + "epoch": 0.6897470039946738, + "grad_norm": 0.4051995527756752, + "learning_rate": 4.2786561264822135e-05, + "loss": 0.4533, + "step": 259 + }, + { + "epoch": 0.6924101198402131, + "grad_norm": 0.3557731708549695, + "learning_rate": 4.2737154150197626e-05, + "loss": 0.4665, + "step": 260 + }, + { + "epoch": 0.6950732356857523, + "grad_norm": 0.387832077012766, + "learning_rate": 4.2687747035573124e-05, + "loss": 0.4407, + "step": 261 + }, + { + "epoch": 0.6977363515312917, + "grad_norm": 0.38082367574409703, + "learning_rate": 4.2638339920948615e-05, + "loss": 0.453, + "step": 262 + }, + { + "epoch": 0.7003994673768309, + "grad_norm": 0.33683683724829466, + "learning_rate": 4.258893280632411e-05, + "loss": 0.4635, + "step": 263 + }, + { + "epoch": 0.7030625832223701, + "grad_norm": 0.4169335496839881, + "learning_rate": 4.2539525691699603e-05, + "loss": 0.4563, + "step": 264 + }, + { + "epoch": 0.7057256990679095, + "grad_norm": 0.3214835965167982, + "learning_rate": 4.24901185770751e-05, + "loss": 0.4542, + "step": 265 + }, + { + "epoch": 0.7083888149134487, + "grad_norm": 0.3530582715253166, + "learning_rate": 4.244071146245059e-05, + "loss": 0.4331, + "step": 266 + }, + { + "epoch": 0.711051930758988, + "grad_norm": 0.36340494740289614, + "learning_rate": 4.239130434782609e-05, + "loss": 0.4394, + "step": 267 + }, + { + "epoch": 0.7137150466045273, + "grad_norm": 0.3874861034018051, + "learning_rate": 4.234189723320158e-05, + "loss": 0.4297, + "step": 268 + }, + { + "epoch": 0.7163781624500666, + "grad_norm": 0.387734289004501, + "learning_rate": 4.229249011857708e-05, + "loss": 0.4518, + "step": 269 + }, + { + "epoch": 0.7190412782956058, + "grad_norm": 0.3011771126496286, + "learning_rate": 4.224308300395257e-05, + "loss": 0.4369, + "step": 270 + }, + { + "epoch": 0.7217043941411452, + "grad_norm": 0.41746724783245387, + "learning_rate": 4.219367588932807e-05, + "loss": 0.4509, + "step": 271 + }, + { + "epoch": 0.7243675099866844, + "grad_norm": 0.3395798145391856, + "learning_rate": 4.214426877470356e-05, + "loss": 0.4643, + "step": 272 + }, + { + "epoch": 0.7270306258322237, + "grad_norm": 0.4118033460496559, + "learning_rate": 4.2094861660079056e-05, + "loss": 0.4238, + "step": 273 + }, + { + "epoch": 0.729693741677763, + "grad_norm": 0.2988995865914867, + "learning_rate": 4.204545454545455e-05, + "loss": 0.4414, + "step": 274 + }, + { + "epoch": 0.7323568575233023, + "grad_norm": 0.4755302873686915, + "learning_rate": 4.1996047430830045e-05, + "loss": 0.4408, + "step": 275 + }, + { + "epoch": 0.7350199733688415, + "grad_norm": 0.3321861192448237, + "learning_rate": 4.1946640316205536e-05, + "loss": 0.4471, + "step": 276 + }, + { + "epoch": 0.7376830892143809, + "grad_norm": 0.45541818319145366, + "learning_rate": 4.1897233201581034e-05, + "loss": 0.4473, + "step": 277 + }, + { + "epoch": 0.7403462050599201, + "grad_norm": 0.37099566890533026, + "learning_rate": 4.1847826086956525e-05, + "loss": 0.4495, + "step": 278 + }, + { + "epoch": 0.7430093209054593, + "grad_norm": 0.4035270770785246, + "learning_rate": 4.1798418972332016e-05, + "loss": 0.4513, + "step": 279 + }, + { + "epoch": 0.7456724367509987, + "grad_norm": 0.3441312582159767, + "learning_rate": 4.174901185770751e-05, + "loss": 0.4358, + "step": 280 + }, + { + "epoch": 0.748335552596538, + "grad_norm": 0.44606462407083225, + "learning_rate": 4.1699604743083005e-05, + "loss": 0.4441, + "step": 281 + }, + { + "epoch": 0.7509986684420772, + "grad_norm": 0.41551217890891706, + "learning_rate": 4.1650197628458496e-05, + "loss": 0.4389, + "step": 282 + }, + { + "epoch": 0.7536617842876165, + "grad_norm": 0.3972988958201408, + "learning_rate": 4.160079051383399e-05, + "loss": 0.4375, + "step": 283 + }, + { + "epoch": 0.7563249001331558, + "grad_norm": 0.47085225893645843, + "learning_rate": 4.1551383399209484e-05, + "loss": 0.4567, + "step": 284 + }, + { + "epoch": 0.758988015978695, + "grad_norm": 0.34543261673414827, + "learning_rate": 4.150197628458498e-05, + "loss": 0.4459, + "step": 285 + }, + { + "epoch": 0.7616511318242344, + "grad_norm": 0.43195994812681116, + "learning_rate": 4.145256916996047e-05, + "loss": 0.4589, + "step": 286 + }, + { + "epoch": 0.7643142476697736, + "grad_norm": 0.3459436864735825, + "learning_rate": 4.140316205533597e-05, + "loss": 0.4599, + "step": 287 + }, + { + "epoch": 0.7669773635153129, + "grad_norm": 0.36207300529867464, + "learning_rate": 4.135375494071146e-05, + "loss": 0.4303, + "step": 288 + }, + { + "epoch": 0.7696404793608522, + "grad_norm": 0.41345784501066335, + "learning_rate": 4.130434782608696e-05, + "loss": 0.4271, + "step": 289 + }, + { + "epoch": 0.7723035952063915, + "grad_norm": 0.3159838632384483, + "learning_rate": 4.125494071146245e-05, + "loss": 0.4559, + "step": 290 + }, + { + "epoch": 0.7749667110519307, + "grad_norm": 0.3812699162571922, + "learning_rate": 4.120553359683795e-05, + "loss": 0.4342, + "step": 291 + }, + { + "epoch": 0.7776298268974701, + "grad_norm": 0.37911131885498967, + "learning_rate": 4.115612648221344e-05, + "loss": 0.4362, + "step": 292 + }, + { + "epoch": 0.7802929427430093, + "grad_norm": 0.29763254355588903, + "learning_rate": 4.110671936758894e-05, + "loss": 0.438, + "step": 293 + }, + { + "epoch": 0.7829560585885486, + "grad_norm": 0.42619217859831243, + "learning_rate": 4.105731225296443e-05, + "loss": 0.4359, + "step": 294 + }, + { + "epoch": 0.7856191744340879, + "grad_norm": 0.3300550679665931, + "learning_rate": 4.1007905138339926e-05, + "loss": 0.43, + "step": 295 + }, + { + "epoch": 0.7882822902796272, + "grad_norm": 0.36668560763021596, + "learning_rate": 4.095849802371542e-05, + "loss": 0.4307, + "step": 296 + }, + { + "epoch": 0.7909454061251664, + "grad_norm": 0.4285864023060217, + "learning_rate": 4.0909090909090915e-05, + "loss": 0.44, + "step": 297 + }, + { + "epoch": 0.7936085219707057, + "grad_norm": 0.40308733058892654, + "learning_rate": 4.0859683794466406e-05, + "loss": 0.4438, + "step": 298 + }, + { + "epoch": 0.796271637816245, + "grad_norm": 0.48251508562888784, + "learning_rate": 4.08102766798419e-05, + "loss": 0.465, + "step": 299 + }, + { + "epoch": 0.7989347536617842, + "grad_norm": 0.3630289677972406, + "learning_rate": 4.076086956521739e-05, + "loss": 0.4472, + "step": 300 + }, + { + "epoch": 0.8015978695073236, + "grad_norm": 0.39496674097555107, + "learning_rate": 4.0711462450592886e-05, + "loss": 0.4391, + "step": 301 + }, + { + "epoch": 0.8042609853528628, + "grad_norm": 0.3844393845604204, + "learning_rate": 4.0662055335968377e-05, + "loss": 0.4594, + "step": 302 + }, + { + "epoch": 0.8069241011984021, + "grad_norm": 0.41185922961873794, + "learning_rate": 4.0612648221343874e-05, + "loss": 0.4302, + "step": 303 + }, + { + "epoch": 0.8095872170439414, + "grad_norm": 0.3856385433600225, + "learning_rate": 4.0563241106719365e-05, + "loss": 0.4436, + "step": 304 + }, + { + "epoch": 0.8122503328894807, + "grad_norm": 0.38840299488987834, + "learning_rate": 4.051383399209486e-05, + "loss": 0.4536, + "step": 305 + }, + { + "epoch": 0.8149134487350199, + "grad_norm": 0.3814150713404761, + "learning_rate": 4.0464426877470354e-05, + "loss": 0.4478, + "step": 306 + }, + { + "epoch": 0.8175765645805593, + "grad_norm": 0.3688695146114231, + "learning_rate": 4.041501976284585e-05, + "loss": 0.4371, + "step": 307 + }, + { + "epoch": 0.8202396804260985, + "grad_norm": 0.4525942844580142, + "learning_rate": 4.036561264822134e-05, + "loss": 0.4291, + "step": 308 + }, + { + "epoch": 0.8229027962716379, + "grad_norm": 0.4052871924274271, + "learning_rate": 4.031620553359684e-05, + "loss": 0.4441, + "step": 309 + }, + { + "epoch": 0.8255659121171771, + "grad_norm": 0.39806513754399514, + "learning_rate": 4.026679841897233e-05, + "loss": 0.4411, + "step": 310 + }, + { + "epoch": 0.8282290279627164, + "grad_norm": 0.3805049053303521, + "learning_rate": 4.021739130434783e-05, + "loss": 0.4366, + "step": 311 + }, + { + "epoch": 0.8308921438082557, + "grad_norm": 0.4001908389883243, + "learning_rate": 4.016798418972332e-05, + "loss": 0.4481, + "step": 312 + }, + { + "epoch": 0.833555259653795, + "grad_norm": 0.3685478975261263, + "learning_rate": 4.011857707509882e-05, + "loss": 0.4444, + "step": 313 + }, + { + "epoch": 0.8362183754993342, + "grad_norm": 0.3338436350006864, + "learning_rate": 4.006916996047431e-05, + "loss": 0.4479, + "step": 314 + }, + { + "epoch": 0.8388814913448736, + "grad_norm": 0.41429245260714803, + "learning_rate": 4.001976284584981e-05, + "loss": 0.449, + "step": 315 + }, + { + "epoch": 0.8415446071904128, + "grad_norm": 0.4423411865525233, + "learning_rate": 3.99703557312253e-05, + "loss": 0.4659, + "step": 316 + }, + { + "epoch": 0.844207723035952, + "grad_norm": 0.2957853011048819, + "learning_rate": 3.9920948616600796e-05, + "loss": 0.4251, + "step": 317 + }, + { + "epoch": 0.8468708388814914, + "grad_norm": 0.4030160825498704, + "learning_rate": 3.987154150197629e-05, + "loss": 0.4371, + "step": 318 + }, + { + "epoch": 0.8495339547270306, + "grad_norm": 0.3580572215645172, + "learning_rate": 3.982213438735178e-05, + "loss": 0.4227, + "step": 319 + }, + { + "epoch": 0.8521970705725699, + "grad_norm": 0.39710125591854223, + "learning_rate": 3.9772727272727275e-05, + "loss": 0.4293, + "step": 320 + }, + { + "epoch": 0.8548601864181092, + "grad_norm": 0.4051765562646604, + "learning_rate": 3.9723320158102766e-05, + "loss": 0.4334, + "step": 321 + }, + { + "epoch": 0.8575233022636485, + "grad_norm": 0.41675278060825943, + "learning_rate": 3.9673913043478264e-05, + "loss": 0.4386, + "step": 322 + }, + { + "epoch": 0.8601864181091877, + "grad_norm": 0.4375405045592726, + "learning_rate": 3.9624505928853755e-05, + "loss": 0.4533, + "step": 323 + }, + { + "epoch": 0.8628495339547271, + "grad_norm": 0.4043621563504148, + "learning_rate": 3.957509881422925e-05, + "loss": 0.4497, + "step": 324 + }, + { + "epoch": 0.8655126498002663, + "grad_norm": 0.37983530045601516, + "learning_rate": 3.9525691699604744e-05, + "loss": 0.4392, + "step": 325 + }, + { + "epoch": 0.8681757656458056, + "grad_norm": 0.4289732652538706, + "learning_rate": 3.947628458498024e-05, + "loss": 0.4401, + "step": 326 + }, + { + "epoch": 0.8708388814913449, + "grad_norm": 0.34033600614743714, + "learning_rate": 3.942687747035573e-05, + "loss": 0.453, + "step": 327 + }, + { + "epoch": 0.8735019973368842, + "grad_norm": 0.399300367168935, + "learning_rate": 3.937747035573123e-05, + "loss": 0.433, + "step": 328 + }, + { + "epoch": 0.8761651131824234, + "grad_norm": 0.36717092389818584, + "learning_rate": 3.932806324110672e-05, + "loss": 0.4523, + "step": 329 + }, + { + "epoch": 0.8788282290279628, + "grad_norm": 0.43669770511305556, + "learning_rate": 3.927865612648222e-05, + "loss": 0.437, + "step": 330 + }, + { + "epoch": 0.881491344873502, + "grad_norm": 0.3631294987791108, + "learning_rate": 3.922924901185771e-05, + "loss": 0.4335, + "step": 331 + }, + { + "epoch": 0.8841544607190412, + "grad_norm": 0.45116504976872973, + "learning_rate": 3.917984189723321e-05, + "loss": 0.4562, + "step": 332 + }, + { + "epoch": 0.8868175765645806, + "grad_norm": 0.3163566159546663, + "learning_rate": 3.91304347826087e-05, + "loss": 0.4286, + "step": 333 + }, + { + "epoch": 0.8894806924101198, + "grad_norm": 0.49699702016497876, + "learning_rate": 3.90810276679842e-05, + "loss": 0.4214, + "step": 334 + }, + { + "epoch": 0.8921438082556591, + "grad_norm": 0.4164898463983148, + "learning_rate": 3.903162055335969e-05, + "loss": 0.4354, + "step": 335 + }, + { + "epoch": 0.8948069241011984, + "grad_norm": 0.39631778611383006, + "learning_rate": 3.8982213438735186e-05, + "loss": 0.4389, + "step": 336 + }, + { + "epoch": 0.8974700399467377, + "grad_norm": 0.4545892509897146, + "learning_rate": 3.893280632411067e-05, + "loss": 0.4312, + "step": 337 + }, + { + "epoch": 0.9001331557922769, + "grad_norm": 0.41988367228289636, + "learning_rate": 3.888339920948617e-05, + "loss": 0.4433, + "step": 338 + }, + { + "epoch": 0.9027962716378163, + "grad_norm": 0.3123307577517813, + "learning_rate": 3.883399209486166e-05, + "loss": 0.4272, + "step": 339 + }, + { + "epoch": 0.9054593874833555, + "grad_norm": 0.31692127951353677, + "learning_rate": 3.8784584980237156e-05, + "loss": 0.4292, + "step": 340 + }, + { + "epoch": 0.9081225033288948, + "grad_norm": 0.33613245505768613, + "learning_rate": 3.873517786561265e-05, + "loss": 0.4249, + "step": 341 + }, + { + "epoch": 0.9107856191744341, + "grad_norm": 0.30559768683570065, + "learning_rate": 3.8685770750988145e-05, + "loss": 0.4398, + "step": 342 + }, + { + "epoch": 0.9134487350199734, + "grad_norm": 0.3939981911193064, + "learning_rate": 3.8636363636363636e-05, + "loss": 0.4335, + "step": 343 + }, + { + "epoch": 0.9161118508655126, + "grad_norm": 0.33858345690029085, + "learning_rate": 3.8586956521739134e-05, + "loss": 0.4451, + "step": 344 + }, + { + "epoch": 0.918774966711052, + "grad_norm": 0.3422872934004404, + "learning_rate": 3.8537549407114625e-05, + "loss": 0.4353, + "step": 345 + }, + { + "epoch": 0.9214380825565912, + "grad_norm": 0.3280283881293896, + "learning_rate": 3.848814229249012e-05, + "loss": 0.4336, + "step": 346 + }, + { + "epoch": 0.9241011984021305, + "grad_norm": 0.3212166344001671, + "learning_rate": 3.8438735177865614e-05, + "loss": 0.4436, + "step": 347 + }, + { + "epoch": 0.9267643142476698, + "grad_norm": 0.29779879718680563, + "learning_rate": 3.838932806324111e-05, + "loss": 0.4224, + "step": 348 + }, + { + "epoch": 0.929427430093209, + "grad_norm": 0.32257209602500175, + "learning_rate": 3.83399209486166e-05, + "loss": 0.4324, + "step": 349 + }, + { + "epoch": 0.9320905459387483, + "grad_norm": 0.3283760169277036, + "learning_rate": 3.82905138339921e-05, + "loss": 0.4312, + "step": 350 + }, + { + "epoch": 0.9347536617842876, + "grad_norm": 0.29560048048387905, + "learning_rate": 3.824110671936759e-05, + "loss": 0.438, + "step": 351 + }, + { + "epoch": 0.9374167776298269, + "grad_norm": 0.31047996971013586, + "learning_rate": 3.819169960474309e-05, + "loss": 0.436, + "step": 352 + }, + { + "epoch": 0.9400798934753661, + "grad_norm": 0.3203340478559344, + "learning_rate": 3.814229249011858e-05, + "loss": 0.4178, + "step": 353 + }, + { + "epoch": 0.9427430093209055, + "grad_norm": 0.3000799797652741, + "learning_rate": 3.809288537549408e-05, + "loss": 0.4283, + "step": 354 + }, + { + "epoch": 0.9454061251664447, + "grad_norm": 0.31625082964426837, + "learning_rate": 3.804347826086957e-05, + "loss": 0.4355, + "step": 355 + }, + { + "epoch": 0.948069241011984, + "grad_norm": 0.38688019968777704, + "learning_rate": 3.7994071146245066e-05, + "loss": 0.4561, + "step": 356 + }, + { + "epoch": 0.9507323568575233, + "grad_norm": 0.309916135809927, + "learning_rate": 3.794466403162055e-05, + "loss": 0.4323, + "step": 357 + }, + { + "epoch": 0.9533954727030626, + "grad_norm": 0.4119303884073823, + "learning_rate": 3.789525691699605e-05, + "loss": 0.4346, + "step": 358 + }, + { + "epoch": 0.9560585885486018, + "grad_norm": 0.36057463061333933, + "learning_rate": 3.784584980237154e-05, + "loss": 0.4521, + "step": 359 + }, + { + "epoch": 0.9587217043941412, + "grad_norm": 0.3385683676369823, + "learning_rate": 3.779644268774704e-05, + "loss": 0.4186, + "step": 360 + }, + { + "epoch": 0.9613848202396804, + "grad_norm": 0.40056553056875543, + "learning_rate": 3.774703557312253e-05, + "loss": 0.4577, + "step": 361 + }, + { + "epoch": 0.9640479360852197, + "grad_norm": 0.3362167210172609, + "learning_rate": 3.7697628458498026e-05, + "loss": 0.4232, + "step": 362 + }, + { + "epoch": 0.966711051930759, + "grad_norm": 0.39765353196088127, + "learning_rate": 3.764822134387352e-05, + "loss": 0.4441, + "step": 363 + }, + { + "epoch": 0.9693741677762983, + "grad_norm": 0.34508268417865146, + "learning_rate": 3.7598814229249015e-05, + "loss": 0.4339, + "step": 364 + }, + { + "epoch": 0.9720372836218375, + "grad_norm": 0.346158165413465, + "learning_rate": 3.7549407114624506e-05, + "loss": 0.4314, + "step": 365 + }, + { + "epoch": 0.9747003994673769, + "grad_norm": 0.38758138562436, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.4479, + "step": 366 + }, + { + "epoch": 0.9773635153129161, + "grad_norm": 0.3616955496837348, + "learning_rate": 3.7450592885375494e-05, + "loss": 0.4295, + "step": 367 + }, + { + "epoch": 0.9800266311584553, + "grad_norm": 0.36330419598482033, + "learning_rate": 3.740118577075099e-05, + "loss": 0.431, + "step": 368 + }, + { + "epoch": 0.9826897470039947, + "grad_norm": 0.38220931731215757, + "learning_rate": 3.735177865612648e-05, + "loss": 0.4411, + "step": 369 + }, + { + "epoch": 0.9853528628495339, + "grad_norm": 0.32482883893874537, + "learning_rate": 3.730237154150198e-05, + "loss": 0.4352, + "step": 370 + }, + { + "epoch": 0.9880159786950732, + "grad_norm": 0.3797976983855516, + "learning_rate": 3.725296442687747e-05, + "loss": 0.4273, + "step": 371 + }, + { + "epoch": 0.9906790945406125, + "grad_norm": 0.3333203576267911, + "learning_rate": 3.720355731225297e-05, + "loss": 0.4353, + "step": 372 + }, + { + "epoch": 0.9933422103861518, + "grad_norm": 0.3565932063789887, + "learning_rate": 3.715415019762846e-05, + "loss": 0.4312, + "step": 373 + }, + { + "epoch": 0.996005326231691, + "grad_norm": 0.35499721260713074, + "learning_rate": 3.710474308300396e-05, + "loss": 0.4328, + "step": 374 + }, + { + "epoch": 0.9986684420772304, + "grad_norm": 0.34312841144350587, + "learning_rate": 3.705533596837945e-05, + "loss": 0.4238, + "step": 375 + }, + { + "epoch": 1.0, + "grad_norm": 0.34312841144350587, + "learning_rate": 3.700592885375494e-05, + "loss": 0.4292, + "step": 376 + }, + { + "epoch": 1.0026631158455392, + "grad_norm": 0.523484923884555, + "learning_rate": 3.695652173913043e-05, + "loss": 0.3827, + "step": 377 + }, + { + "epoch": 1.0053262316910785, + "grad_norm": 0.44981178204276556, + "learning_rate": 3.690711462450593e-05, + "loss": 0.3497, + "step": 378 + }, + { + "epoch": 1.007989347536618, + "grad_norm": 0.30585009680415987, + "learning_rate": 3.685770750988142e-05, + "loss": 0.3667, + "step": 379 + }, + { + "epoch": 1.0106524633821572, + "grad_norm": 0.3734972975740805, + "learning_rate": 3.680830039525692e-05, + "loss": 0.365, + "step": 380 + }, + { + "epoch": 1.0133155792276964, + "grad_norm": 0.32549667969227175, + "learning_rate": 3.675889328063241e-05, + "loss": 0.3756, + "step": 381 + }, + { + "epoch": 1.0159786950732357, + "grad_norm": 0.4493130971817616, + "learning_rate": 3.670948616600791e-05, + "loss": 0.358, + "step": 382 + }, + { + "epoch": 1.018641810918775, + "grad_norm": 0.40705895511048784, + "learning_rate": 3.66600790513834e-05, + "loss": 0.3711, + "step": 383 + }, + { + "epoch": 1.0213049267643142, + "grad_norm": 0.3979472669944709, + "learning_rate": 3.6610671936758896e-05, + "loss": 0.3613, + "step": 384 + }, + { + "epoch": 1.0239680426098536, + "grad_norm": 0.44247177084982264, + "learning_rate": 3.656126482213439e-05, + "loss": 0.3461, + "step": 385 + }, + { + "epoch": 1.0266311584553929, + "grad_norm": 0.3643767210189153, + "learning_rate": 3.6511857707509884e-05, + "loss": 0.3682, + "step": 386 + }, + { + "epoch": 1.0292942743009321, + "grad_norm": 0.3710522218627508, + "learning_rate": 3.6462450592885375e-05, + "loss": 0.3616, + "step": 387 + }, + { + "epoch": 1.0319573901464714, + "grad_norm": 0.39199235847196745, + "learning_rate": 3.641304347826087e-05, + "loss": 0.3373, + "step": 388 + }, + { + "epoch": 1.0346205059920106, + "grad_norm": 0.3716307271666748, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.3783, + "step": 389 + }, + { + "epoch": 1.0372836218375499, + "grad_norm": 0.39593613574016095, + "learning_rate": 3.631422924901186e-05, + "loss": 0.3605, + "step": 390 + }, + { + "epoch": 1.0399467376830893, + "grad_norm": 0.3741049180680241, + "learning_rate": 3.626482213438735e-05, + "loss": 0.3643, + "step": 391 + }, + { + "epoch": 1.0426098535286286, + "grad_norm": 0.39560887666458844, + "learning_rate": 3.621541501976285e-05, + "loss": 0.3873, + "step": 392 + }, + { + "epoch": 1.0452729693741678, + "grad_norm": 0.4542194912059658, + "learning_rate": 3.616600790513834e-05, + "loss": 0.3517, + "step": 393 + }, + { + "epoch": 1.047936085219707, + "grad_norm": 0.3376853296582342, + "learning_rate": 3.611660079051384e-05, + "loss": 0.3746, + "step": 394 + }, + { + "epoch": 1.0505992010652463, + "grad_norm": 0.38846148578122447, + "learning_rate": 3.606719367588933e-05, + "loss": 0.3389, + "step": 395 + }, + { + "epoch": 1.0532623169107855, + "grad_norm": 0.32360005393691865, + "learning_rate": 3.601778656126482e-05, + "loss": 0.3663, + "step": 396 + }, + { + "epoch": 1.055925432756325, + "grad_norm": 0.326112805381814, + "learning_rate": 3.596837944664031e-05, + "loss": 0.3581, + "step": 397 + }, + { + "epoch": 1.0585885486018642, + "grad_norm": 0.28926622056464246, + "learning_rate": 3.591897233201581e-05, + "loss": 0.358, + "step": 398 + }, + { + "epoch": 1.0612516644474035, + "grad_norm": 0.3055465293423247, + "learning_rate": 3.58695652173913e-05, + "loss": 0.3617, + "step": 399 + }, + { + "epoch": 1.0639147802929427, + "grad_norm": 0.33022021713183336, + "learning_rate": 3.58201581027668e-05, + "loss": 0.353, + "step": 400 + }, + { + "epoch": 1.066577896138482, + "grad_norm": 0.29024468585164404, + "learning_rate": 3.577075098814229e-05, + "loss": 0.355, + "step": 401 + }, + { + "epoch": 1.0692410119840212, + "grad_norm": 0.2733040275941461, + "learning_rate": 3.572134387351779e-05, + "loss": 0.3574, + "step": 402 + }, + { + "epoch": 1.0719041278295607, + "grad_norm": 0.3226214256196561, + "learning_rate": 3.567193675889328e-05, + "loss": 0.3528, + "step": 403 + }, + { + "epoch": 1.0745672436751, + "grad_norm": 0.31534151465175414, + "learning_rate": 3.5622529644268777e-05, + "loss": 0.3539, + "step": 404 + }, + { + "epoch": 1.0772303595206392, + "grad_norm": 0.2751061424659443, + "learning_rate": 3.557312252964427e-05, + "loss": 0.3667, + "step": 405 + }, + { + "epoch": 1.0798934753661784, + "grad_norm": 0.3612676719250419, + "learning_rate": 3.5523715415019765e-05, + "loss": 0.3541, + "step": 406 + }, + { + "epoch": 1.0825565912117177, + "grad_norm": 0.3011759295136269, + "learning_rate": 3.5474308300395256e-05, + "loss": 0.3606, + "step": 407 + }, + { + "epoch": 1.085219707057257, + "grad_norm": 0.3978993850172965, + "learning_rate": 3.5424901185770754e-05, + "loss": 0.3626, + "step": 408 + }, + { + "epoch": 1.0878828229027964, + "grad_norm": 0.2872210237523896, + "learning_rate": 3.5375494071146245e-05, + "loss": 0.3889, + "step": 409 + }, + { + "epoch": 1.0905459387483356, + "grad_norm": 0.443073058318771, + "learning_rate": 3.532608695652174e-05, + "loss": 0.3535, + "step": 410 + }, + { + "epoch": 1.0932090545938749, + "grad_norm": 0.33127012106810017, + "learning_rate": 3.5276679841897234e-05, + "loss": 0.3459, + "step": 411 + }, + { + "epoch": 1.095872170439414, + "grad_norm": 0.2919448905657829, + "learning_rate": 3.522727272727273e-05, + "loss": 0.365, + "step": 412 + }, + { + "epoch": 1.0985352862849533, + "grad_norm": 0.33466018716475304, + "learning_rate": 3.517786561264822e-05, + "loss": 0.3625, + "step": 413 + }, + { + "epoch": 1.1011984021304926, + "grad_norm": 0.3413607594653121, + "learning_rate": 3.512845849802372e-05, + "loss": 0.3724, + "step": 414 + }, + { + "epoch": 1.103861517976032, + "grad_norm": 0.35737975021729407, + "learning_rate": 3.507905138339921e-05, + "loss": 0.3774, + "step": 415 + }, + { + "epoch": 1.1065246338215713, + "grad_norm": 0.34162270993471044, + "learning_rate": 3.50296442687747e-05, + "loss": 0.3686, + "step": 416 + }, + { + "epoch": 1.1091877496671105, + "grad_norm": 0.35133143811370443, + "learning_rate": 3.49802371541502e-05, + "loss": 0.3699, + "step": 417 + }, + { + "epoch": 1.1118508655126498, + "grad_norm": 0.3579722853716089, + "learning_rate": 3.493083003952569e-05, + "loss": 0.3505, + "step": 418 + }, + { + "epoch": 1.114513981358189, + "grad_norm": 0.2618428057689255, + "learning_rate": 3.488142292490119e-05, + "loss": 0.3463, + "step": 419 + }, + { + "epoch": 1.1171770972037283, + "grad_norm": 0.35732356240927676, + "learning_rate": 3.483201581027668e-05, + "loss": 0.3473, + "step": 420 + }, + { + "epoch": 1.1198402130492677, + "grad_norm": 0.34101793627943705, + "learning_rate": 3.478260869565218e-05, + "loss": 0.3738, + "step": 421 + }, + { + "epoch": 1.122503328894807, + "grad_norm": 0.3005835100136546, + "learning_rate": 3.473320158102767e-05, + "loss": 0.3748, + "step": 422 + }, + { + "epoch": 1.1251664447403462, + "grad_norm": 0.3512554307406862, + "learning_rate": 3.4683794466403166e-05, + "loss": 0.3578, + "step": 423 + }, + { + "epoch": 1.1278295605858855, + "grad_norm": 0.3037958675770476, + "learning_rate": 3.463438735177866e-05, + "loss": 0.3812, + "step": 424 + }, + { + "epoch": 1.1304926764314247, + "grad_norm": 0.33131881019625853, + "learning_rate": 3.4584980237154155e-05, + "loss": 0.3475, + "step": 425 + }, + { + "epoch": 1.133155792276964, + "grad_norm": 0.2887902456682679, + "learning_rate": 3.4535573122529646e-05, + "loss": 0.3658, + "step": 426 + }, + { + "epoch": 1.1358189081225034, + "grad_norm": 0.3429001374635811, + "learning_rate": 3.4486166007905144e-05, + "loss": 0.37, + "step": 427 + }, + { + "epoch": 1.1384820239680427, + "grad_norm": 0.32345869994940707, + "learning_rate": 3.4436758893280635e-05, + "loss": 0.3325, + "step": 428 + }, + { + "epoch": 1.141145139813582, + "grad_norm": 0.3183193536956743, + "learning_rate": 3.438735177865613e-05, + "loss": 0.3597, + "step": 429 + }, + { + "epoch": 1.1438082556591211, + "grad_norm": 0.3300209265208329, + "learning_rate": 3.4337944664031624e-05, + "loss": 0.3718, + "step": 430 + }, + { + "epoch": 1.1464713715046604, + "grad_norm": 0.31339838507600637, + "learning_rate": 3.428853754940712e-05, + "loss": 0.3505, + "step": 431 + }, + { + "epoch": 1.1491344873501999, + "grad_norm": 0.30103241701187505, + "learning_rate": 3.423913043478261e-05, + "loss": 0.3515, + "step": 432 + }, + { + "epoch": 1.151797603195739, + "grad_norm": 0.33142077936580827, + "learning_rate": 3.418972332015811e-05, + "loss": 0.3454, + "step": 433 + }, + { + "epoch": 1.1544607190412783, + "grad_norm": 0.26672583595142774, + "learning_rate": 3.41403162055336e-05, + "loss": 0.3557, + "step": 434 + }, + { + "epoch": 1.1571238348868176, + "grad_norm": 0.29810972252935447, + "learning_rate": 3.409090909090909e-05, + "loss": 0.3627, + "step": 435 + }, + { + "epoch": 1.1597869507323568, + "grad_norm": 0.4004613882147666, + "learning_rate": 3.404150197628458e-05, + "loss": 0.3596, + "step": 436 + }, + { + "epoch": 1.162450066577896, + "grad_norm": 0.3230914038022782, + "learning_rate": 3.399209486166008e-05, + "loss": 0.3494, + "step": 437 + }, + { + "epoch": 1.1651131824234353, + "grad_norm": 0.26213767359417905, + "learning_rate": 3.394268774703557e-05, + "loss": 0.3686, + "step": 438 + }, + { + "epoch": 1.1677762982689748, + "grad_norm": 0.4095014774133373, + "learning_rate": 3.389328063241107e-05, + "loss": 0.3688, + "step": 439 + }, + { + "epoch": 1.170439414114514, + "grad_norm": 0.266377270998587, + "learning_rate": 3.384387351778656e-05, + "loss": 0.3648, + "step": 440 + }, + { + "epoch": 1.1731025299600533, + "grad_norm": 0.32985529288585497, + "learning_rate": 3.379446640316206e-05, + "loss": 0.3703, + "step": 441 + }, + { + "epoch": 1.1757656458055925, + "grad_norm": 0.3629424885940422, + "learning_rate": 3.374505928853755e-05, + "loss": 0.3502, + "step": 442 + }, + { + "epoch": 1.1784287616511318, + "grad_norm": 0.29079091604622403, + "learning_rate": 3.369565217391305e-05, + "loss": 0.3696, + "step": 443 + }, + { + "epoch": 1.1810918774966712, + "grad_norm": 0.36019836895937174, + "learning_rate": 3.364624505928854e-05, + "loss": 0.3507, + "step": 444 + }, + { + "epoch": 1.1837549933422105, + "grad_norm": 0.3710021105040673, + "learning_rate": 3.3596837944664036e-05, + "loss": 0.3458, + "step": 445 + }, + { + "epoch": 1.1864181091877497, + "grad_norm": 0.2814671230360335, + "learning_rate": 3.354743083003953e-05, + "loss": 0.3625, + "step": 446 + }, + { + "epoch": 1.189081225033289, + "grad_norm": 0.39752143956114194, + "learning_rate": 3.3498023715415025e-05, + "loss": 0.3372, + "step": 447 + }, + { + "epoch": 1.1917443408788282, + "grad_norm": 0.3447518628047081, + "learning_rate": 3.3448616600790516e-05, + "loss": 0.352, + "step": 448 + }, + { + "epoch": 1.1944074567243674, + "grad_norm": 0.23476338435026442, + "learning_rate": 3.3399209486166014e-05, + "loss": 0.3433, + "step": 449 + }, + { + "epoch": 1.1970705725699067, + "grad_norm": 0.41285793244761565, + "learning_rate": 3.3349802371541505e-05, + "loss": 0.3507, + "step": 450 + }, + { + "epoch": 1.1997336884154461, + "grad_norm": 0.2756526642604148, + "learning_rate": 3.3300395256917e-05, + "loss": 0.3679, + "step": 451 + }, + { + "epoch": 1.2023968042609854, + "grad_norm": 0.35361646973541144, + "learning_rate": 3.325098814229249e-05, + "loss": 0.3771, + "step": 452 + }, + { + "epoch": 1.2050599201065246, + "grad_norm": 0.3011012199917682, + "learning_rate": 3.320158102766799e-05, + "loss": 0.3501, + "step": 453 + }, + { + "epoch": 1.2077230359520639, + "grad_norm": 0.2753809532139054, + "learning_rate": 3.3152173913043475e-05, + "loss": 0.3751, + "step": 454 + }, + { + "epoch": 1.2103861517976031, + "grad_norm": 0.345446601586865, + "learning_rate": 3.310276679841897e-05, + "loss": 0.3675, + "step": 455 + }, + { + "epoch": 1.2130492676431426, + "grad_norm": 0.3105483046559569, + "learning_rate": 3.3053359683794464e-05, + "loss": 0.3473, + "step": 456 + }, + { + "epoch": 1.2157123834886818, + "grad_norm": 0.31097501000340777, + "learning_rate": 3.300395256916996e-05, + "loss": 0.3685, + "step": 457 + }, + { + "epoch": 1.218375499334221, + "grad_norm": 0.35861972517870744, + "learning_rate": 3.295454545454545e-05, + "loss": 0.3493, + "step": 458 + }, + { + "epoch": 1.2210386151797603, + "grad_norm": 0.2497414905559577, + "learning_rate": 3.290513833992095e-05, + "loss": 0.3596, + "step": 459 + }, + { + "epoch": 1.2237017310252996, + "grad_norm": 0.3260671903675003, + "learning_rate": 3.285573122529644e-05, + "loss": 0.3584, + "step": 460 + }, + { + "epoch": 1.2263648468708388, + "grad_norm": 0.303125715747872, + "learning_rate": 3.280632411067194e-05, + "loss": 0.3468, + "step": 461 + }, + { + "epoch": 1.229027962716378, + "grad_norm": 0.2894307336548194, + "learning_rate": 3.275691699604743e-05, + "loss": 0.3589, + "step": 462 + }, + { + "epoch": 1.2316910785619175, + "grad_norm": 0.3081296705994847, + "learning_rate": 3.270750988142293e-05, + "loss": 0.3586, + "step": 463 + }, + { + "epoch": 1.2343541944074568, + "grad_norm": 0.2926327290593828, + "learning_rate": 3.265810276679842e-05, + "loss": 0.3594, + "step": 464 + }, + { + "epoch": 1.237017310252996, + "grad_norm": 0.3050352656827861, + "learning_rate": 3.260869565217392e-05, + "loss": 0.3794, + "step": 465 + }, + { + "epoch": 1.2396804260985352, + "grad_norm": 0.34421850278839233, + "learning_rate": 3.255928853754941e-05, + "loss": 0.3448, + "step": 466 + }, + { + "epoch": 1.2423435419440745, + "grad_norm": 0.3178141996560178, + "learning_rate": 3.2509881422924906e-05, + "loss": 0.3596, + "step": 467 + }, + { + "epoch": 1.245006657789614, + "grad_norm": 0.36055320312739547, + "learning_rate": 3.24604743083004e-05, + "loss": 0.3374, + "step": 468 + }, + { + "epoch": 1.2476697736351532, + "grad_norm": 0.2584894490878346, + "learning_rate": 3.2411067193675894e-05, + "loss": 0.3381, + "step": 469 + }, + { + "epoch": 1.2503328894806924, + "grad_norm": 0.3556442871963007, + "learning_rate": 3.2361660079051385e-05, + "loss": 0.3757, + "step": 470 + }, + { + "epoch": 1.2529960053262317, + "grad_norm": 0.2936471278443274, + "learning_rate": 3.231225296442688e-05, + "loss": 0.3612, + "step": 471 + }, + { + "epoch": 1.255659121171771, + "grad_norm": 0.34920820452723006, + "learning_rate": 3.2262845849802374e-05, + "loss": 0.3571, + "step": 472 + }, + { + "epoch": 1.2583222370173104, + "grad_norm": 0.27353129045046504, + "learning_rate": 3.221343873517787e-05, + "loss": 0.366, + "step": 473 + }, + { + "epoch": 1.2609853528628494, + "grad_norm": 0.3336825600119343, + "learning_rate": 3.2164031620553356e-05, + "loss": 0.3682, + "step": 474 + }, + { + "epoch": 1.2636484687083889, + "grad_norm": 0.28422664920281926, + "learning_rate": 3.2114624505928854e-05, + "loss": 0.3574, + "step": 475 + }, + { + "epoch": 1.2663115845539281, + "grad_norm": 0.27995772097533356, + "learning_rate": 3.2065217391304345e-05, + "loss": 0.3577, + "step": 476 + }, + { + "epoch": 1.2689747003994674, + "grad_norm": 0.3073145651684054, + "learning_rate": 3.201581027667984e-05, + "loss": 0.356, + "step": 477 + }, + { + "epoch": 1.2716378162450066, + "grad_norm": 0.2926799912079748, + "learning_rate": 3.1966403162055334e-05, + "loss": 0.3398, + "step": 478 + }, + { + "epoch": 1.2743009320905458, + "grad_norm": 0.2638946062975387, + "learning_rate": 3.191699604743083e-05, + "loss": 0.3742, + "step": 479 + }, + { + "epoch": 1.2769640479360853, + "grad_norm": 0.3188095670364053, + "learning_rate": 3.186758893280632e-05, + "loss": 0.3564, + "step": 480 + }, + { + "epoch": 1.2796271637816246, + "grad_norm": 0.2620162833825017, + "learning_rate": 3.181818181818182e-05, + "loss": 0.36, + "step": 481 + }, + { + "epoch": 1.2822902796271638, + "grad_norm": 0.34823059030048475, + "learning_rate": 3.176877470355731e-05, + "loss": 0.3595, + "step": 482 + }, + { + "epoch": 1.284953395472703, + "grad_norm": 0.31553137736166625, + "learning_rate": 3.171936758893281e-05, + "loss": 0.3599, + "step": 483 + }, + { + "epoch": 1.2876165113182423, + "grad_norm": 0.2955708469323441, + "learning_rate": 3.16699604743083e-05, + "loss": 0.3402, + "step": 484 + }, + { + "epoch": 1.2902796271637818, + "grad_norm": 0.3913482669169413, + "learning_rate": 3.16205533596838e-05, + "loss": 0.3758, + "step": 485 + }, + { + "epoch": 1.2929427430093208, + "grad_norm": 0.35700628657251265, + "learning_rate": 3.157114624505929e-05, + "loss": 0.3581, + "step": 486 + }, + { + "epoch": 1.2956058588548602, + "grad_norm": 0.3014863988052369, + "learning_rate": 3.152173913043479e-05, + "loss": 0.3554, + "step": 487 + }, + { + "epoch": 1.2982689747003995, + "grad_norm": 0.3644987716917946, + "learning_rate": 3.147233201581028e-05, + "loss": 0.3562, + "step": 488 + }, + { + "epoch": 1.3009320905459387, + "grad_norm": 0.30956500239595414, + "learning_rate": 3.1422924901185775e-05, + "loss": 0.3454, + "step": 489 + }, + { + "epoch": 1.303595206391478, + "grad_norm": 0.4175232794253573, + "learning_rate": 3.1373517786561266e-05, + "loss": 0.3641, + "step": 490 + }, + { + "epoch": 1.3062583222370172, + "grad_norm": 0.28246226404029123, + "learning_rate": 3.1324110671936764e-05, + "loss": 0.3601, + "step": 491 + }, + { + "epoch": 1.3089214380825567, + "grad_norm": 0.3755376891190061, + "learning_rate": 3.1274703557312255e-05, + "loss": 0.3774, + "step": 492 + }, + { + "epoch": 1.311584553928096, + "grad_norm": 0.27298674883257873, + "learning_rate": 3.1225296442687746e-05, + "loss": 0.3627, + "step": 493 + }, + { + "epoch": 1.3142476697736352, + "grad_norm": 0.3706229801540267, + "learning_rate": 3.117588932806324e-05, + "loss": 0.3735, + "step": 494 + }, + { + "epoch": 1.3169107856191744, + "grad_norm": 0.28143910738942546, + "learning_rate": 3.1126482213438735e-05, + "loss": 0.3725, + "step": 495 + }, + { + "epoch": 1.3195739014647137, + "grad_norm": 0.3349025665393724, + "learning_rate": 3.1077075098814226e-05, + "loss": 0.3659, + "step": 496 + }, + { + "epoch": 1.3222370173102531, + "grad_norm": 0.29588987329109573, + "learning_rate": 3.1027667984189724e-05, + "loss": 0.3749, + "step": 497 + }, + { + "epoch": 1.3249001331557924, + "grad_norm": 0.27901948593654424, + "learning_rate": 3.0978260869565215e-05, + "loss": 0.3555, + "step": 498 + }, + { + "epoch": 1.3275632490013316, + "grad_norm": 0.3180943674654497, + "learning_rate": 3.092885375494071e-05, + "loss": 0.3399, + "step": 499 + }, + { + "epoch": 1.3302263648468708, + "grad_norm": 0.3257820898386027, + "learning_rate": 3.0879446640316203e-05, + "loss": 0.3592, + "step": 500 + }, + { + "epoch": 1.33288948069241, + "grad_norm": 0.29341640703427146, + "learning_rate": 3.08300395256917e-05, + "loss": 0.3602, + "step": 501 + }, + { + "epoch": 1.3355525965379493, + "grad_norm": 0.2975810782284494, + "learning_rate": 3.078063241106719e-05, + "loss": 0.3392, + "step": 502 + }, + { + "epoch": 1.3382157123834886, + "grad_norm": 0.26682712897635374, + "learning_rate": 3.073122529644269e-05, + "loss": 0.3539, + "step": 503 + }, + { + "epoch": 1.340878828229028, + "grad_norm": 0.29028707302441564, + "learning_rate": 3.068181818181818e-05, + "loss": 0.3511, + "step": 504 + }, + { + "epoch": 1.3435419440745673, + "grad_norm": 0.32760242848226895, + "learning_rate": 3.063241106719368e-05, + "loss": 0.3804, + "step": 505 + }, + { + "epoch": 1.3462050599201065, + "grad_norm": 0.3092786220233137, + "learning_rate": 3.058300395256917e-05, + "loss": 0.3699, + "step": 506 + }, + { + "epoch": 1.3488681757656458, + "grad_norm": 0.3020724813833627, + "learning_rate": 3.053359683794467e-05, + "loss": 0.3676, + "step": 507 + }, + { + "epoch": 1.351531291611185, + "grad_norm": 0.2824033966398368, + "learning_rate": 3.0484189723320162e-05, + "loss": 0.3729, + "step": 508 + }, + { + "epoch": 1.3541944074567245, + "grad_norm": 0.3618887388165828, + "learning_rate": 3.0434782608695656e-05, + "loss": 0.3554, + "step": 509 + }, + { + "epoch": 1.3568575233022637, + "grad_norm": 0.28130180514019887, + "learning_rate": 3.038537549407115e-05, + "loss": 0.3553, + "step": 510 + }, + { + "epoch": 1.359520639147803, + "grad_norm": 0.2893653104001468, + "learning_rate": 3.0335968379446645e-05, + "loss": 0.3782, + "step": 511 + }, + { + "epoch": 1.3621837549933422, + "grad_norm": 0.3469803538239057, + "learning_rate": 3.0286561264822133e-05, + "loss": 0.3464, + "step": 512 + }, + { + "epoch": 1.3648468708388815, + "grad_norm": 0.2732418490440155, + "learning_rate": 3.0237154150197627e-05, + "loss": 0.3616, + "step": 513 + }, + { + "epoch": 1.3675099866844207, + "grad_norm": 0.28562062527552706, + "learning_rate": 3.018774703557312e-05, + "loss": 0.3535, + "step": 514 + }, + { + "epoch": 1.37017310252996, + "grad_norm": 0.2658369004792245, + "learning_rate": 3.0138339920948616e-05, + "loss": 0.3725, + "step": 515 + }, + { + "epoch": 1.3728362183754994, + "grad_norm": 0.29358847654377684, + "learning_rate": 3.008893280632411e-05, + "loss": 0.3496, + "step": 516 + }, + { + "epoch": 1.3754993342210386, + "grad_norm": 0.27539943140564604, + "learning_rate": 3.0039525691699605e-05, + "loss": 0.369, + "step": 517 + }, + { + "epoch": 1.378162450066578, + "grad_norm": 0.300263236071914, + "learning_rate": 2.99901185770751e-05, + "loss": 0.3585, + "step": 518 + }, + { + "epoch": 1.3808255659121171, + "grad_norm": 0.31613231965587374, + "learning_rate": 2.9940711462450593e-05, + "loss": 0.3777, + "step": 519 + }, + { + "epoch": 1.3834886817576564, + "grad_norm": 0.2770700909868314, + "learning_rate": 2.9891304347826088e-05, + "loss": 0.3561, + "step": 520 + }, + { + "epoch": 1.3861517976031958, + "grad_norm": 0.3050401099786546, + "learning_rate": 2.9841897233201582e-05, + "loss": 0.3563, + "step": 521 + }, + { + "epoch": 1.388814913448735, + "grad_norm": 0.2533844111874208, + "learning_rate": 2.9792490118577076e-05, + "loss": 0.3469, + "step": 522 + }, + { + "epoch": 1.3914780292942743, + "grad_norm": 0.2695972396120006, + "learning_rate": 2.974308300395257e-05, + "loss": 0.3621, + "step": 523 + }, + { + "epoch": 1.3941411451398136, + "grad_norm": 0.28186697645815617, + "learning_rate": 2.9693675889328065e-05, + "loss": 0.3559, + "step": 524 + }, + { + "epoch": 1.3968042609853528, + "grad_norm": 0.26628352738719235, + "learning_rate": 2.964426877470356e-05, + "loss": 0.3646, + "step": 525 + }, + { + "epoch": 1.399467376830892, + "grad_norm": 0.2833122304678988, + "learning_rate": 2.9594861660079054e-05, + "loss": 0.3552, + "step": 526 + }, + { + "epoch": 1.4021304926764313, + "grad_norm": 0.26716813523678146, + "learning_rate": 2.954545454545455e-05, + "loss": 0.3345, + "step": 527 + }, + { + "epoch": 1.4047936085219708, + "grad_norm": 0.2754005215378796, + "learning_rate": 2.9496047430830043e-05, + "loss": 0.3531, + "step": 528 + }, + { + "epoch": 1.40745672436751, + "grad_norm": 0.3036387674463336, + "learning_rate": 2.9446640316205537e-05, + "loss": 0.3394, + "step": 529 + }, + { + "epoch": 1.4101198402130493, + "grad_norm": 0.28788105676480225, + "learning_rate": 2.939723320158103e-05, + "loss": 0.342, + "step": 530 + }, + { + "epoch": 1.4127829560585885, + "grad_norm": 0.28191999375557225, + "learning_rate": 2.9347826086956526e-05, + "loss": 0.3488, + "step": 531 + }, + { + "epoch": 1.4154460719041277, + "grad_norm": 0.2973599610924886, + "learning_rate": 2.9298418972332014e-05, + "loss": 0.369, + "step": 532 + }, + { + "epoch": 1.4181091877496672, + "grad_norm": 0.29639597168777376, + "learning_rate": 2.9249011857707508e-05, + "loss": 0.3696, + "step": 533 + }, + { + "epoch": 1.4207723035952065, + "grad_norm": 0.2943864772067253, + "learning_rate": 2.9199604743083002e-05, + "loss": 0.3708, + "step": 534 + }, + { + "epoch": 1.4234354194407457, + "grad_norm": 0.3275031870349291, + "learning_rate": 2.9150197628458497e-05, + "loss": 0.359, + "step": 535 + }, + { + "epoch": 1.426098535286285, + "grad_norm": 0.288973368099439, + "learning_rate": 2.910079051383399e-05, + "loss": 0.3534, + "step": 536 + }, + { + "epoch": 1.4287616511318242, + "grad_norm": 0.3066522465043432, + "learning_rate": 2.9051383399209485e-05, + "loss": 0.3568, + "step": 537 + }, + { + "epoch": 1.4314247669773636, + "grad_norm": 0.3056985012074139, + "learning_rate": 2.900197628458498e-05, + "loss": 0.3457, + "step": 538 + }, + { + "epoch": 1.4340878828229027, + "grad_norm": 0.2793941010759859, + "learning_rate": 2.8952569169960474e-05, + "loss": 0.3559, + "step": 539 + }, + { + "epoch": 1.4367509986684421, + "grad_norm": 0.2535278252678889, + "learning_rate": 2.890316205533597e-05, + "loss": 0.3528, + "step": 540 + }, + { + "epoch": 1.4394141145139814, + "grad_norm": 0.2842251418338047, + "learning_rate": 2.8853754940711463e-05, + "loss": 0.3522, + "step": 541 + }, + { + "epoch": 1.4420772303595206, + "grad_norm": 0.2778073412674222, + "learning_rate": 2.8804347826086957e-05, + "loss": 0.3603, + "step": 542 + }, + { + "epoch": 1.4447403462050599, + "grad_norm": 0.2554361454610928, + "learning_rate": 2.8754940711462452e-05, + "loss": 0.3635, + "step": 543 + }, + { + "epoch": 1.447403462050599, + "grad_norm": 0.3049003958057493, + "learning_rate": 2.8705533596837946e-05, + "loss": 0.3602, + "step": 544 + }, + { + "epoch": 1.4500665778961386, + "grad_norm": 0.2675057851041106, + "learning_rate": 2.865612648221344e-05, + "loss": 0.3612, + "step": 545 + }, + { + "epoch": 1.4527296937416778, + "grad_norm": 0.24887490119807607, + "learning_rate": 2.8606719367588935e-05, + "loss": 0.3654, + "step": 546 + }, + { + "epoch": 1.455392809587217, + "grad_norm": 0.3195728958038635, + "learning_rate": 2.855731225296443e-05, + "loss": 0.3513, + "step": 547 + }, + { + "epoch": 1.4580559254327563, + "grad_norm": 0.2546987092178984, + "learning_rate": 2.8507905138339924e-05, + "loss": 0.3398, + "step": 548 + }, + { + "epoch": 1.4607190412782955, + "grad_norm": 0.29773690473267483, + "learning_rate": 2.8458498023715418e-05, + "loss": 0.3694, + "step": 549 + }, + { + "epoch": 1.463382157123835, + "grad_norm": 0.29315481833169116, + "learning_rate": 2.8409090909090912e-05, + "loss": 0.3426, + "step": 550 + }, + { + "epoch": 1.466045272969374, + "grad_norm": 0.3296358712762741, + "learning_rate": 2.8359683794466403e-05, + "loss": 0.3761, + "step": 551 + }, + { + "epoch": 1.4687083888149135, + "grad_norm": 0.2989240945630588, + "learning_rate": 2.8310276679841894e-05, + "loss": 0.3574, + "step": 552 + }, + { + "epoch": 1.4713715046604527, + "grad_norm": 0.2933347023687216, + "learning_rate": 2.826086956521739e-05, + "loss": 0.3615, + "step": 553 + }, + { + "epoch": 1.474034620505992, + "grad_norm": 0.31885875118020457, + "learning_rate": 2.8211462450592883e-05, + "loss": 0.3645, + "step": 554 + }, + { + "epoch": 1.4766977363515312, + "grad_norm": 0.2777657172797497, + "learning_rate": 2.8162055335968378e-05, + "loss": 0.3531, + "step": 555 + }, + { + "epoch": 1.4793608521970705, + "grad_norm": 0.3318676753935055, + "learning_rate": 2.8112648221343872e-05, + "loss": 0.3668, + "step": 556 + }, + { + "epoch": 1.48202396804261, + "grad_norm": 0.3316376422278272, + "learning_rate": 2.8063241106719366e-05, + "loss": 0.348, + "step": 557 + }, + { + "epoch": 1.4846870838881492, + "grad_norm": 0.34334200086282374, + "learning_rate": 2.801383399209486e-05, + "loss": 0.3684, + "step": 558 + }, + { + "epoch": 1.4873501997336884, + "grad_norm": 0.2998752672686297, + "learning_rate": 2.7964426877470355e-05, + "loss": 0.343, + "step": 559 + }, + { + "epoch": 1.4900133155792277, + "grad_norm": 0.323718625297975, + "learning_rate": 2.791501976284585e-05, + "loss": 0.3435, + "step": 560 + }, + { + "epoch": 1.492676431424767, + "grad_norm": 0.3042077739086944, + "learning_rate": 2.7865612648221344e-05, + "loss": 0.357, + "step": 561 + }, + { + "epoch": 1.4953395472703064, + "grad_norm": 0.3132911982849499, + "learning_rate": 2.7816205533596838e-05, + "loss": 0.3481, + "step": 562 + }, + { + "epoch": 1.4980026631158454, + "grad_norm": 0.25389583970465485, + "learning_rate": 2.7766798418972333e-05, + "loss": 0.3567, + "step": 563 + }, + { + "epoch": 1.5006657789613849, + "grad_norm": 0.263337393271962, + "learning_rate": 2.7717391304347827e-05, + "loss": 0.3431, + "step": 564 + }, + { + "epoch": 1.503328894806924, + "grad_norm": 0.2712654205175259, + "learning_rate": 2.766798418972332e-05, + "loss": 0.3582, + "step": 565 + }, + { + "epoch": 1.5059920106524634, + "grad_norm": 0.2612896047069462, + "learning_rate": 2.7618577075098816e-05, + "loss": 0.3445, + "step": 566 + }, + { + "epoch": 1.5086551264980028, + "grad_norm": 0.27219615901029837, + "learning_rate": 2.756916996047431e-05, + "loss": 0.3652, + "step": 567 + }, + { + "epoch": 1.5113182423435418, + "grad_norm": 0.24840155978956244, + "learning_rate": 2.7519762845849805e-05, + "loss": 0.3421, + "step": 568 + }, + { + "epoch": 1.5139813581890813, + "grad_norm": 0.24176135920761713, + "learning_rate": 2.74703557312253e-05, + "loss": 0.3512, + "step": 569 + }, + { + "epoch": 1.5166444740346205, + "grad_norm": 0.2647051981979065, + "learning_rate": 2.7420948616600793e-05, + "loss": 0.3499, + "step": 570 + }, + { + "epoch": 1.5193075898801598, + "grad_norm": 0.27211007538489024, + "learning_rate": 2.7371541501976284e-05, + "loss": 0.3462, + "step": 571 + }, + { + "epoch": 1.521970705725699, + "grad_norm": 0.2507493740105373, + "learning_rate": 2.732213438735178e-05, + "loss": 0.3434, + "step": 572 + }, + { + "epoch": 1.5246338215712383, + "grad_norm": 0.2693556555763232, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.3615, + "step": 573 + }, + { + "epoch": 1.5272969374167777, + "grad_norm": 0.274645850715254, + "learning_rate": 2.7223320158102767e-05, + "loss": 0.3445, + "step": 574 + }, + { + "epoch": 1.5299600532623168, + "grad_norm": 0.24351837189102682, + "learning_rate": 2.7173913043478262e-05, + "loss": 0.3686, + "step": 575 + }, + { + "epoch": 1.5326231691078562, + "grad_norm": 0.27710340393878174, + "learning_rate": 2.7124505928853756e-05, + "loss": 0.3547, + "step": 576 + }, + { + "epoch": 1.5352862849533955, + "grad_norm": 0.2806488747523977, + "learning_rate": 2.707509881422925e-05, + "loss": 0.3672, + "step": 577 + }, + { + "epoch": 1.5379494007989347, + "grad_norm": 0.32294972985992815, + "learning_rate": 2.7025691699604745e-05, + "loss": 0.3527, + "step": 578 + }, + { + "epoch": 1.5406125166444742, + "grad_norm": 0.24771959309258884, + "learning_rate": 2.697628458498024e-05, + "loss": 0.3626, + "step": 579 + }, + { + "epoch": 1.5432756324900132, + "grad_norm": 0.31974111618484613, + "learning_rate": 2.6926877470355734e-05, + "loss": 0.3553, + "step": 580 + }, + { + "epoch": 1.5459387483355527, + "grad_norm": 0.28071413168163195, + "learning_rate": 2.6877470355731228e-05, + "loss": 0.3676, + "step": 581 + }, + { + "epoch": 1.548601864181092, + "grad_norm": 0.2584928716043461, + "learning_rate": 2.6828063241106723e-05, + "loss": 0.3427, + "step": 582 + }, + { + "epoch": 1.5512649800266312, + "grad_norm": 0.2648608207536266, + "learning_rate": 2.6778656126482217e-05, + "loss": 0.3377, + "step": 583 + }, + { + "epoch": 1.5539280958721704, + "grad_norm": 0.2671119266891378, + "learning_rate": 2.672924901185771e-05, + "loss": 0.3559, + "step": 584 + }, + { + "epoch": 1.5565912117177096, + "grad_norm": 0.2840788018392293, + "learning_rate": 2.6679841897233206e-05, + "loss": 0.355, + "step": 585 + }, + { + "epoch": 1.559254327563249, + "grad_norm": 0.29216560920303836, + "learning_rate": 2.66304347826087e-05, + "loss": 0.3625, + "step": 586 + }, + { + "epoch": 1.5619174434087881, + "grad_norm": 0.2782406231477868, + "learning_rate": 2.6581027667984194e-05, + "loss": 0.3544, + "step": 587 + }, + { + "epoch": 1.5645805592543276, + "grad_norm": 0.27482653297611137, + "learning_rate": 2.653162055335969e-05, + "loss": 0.3505, + "step": 588 + }, + { + "epoch": 1.5672436750998668, + "grad_norm": 0.2737639812672786, + "learning_rate": 2.6482213438735183e-05, + "loss": 0.3339, + "step": 589 + }, + { + "epoch": 1.569906790945406, + "grad_norm": 0.30172379604459587, + "learning_rate": 2.643280632411067e-05, + "loss": 0.3574, + "step": 590 + }, + { + "epoch": 1.5725699067909455, + "grad_norm": 0.30937296239336515, + "learning_rate": 2.6383399209486165e-05, + "loss": 0.3552, + "step": 591 + }, + { + "epoch": 1.5752330226364846, + "grad_norm": 0.30263893603202113, + "learning_rate": 2.633399209486166e-05, + "loss": 0.3806, + "step": 592 + }, + { + "epoch": 1.577896138482024, + "grad_norm": 0.36351951882340405, + "learning_rate": 2.6284584980237154e-05, + "loss": 0.3483, + "step": 593 + }, + { + "epoch": 1.5805592543275633, + "grad_norm": 0.27596120256597706, + "learning_rate": 2.623517786561265e-05, + "loss": 0.3785, + "step": 594 + }, + { + "epoch": 1.5832223701731025, + "grad_norm": 0.30086295136857, + "learning_rate": 2.6185770750988143e-05, + "loss": 0.3536, + "step": 595 + }, + { + "epoch": 1.5858854860186418, + "grad_norm": 0.3786534775512319, + "learning_rate": 2.6136363636363637e-05, + "loss": 0.3577, + "step": 596 + }, + { + "epoch": 1.588548601864181, + "grad_norm": 0.294153803281236, + "learning_rate": 2.608695652173913e-05, + "loss": 0.3603, + "step": 597 + }, + { + "epoch": 1.5912117177097205, + "grad_norm": 0.316506621080003, + "learning_rate": 2.6037549407114626e-05, + "loss": 0.3763, + "step": 598 + }, + { + "epoch": 1.5938748335552595, + "grad_norm": 0.31539133712695033, + "learning_rate": 2.598814229249012e-05, + "loss": 0.3373, + "step": 599 + }, + { + "epoch": 1.596537949400799, + "grad_norm": 0.29787884422276756, + "learning_rate": 2.5938735177865615e-05, + "loss": 0.3461, + "step": 600 + }, + { + "epoch": 1.5992010652463382, + "grad_norm": 0.2794574362382508, + "learning_rate": 2.588932806324111e-05, + "loss": 0.3607, + "step": 601 + }, + { + "epoch": 1.6018641810918774, + "grad_norm": 0.28198668683252337, + "learning_rate": 2.5839920948616603e-05, + "loss": 0.3698, + "step": 602 + }, + { + "epoch": 1.604527296937417, + "grad_norm": 0.2767707782956735, + "learning_rate": 2.5790513833992098e-05, + "loss": 0.3358, + "step": 603 + }, + { + "epoch": 1.607190412782956, + "grad_norm": 0.26770289783678053, + "learning_rate": 2.5741106719367592e-05, + "loss": 0.3376, + "step": 604 + }, + { + "epoch": 1.6098535286284954, + "grad_norm": 0.3244106061056206, + "learning_rate": 2.5691699604743087e-05, + "loss": 0.3515, + "step": 605 + }, + { + "epoch": 1.6125166444740346, + "grad_norm": 0.29260066196150414, + "learning_rate": 2.564229249011858e-05, + "loss": 0.3712, + "step": 606 + }, + { + "epoch": 1.6151797603195739, + "grad_norm": 0.39595763824507085, + "learning_rate": 2.5592885375494075e-05, + "loss": 0.3402, + "step": 607 + }, + { + "epoch": 1.6178428761651131, + "grad_norm": 0.2911698047056363, + "learning_rate": 2.554347826086957e-05, + "loss": 0.3579, + "step": 608 + }, + { + "epoch": 1.6205059920106524, + "grad_norm": 0.30667505894069086, + "learning_rate": 2.5494071146245064e-05, + "loss": 0.3488, + "step": 609 + }, + { + "epoch": 1.6231691078561918, + "grad_norm": 0.3377626596928706, + "learning_rate": 2.5444664031620552e-05, + "loss": 0.3455, + "step": 610 + }, + { + "epoch": 1.6258322237017309, + "grad_norm": 0.3019507720671119, + "learning_rate": 2.5395256916996046e-05, + "loss": 0.352, + "step": 611 + }, + { + "epoch": 1.6284953395472703, + "grad_norm": 0.2835949922829532, + "learning_rate": 2.534584980237154e-05, + "loss": 0.3602, + "step": 612 + }, + { + "epoch": 1.6311584553928096, + "grad_norm": 0.32444980944074003, + "learning_rate": 2.5296442687747035e-05, + "loss": 0.3626, + "step": 613 + }, + { + "epoch": 1.6338215712383488, + "grad_norm": 0.30852262333031255, + "learning_rate": 2.524703557312253e-05, + "loss": 0.3415, + "step": 614 + }, + { + "epoch": 1.6364846870838883, + "grad_norm": 0.2769395153617194, + "learning_rate": 2.5197628458498024e-05, + "loss": 0.36, + "step": 615 + }, + { + "epoch": 1.6391478029294273, + "grad_norm": 0.3225695333542542, + "learning_rate": 2.5148221343873518e-05, + "loss": 0.3504, + "step": 616 + }, + { + "epoch": 1.6418109187749668, + "grad_norm": 0.26000908179747434, + "learning_rate": 2.5098814229249012e-05, + "loss": 0.3511, + "step": 617 + }, + { + "epoch": 1.644474034620506, + "grad_norm": 0.2558998742720099, + "learning_rate": 2.5049407114624507e-05, + "loss": 0.3551, + "step": 618 + }, + { + "epoch": 1.6471371504660453, + "grad_norm": 0.2810631366750719, + "learning_rate": 2.5e-05, + "loss": 0.359, + "step": 619 + }, + { + "epoch": 1.6498002663115847, + "grad_norm": 0.2764036943026752, + "learning_rate": 2.4950592885375496e-05, + "loss": 0.3552, + "step": 620 + }, + { + "epoch": 1.6524633821571237, + "grad_norm": 0.29157627798525887, + "learning_rate": 2.490118577075099e-05, + "loss": 0.3477, + "step": 621 + }, + { + "epoch": 1.6551264980026632, + "grad_norm": 0.30005399168360375, + "learning_rate": 2.4851778656126484e-05, + "loss": 0.3635, + "step": 622 + }, + { + "epoch": 1.6577896138482024, + "grad_norm": 0.28682265413573244, + "learning_rate": 2.480237154150198e-05, + "loss": 0.3472, + "step": 623 + }, + { + "epoch": 1.6604527296937417, + "grad_norm": 0.30810891527099654, + "learning_rate": 2.475296442687747e-05, + "loss": 0.3453, + "step": 624 + }, + { + "epoch": 1.663115845539281, + "grad_norm": 0.2894658697891752, + "learning_rate": 2.4703557312252964e-05, + "loss": 0.348, + "step": 625 + }, + { + "epoch": 1.6657789613848202, + "grad_norm": 0.26056026406293753, + "learning_rate": 2.465415019762846e-05, + "loss": 0.3422, + "step": 626 + }, + { + "epoch": 1.6684420772303596, + "grad_norm": 0.27955802745377495, + "learning_rate": 2.4604743083003953e-05, + "loss": 0.351, + "step": 627 + }, + { + "epoch": 1.6711051930758987, + "grad_norm": 0.2589447838000819, + "learning_rate": 2.4555335968379447e-05, + "loss": 0.3606, + "step": 628 + }, + { + "epoch": 1.6737683089214381, + "grad_norm": 0.2726720946381243, + "learning_rate": 2.450592885375494e-05, + "loss": 0.3553, + "step": 629 + }, + { + "epoch": 1.6764314247669774, + "grad_norm": 0.29585982981776077, + "learning_rate": 2.4456521739130436e-05, + "loss": 0.3429, + "step": 630 + }, + { + "epoch": 1.6790945406125166, + "grad_norm": 0.25866785993085295, + "learning_rate": 2.440711462450593e-05, + "loss": 0.3464, + "step": 631 + }, + { + "epoch": 1.681757656458056, + "grad_norm": 0.26186173743371105, + "learning_rate": 2.4357707509881425e-05, + "loss": 0.3624, + "step": 632 + }, + { + "epoch": 1.684420772303595, + "grad_norm": 0.27529386090536323, + "learning_rate": 2.430830039525692e-05, + "loss": 0.3464, + "step": 633 + }, + { + "epoch": 1.6870838881491346, + "grad_norm": 0.24305368943964414, + "learning_rate": 2.425889328063241e-05, + "loss": 0.3542, + "step": 634 + }, + { + "epoch": 1.6897470039946738, + "grad_norm": 0.263035963649886, + "learning_rate": 2.4209486166007905e-05, + "loss": 0.3638, + "step": 635 + }, + { + "epoch": 1.692410119840213, + "grad_norm": 0.2737080512587832, + "learning_rate": 2.41600790513834e-05, + "loss": 0.3368, + "step": 636 + }, + { + "epoch": 1.6950732356857523, + "grad_norm": 0.33404220986339256, + "learning_rate": 2.4110671936758893e-05, + "loss": 0.3724, + "step": 637 + }, + { + "epoch": 1.6977363515312915, + "grad_norm": 0.2897416261690682, + "learning_rate": 2.4061264822134388e-05, + "loss": 0.3593, + "step": 638 + }, + { + "epoch": 1.700399467376831, + "grad_norm": 0.3041816217006561, + "learning_rate": 2.4011857707509882e-05, + "loss": 0.3513, + "step": 639 + }, + { + "epoch": 1.70306258322237, + "grad_norm": 0.2677006117678147, + "learning_rate": 2.3962450592885376e-05, + "loss": 0.3594, + "step": 640 + }, + { + "epoch": 1.7057256990679095, + "grad_norm": 0.2783081801536929, + "learning_rate": 2.391304347826087e-05, + "loss": 0.3497, + "step": 641 + }, + { + "epoch": 1.7083888149134487, + "grad_norm": 0.2949970037820572, + "learning_rate": 2.3863636363636365e-05, + "loss": 0.3527, + "step": 642 + }, + { + "epoch": 1.711051930758988, + "grad_norm": 0.29435826287206446, + "learning_rate": 2.381422924901186e-05, + "loss": 0.3476, + "step": 643 + }, + { + "epoch": 1.7137150466045274, + "grad_norm": 0.22820704347237256, + "learning_rate": 2.376482213438735e-05, + "loss": 0.3563, + "step": 644 + }, + { + "epoch": 1.7163781624500665, + "grad_norm": 0.2662369562790593, + "learning_rate": 2.3715415019762845e-05, + "loss": 0.3564, + "step": 645 + }, + { + "epoch": 1.719041278295606, + "grad_norm": 0.2660848595820705, + "learning_rate": 2.366600790513834e-05, + "loss": 0.3507, + "step": 646 + }, + { + "epoch": 1.7217043941411452, + "grad_norm": 0.2736362440179924, + "learning_rate": 2.3616600790513834e-05, + "loss": 0.3583, + "step": 647 + }, + { + "epoch": 1.7243675099866844, + "grad_norm": 0.2877841104207108, + "learning_rate": 2.3567193675889328e-05, + "loss": 0.3543, + "step": 648 + }, + { + "epoch": 1.7270306258322237, + "grad_norm": 0.26935615929008033, + "learning_rate": 2.3517786561264823e-05, + "loss": 0.3437, + "step": 649 + }, + { + "epoch": 1.729693741677763, + "grad_norm": 0.2578776022705283, + "learning_rate": 2.3468379446640317e-05, + "loss": 0.3665, + "step": 650 + }, + { + "epoch": 1.7323568575233024, + "grad_norm": 0.28540169794092723, + "learning_rate": 2.341897233201581e-05, + "loss": 0.3427, + "step": 651 + }, + { + "epoch": 1.7350199733688414, + "grad_norm": 0.302406678764912, + "learning_rate": 2.3369565217391306e-05, + "loss": 0.3493, + "step": 652 + }, + { + "epoch": 1.7376830892143809, + "grad_norm": 0.2613558705976954, + "learning_rate": 2.33201581027668e-05, + "loss": 0.3384, + "step": 653 + }, + { + "epoch": 1.74034620505992, + "grad_norm": 0.31445958338443253, + "learning_rate": 2.327075098814229e-05, + "loss": 0.3563, + "step": 654 + }, + { + "epoch": 1.7430093209054593, + "grad_norm": 0.26295035895535324, + "learning_rate": 2.3221343873517785e-05, + "loss": 0.3523, + "step": 655 + }, + { + "epoch": 1.7456724367509988, + "grad_norm": 0.26455791446031185, + "learning_rate": 2.317193675889328e-05, + "loss": 0.347, + "step": 656 + }, + { + "epoch": 1.7483355525965378, + "grad_norm": 0.267920904226216, + "learning_rate": 2.3122529644268774e-05, + "loss": 0.3757, + "step": 657 + }, + { + "epoch": 1.7509986684420773, + "grad_norm": 0.29766057642277893, + "learning_rate": 2.307312252964427e-05, + "loss": 0.3388, + "step": 658 + }, + { + "epoch": 1.7536617842876165, + "grad_norm": 0.2614333124037635, + "learning_rate": 2.3023715415019763e-05, + "loss": 0.3448, + "step": 659 + }, + { + "epoch": 1.7563249001331558, + "grad_norm": 0.2460873862604595, + "learning_rate": 2.2974308300395257e-05, + "loss": 0.3701, + "step": 660 + }, + { + "epoch": 1.758988015978695, + "grad_norm": 0.32415471595000084, + "learning_rate": 2.2924901185770752e-05, + "loss": 0.3502, + "step": 661 + }, + { + "epoch": 1.7616511318242343, + "grad_norm": 0.28861202445680917, + "learning_rate": 2.2875494071146246e-05, + "loss": 0.3419, + "step": 662 + }, + { + "epoch": 1.7643142476697737, + "grad_norm": 0.33178480237112284, + "learning_rate": 2.282608695652174e-05, + "loss": 0.364, + "step": 663 + }, + { + "epoch": 1.7669773635153128, + "grad_norm": 0.28362428197182826, + "learning_rate": 2.2776679841897235e-05, + "loss": 0.3447, + "step": 664 + }, + { + "epoch": 1.7696404793608522, + "grad_norm": 0.2593493932357841, + "learning_rate": 2.272727272727273e-05, + "loss": 0.3566, + "step": 665 + }, + { + "epoch": 1.7723035952063915, + "grad_norm": 0.32399886004151673, + "learning_rate": 2.267786561264822e-05, + "loss": 0.352, + "step": 666 + }, + { + "epoch": 1.7749667110519307, + "grad_norm": 0.2898594306022826, + "learning_rate": 2.2628458498023715e-05, + "loss": 0.3552, + "step": 667 + }, + { + "epoch": 1.7776298268974702, + "grad_norm": 0.30141440115798507, + "learning_rate": 2.257905138339921e-05, + "loss": 0.3394, + "step": 668 + }, + { + "epoch": 1.7802929427430092, + "grad_norm": 0.2748566768296462, + "learning_rate": 2.2529644268774703e-05, + "loss": 0.3639, + "step": 669 + }, + { + "epoch": 1.7829560585885487, + "grad_norm": 0.2597063738725183, + "learning_rate": 2.2480237154150198e-05, + "loss": 0.3523, + "step": 670 + }, + { + "epoch": 1.785619174434088, + "grad_norm": 0.27428899527158185, + "learning_rate": 2.2430830039525692e-05, + "loss": 0.3576, + "step": 671 + }, + { + "epoch": 1.7882822902796272, + "grad_norm": 0.27821642567843663, + "learning_rate": 2.2381422924901187e-05, + "loss": 0.3431, + "step": 672 + }, + { + "epoch": 1.7909454061251664, + "grad_norm": 0.3009289717068197, + "learning_rate": 2.233201581027668e-05, + "loss": 0.3506, + "step": 673 + }, + { + "epoch": 1.7936085219707056, + "grad_norm": 0.27901500754869907, + "learning_rate": 2.2282608695652175e-05, + "loss": 0.3413, + "step": 674 + }, + { + "epoch": 1.796271637816245, + "grad_norm": 0.26359419972730574, + "learning_rate": 2.223320158102767e-05, + "loss": 0.3574, + "step": 675 + }, + { + "epoch": 1.7989347536617841, + "grad_norm": 0.301875250326235, + "learning_rate": 2.2183794466403164e-05, + "loss": 0.3586, + "step": 676 + }, + { + "epoch": 1.8015978695073236, + "grad_norm": 0.293396805853932, + "learning_rate": 2.213438735177866e-05, + "loss": 0.3631, + "step": 677 + }, + { + "epoch": 1.8042609853528628, + "grad_norm": 0.2627077951859255, + "learning_rate": 2.2084980237154153e-05, + "loss": 0.3421, + "step": 678 + }, + { + "epoch": 1.806924101198402, + "grad_norm": 0.2910041424241653, + "learning_rate": 2.2035573122529647e-05, + "loss": 0.3508, + "step": 679 + }, + { + "epoch": 1.8095872170439415, + "grad_norm": 0.2700422024120216, + "learning_rate": 2.198616600790514e-05, + "loss": 0.3656, + "step": 680 + }, + { + "epoch": 1.8122503328894806, + "grad_norm": 0.261122870241434, + "learning_rate": 2.1936758893280636e-05, + "loss": 0.3727, + "step": 681 + }, + { + "epoch": 1.81491344873502, + "grad_norm": 0.2759182990026985, + "learning_rate": 2.188735177865613e-05, + "loss": 0.3429, + "step": 682 + }, + { + "epoch": 1.8175765645805593, + "grad_norm": 0.25688731642570295, + "learning_rate": 2.183794466403162e-05, + "loss": 0.3638, + "step": 683 + }, + { + "epoch": 1.8202396804260985, + "grad_norm": 0.2583299882188377, + "learning_rate": 2.1788537549407116e-05, + "loss": 0.3627, + "step": 684 + }, + { + "epoch": 1.822902796271638, + "grad_norm": 0.24824630818405677, + "learning_rate": 2.173913043478261e-05, + "loss": 0.3509, + "step": 685 + }, + { + "epoch": 1.825565912117177, + "grad_norm": 0.2775222142294749, + "learning_rate": 2.1689723320158105e-05, + "loss": 0.3421, + "step": 686 + }, + { + "epoch": 1.8282290279627165, + "grad_norm": 0.23869310034905467, + "learning_rate": 2.16403162055336e-05, + "loss": 0.3376, + "step": 687 + }, + { + "epoch": 1.8308921438082557, + "grad_norm": 0.2933357911415976, + "learning_rate": 2.1590909090909093e-05, + "loss": 0.3521, + "step": 688 + }, + { + "epoch": 1.833555259653795, + "grad_norm": 0.27832210393035933, + "learning_rate": 2.1541501976284588e-05, + "loss": 0.3553, + "step": 689 + }, + { + "epoch": 1.8362183754993342, + "grad_norm": 0.3087436970907245, + "learning_rate": 2.1492094861660082e-05, + "loss": 0.347, + "step": 690 + }, + { + "epoch": 1.8388814913448734, + "grad_norm": 0.2943513499295711, + "learning_rate": 2.1442687747035576e-05, + "loss": 0.3536, + "step": 691 + }, + { + "epoch": 1.841544607190413, + "grad_norm": 0.26722654225950093, + "learning_rate": 2.1393280632411067e-05, + "loss": 0.3624, + "step": 692 + }, + { + "epoch": 1.844207723035952, + "grad_norm": 0.2686739391641238, + "learning_rate": 2.1343873517786562e-05, + "loss": 0.3551, + "step": 693 + }, + { + "epoch": 1.8468708388814914, + "grad_norm": 0.3317404535951985, + "learning_rate": 2.1294466403162056e-05, + "loss": 0.3519, + "step": 694 + }, + { + "epoch": 1.8495339547270306, + "grad_norm": 0.25888461414583197, + "learning_rate": 2.124505928853755e-05, + "loss": 0.3621, + "step": 695 + }, + { + "epoch": 1.8521970705725699, + "grad_norm": 0.2388947383775022, + "learning_rate": 2.1195652173913045e-05, + "loss": 0.3464, + "step": 696 + }, + { + "epoch": 1.8548601864181093, + "grad_norm": 0.32253652339123096, + "learning_rate": 2.114624505928854e-05, + "loss": 0.3486, + "step": 697 + }, + { + "epoch": 1.8575233022636484, + "grad_norm": 0.23971764237483872, + "learning_rate": 2.1096837944664034e-05, + "loss": 0.3469, + "step": 698 + }, + { + "epoch": 1.8601864181091878, + "grad_norm": 0.2822968430519757, + "learning_rate": 2.1047430830039528e-05, + "loss": 0.3464, + "step": 699 + }, + { + "epoch": 1.862849533954727, + "grad_norm": 0.28707092445711563, + "learning_rate": 2.0998023715415023e-05, + "loss": 0.3454, + "step": 700 + }, + { + "epoch": 1.8655126498002663, + "grad_norm": 0.26633357589223594, + "learning_rate": 2.0948616600790517e-05, + "loss": 0.3528, + "step": 701 + }, + { + "epoch": 1.8681757656458056, + "grad_norm": 0.30480677025070735, + "learning_rate": 2.0899209486166008e-05, + "loss": 0.3705, + "step": 702 + }, + { + "epoch": 1.8708388814913448, + "grad_norm": 0.2589295473498244, + "learning_rate": 2.0849802371541502e-05, + "loss": 0.366, + "step": 703 + }, + { + "epoch": 1.8735019973368843, + "grad_norm": 0.3615686651832072, + "learning_rate": 2.0800395256916997e-05, + "loss": 0.3545, + "step": 704 + }, + { + "epoch": 1.8761651131824233, + "grad_norm": 0.2643316410023579, + "learning_rate": 2.075098814229249e-05, + "loss": 0.3478, + "step": 705 + }, + { + "epoch": 1.8788282290279628, + "grad_norm": 0.3002604064308654, + "learning_rate": 2.0701581027667985e-05, + "loss": 0.3691, + "step": 706 + }, + { + "epoch": 1.881491344873502, + "grad_norm": 0.2842611156357375, + "learning_rate": 2.065217391304348e-05, + "loss": 0.361, + "step": 707 + }, + { + "epoch": 1.8841544607190412, + "grad_norm": 0.3130168183378823, + "learning_rate": 2.0602766798418974e-05, + "loss": 0.3536, + "step": 708 + }, + { + "epoch": 1.8868175765645807, + "grad_norm": 0.3519161067004107, + "learning_rate": 2.055335968379447e-05, + "loss": 0.3557, + "step": 709 + }, + { + "epoch": 1.8894806924101197, + "grad_norm": 0.27233651062760655, + "learning_rate": 2.0503952569169963e-05, + "loss": 0.3594, + "step": 710 + }, + { + "epoch": 1.8921438082556592, + "grad_norm": 0.31833253788492577, + "learning_rate": 2.0454545454545457e-05, + "loss": 0.3489, + "step": 711 + }, + { + "epoch": 1.8948069241011984, + "grad_norm": 0.24567699858003664, + "learning_rate": 2.040513833992095e-05, + "loss": 0.3367, + "step": 712 + }, + { + "epoch": 1.8974700399467377, + "grad_norm": 0.2969050880879015, + "learning_rate": 2.0355731225296443e-05, + "loss": 0.3537, + "step": 713 + }, + { + "epoch": 1.900133155792277, + "grad_norm": 0.3189993081371087, + "learning_rate": 2.0306324110671937e-05, + "loss": 0.3669, + "step": 714 + }, + { + "epoch": 1.9027962716378162, + "grad_norm": 0.24524923802003742, + "learning_rate": 2.025691699604743e-05, + "loss": 0.3448, + "step": 715 + }, + { + "epoch": 1.9054593874833556, + "grad_norm": 0.3002012848114626, + "learning_rate": 2.0207509881422926e-05, + "loss": 0.3592, + "step": 716 + }, + { + "epoch": 1.9081225033288947, + "grad_norm": 0.2577221774068482, + "learning_rate": 2.015810276679842e-05, + "loss": 0.3615, + "step": 717 + }, + { + "epoch": 1.9107856191744341, + "grad_norm": 0.2662922499052391, + "learning_rate": 2.0108695652173915e-05, + "loss": 0.3564, + "step": 718 + }, + { + "epoch": 1.9134487350199734, + "grad_norm": 0.2748543453818437, + "learning_rate": 2.005928853754941e-05, + "loss": 0.3367, + "step": 719 + }, + { + "epoch": 1.9161118508655126, + "grad_norm": 0.29453902437825724, + "learning_rate": 2.0009881422924903e-05, + "loss": 0.3346, + "step": 720 + }, + { + "epoch": 1.918774966711052, + "grad_norm": 0.2958384946201868, + "learning_rate": 1.9960474308300398e-05, + "loss": 0.3653, + "step": 721 + }, + { + "epoch": 1.921438082556591, + "grad_norm": 0.3110870857995837, + "learning_rate": 1.991106719367589e-05, + "loss": 0.3626, + "step": 722 + }, + { + "epoch": 1.9241011984021306, + "grad_norm": 0.29754006004298117, + "learning_rate": 1.9861660079051383e-05, + "loss": 0.3595, + "step": 723 + }, + { + "epoch": 1.9267643142476698, + "grad_norm": 0.2637206512469971, + "learning_rate": 1.9812252964426878e-05, + "loss": 0.3637, + "step": 724 + }, + { + "epoch": 1.929427430093209, + "grad_norm": 0.28572071909963137, + "learning_rate": 1.9762845849802372e-05, + "loss": 0.351, + "step": 725 + }, + { + "epoch": 1.9320905459387483, + "grad_norm": 0.26449910347561634, + "learning_rate": 1.9713438735177866e-05, + "loss": 0.3607, + "step": 726 + }, + { + "epoch": 1.9347536617842875, + "grad_norm": 0.312752897256756, + "learning_rate": 1.966403162055336e-05, + "loss": 0.3591, + "step": 727 + }, + { + "epoch": 1.937416777629827, + "grad_norm": 0.2592410502272739, + "learning_rate": 1.9614624505928855e-05, + "loss": 0.3439, + "step": 728 + }, + { + "epoch": 1.940079893475366, + "grad_norm": 0.24250837194662156, + "learning_rate": 1.956521739130435e-05, + "loss": 0.3322, + "step": 729 + }, + { + "epoch": 1.9427430093209055, + "grad_norm": 0.27100632690728255, + "learning_rate": 1.9515810276679844e-05, + "loss": 0.3478, + "step": 730 + }, + { + "epoch": 1.9454061251664447, + "grad_norm": 0.2792664428193274, + "learning_rate": 1.9466403162055335e-05, + "loss": 0.3667, + "step": 731 + }, + { + "epoch": 1.948069241011984, + "grad_norm": 0.2619688688672022, + "learning_rate": 1.941699604743083e-05, + "loss": 0.3533, + "step": 732 + }, + { + "epoch": 1.9507323568575234, + "grad_norm": 0.250474396728028, + "learning_rate": 1.9367588932806324e-05, + "loss": 0.3615, + "step": 733 + }, + { + "epoch": 1.9533954727030625, + "grad_norm": 0.2592917559527508, + "learning_rate": 1.9318181818181818e-05, + "loss": 0.35, + "step": 734 + }, + { + "epoch": 1.956058588548602, + "grad_norm": 0.28358412495828245, + "learning_rate": 1.9268774703557312e-05, + "loss": 0.3438, + "step": 735 + }, + { + "epoch": 1.9587217043941412, + "grad_norm": 0.2905168266596484, + "learning_rate": 1.9219367588932807e-05, + "loss": 0.3363, + "step": 736 + }, + { + "epoch": 1.9613848202396804, + "grad_norm": 0.2558334592646534, + "learning_rate": 1.91699604743083e-05, + "loss": 0.3636, + "step": 737 + }, + { + "epoch": 1.9640479360852197, + "grad_norm": 0.2856486905717076, + "learning_rate": 1.9120553359683796e-05, + "loss": 0.3423, + "step": 738 + }, + { + "epoch": 1.966711051930759, + "grad_norm": 0.25338680291782845, + "learning_rate": 1.907114624505929e-05, + "loss": 0.3647, + "step": 739 + }, + { + "epoch": 1.9693741677762984, + "grad_norm": 0.25927241893410596, + "learning_rate": 1.9021739130434784e-05, + "loss": 0.361, + "step": 740 + }, + { + "epoch": 1.9720372836218374, + "grad_norm": 0.26559107296256046, + "learning_rate": 1.8972332015810275e-05, + "loss": 0.3532, + "step": 741 + }, + { + "epoch": 1.9747003994673769, + "grad_norm": 0.23909262831928838, + "learning_rate": 1.892292490118577e-05, + "loss": 0.3458, + "step": 742 + }, + { + "epoch": 1.977363515312916, + "grad_norm": 0.29570607043062813, + "learning_rate": 1.8873517786561264e-05, + "loss": 0.3651, + "step": 743 + }, + { + "epoch": 1.9800266311584553, + "grad_norm": 0.26837566907079335, + "learning_rate": 1.882411067193676e-05, + "loss": 0.3624, + "step": 744 + }, + { + "epoch": 1.9826897470039948, + "grad_norm": 0.24855234703810405, + "learning_rate": 1.8774703557312253e-05, + "loss": 0.3458, + "step": 745 + }, + { + "epoch": 1.9853528628495338, + "grad_norm": 0.2581276414313357, + "learning_rate": 1.8725296442687747e-05, + "loss": 0.3532, + "step": 746 + }, + { + "epoch": 1.9880159786950733, + "grad_norm": 0.2769192507293847, + "learning_rate": 1.867588932806324e-05, + "loss": 0.3662, + "step": 747 + }, + { + "epoch": 1.9906790945406125, + "grad_norm": 0.24782306003081656, + "learning_rate": 1.8626482213438736e-05, + "loss": 0.3444, + "step": 748 + }, + { + "epoch": 1.9933422103861518, + "grad_norm": 0.23338769959338118, + "learning_rate": 1.857707509881423e-05, + "loss": 0.3375, + "step": 749 + }, + { + "epoch": 1.996005326231691, + "grad_norm": 0.2399452380668713, + "learning_rate": 1.8527667984189725e-05, + "loss": 0.3577, + "step": 750 + }, + { + "epoch": 1.9986684420772303, + "grad_norm": 0.24061002934920092, + "learning_rate": 1.8478260869565216e-05, + "loss": 0.3558, + "step": 751 + }, + { + "epoch": 2.0, + "grad_norm": 0.3876397436943037, + "learning_rate": 1.842885375494071e-05, + "loss": 0.3232, + "step": 752 + }, + { + "epoch": 2.0026631158455395, + "grad_norm": 0.3255318592205839, + "learning_rate": 1.8379446640316205e-05, + "loss": 0.2829, + "step": 753 + }, + { + "epoch": 2.0053262316910785, + "grad_norm": 0.2688339427044817, + "learning_rate": 1.83300395256917e-05, + "loss": 0.2808, + "step": 754 + }, + { + "epoch": 2.007989347536618, + "grad_norm": 0.31006819974729777, + "learning_rate": 1.8280632411067193e-05, + "loss": 0.2619, + "step": 755 + }, + { + "epoch": 2.010652463382157, + "grad_norm": 0.3391232912122683, + "learning_rate": 1.8231225296442688e-05, + "loss": 0.2797, + "step": 756 + }, + { + "epoch": 2.0133155792276964, + "grad_norm": 0.22961985808221483, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.2716, + "step": 757 + }, + { + "epoch": 2.015978695073236, + "grad_norm": 0.3029488541333639, + "learning_rate": 1.8132411067193676e-05, + "loss": 0.2748, + "step": 758 + }, + { + "epoch": 2.018641810918775, + "grad_norm": 0.3272089229771229, + "learning_rate": 1.808300395256917e-05, + "loss": 0.259, + "step": 759 + }, + { + "epoch": 2.0213049267643144, + "grad_norm": 0.2632568547847837, + "learning_rate": 1.8033596837944665e-05, + "loss": 0.2765, + "step": 760 + }, + { + "epoch": 2.0239680426098534, + "grad_norm": 0.278440470950714, + "learning_rate": 1.7984189723320156e-05, + "loss": 0.2638, + "step": 761 + }, + { + "epoch": 2.026631158455393, + "grad_norm": 0.3139907981507755, + "learning_rate": 1.793478260869565e-05, + "loss": 0.2805, + "step": 762 + }, + { + "epoch": 2.029294274300932, + "grad_norm": 0.26955412514066035, + "learning_rate": 1.7885375494071145e-05, + "loss": 0.2617, + "step": 763 + }, + { + "epoch": 2.0319573901464714, + "grad_norm": 0.2583856619944918, + "learning_rate": 1.783596837944664e-05, + "loss": 0.2678, + "step": 764 + }, + { + "epoch": 2.034620505992011, + "grad_norm": 0.27298004272506543, + "learning_rate": 1.7786561264822134e-05, + "loss": 0.2674, + "step": 765 + }, + { + "epoch": 2.03728362183755, + "grad_norm": 0.272776301937256, + "learning_rate": 1.7737154150197628e-05, + "loss": 0.2783, + "step": 766 + }, + { + "epoch": 2.0399467376830893, + "grad_norm": 0.23604664211204196, + "learning_rate": 1.7687747035573123e-05, + "loss": 0.2694, + "step": 767 + }, + { + "epoch": 2.0426098535286283, + "grad_norm": 0.2705685089413051, + "learning_rate": 1.7638339920948617e-05, + "loss": 0.2835, + "step": 768 + }, + { + "epoch": 2.045272969374168, + "grad_norm": 0.2348856411632335, + "learning_rate": 1.758893280632411e-05, + "loss": 0.2591, + "step": 769 + }, + { + "epoch": 2.0479360852197073, + "grad_norm": 0.24862768901035942, + "learning_rate": 1.7539525691699606e-05, + "loss": 0.2641, + "step": 770 + }, + { + "epoch": 2.0505992010652463, + "grad_norm": 0.25511185080416404, + "learning_rate": 1.74901185770751e-05, + "loss": 0.2709, + "step": 771 + }, + { + "epoch": 2.0532623169107858, + "grad_norm": 0.24302033763825434, + "learning_rate": 1.7440711462450594e-05, + "loss": 0.2759, + "step": 772 + }, + { + "epoch": 2.0559254327563248, + "grad_norm": 0.20872328589643, + "learning_rate": 1.739130434782609e-05, + "loss": 0.2632, + "step": 773 + }, + { + "epoch": 2.0585885486018642, + "grad_norm": 0.26636593407387676, + "learning_rate": 1.7341897233201583e-05, + "loss": 0.2636, + "step": 774 + }, + { + "epoch": 2.0612516644474033, + "grad_norm": 0.28091568129361494, + "learning_rate": 1.7292490118577078e-05, + "loss": 0.2628, + "step": 775 + }, + { + "epoch": 2.0639147802929427, + "grad_norm": 0.2560746499348802, + "learning_rate": 1.7243083003952572e-05, + "loss": 0.2655, + "step": 776 + }, + { + "epoch": 2.066577896138482, + "grad_norm": 0.26276899174108526, + "learning_rate": 1.7193675889328066e-05, + "loss": 0.2728, + "step": 777 + }, + { + "epoch": 2.069241011984021, + "grad_norm": 0.26384946938199305, + "learning_rate": 1.714426877470356e-05, + "loss": 0.2747, + "step": 778 + }, + { + "epoch": 2.0719041278295607, + "grad_norm": 0.23715984391863434, + "learning_rate": 1.7094861660079055e-05, + "loss": 0.2694, + "step": 779 + }, + { + "epoch": 2.0745672436750997, + "grad_norm": 0.2404103191932088, + "learning_rate": 1.7045454545454546e-05, + "loss": 0.2844, + "step": 780 + }, + { + "epoch": 2.077230359520639, + "grad_norm": 0.2295546055568796, + "learning_rate": 1.699604743083004e-05, + "loss": 0.2563, + "step": 781 + }, + { + "epoch": 2.0798934753661786, + "grad_norm": 0.25081138258701596, + "learning_rate": 1.6946640316205535e-05, + "loss": 0.2657, + "step": 782 + }, + { + "epoch": 2.0825565912117177, + "grad_norm": 0.23299102413940379, + "learning_rate": 1.689723320158103e-05, + "loss": 0.2841, + "step": 783 + }, + { + "epoch": 2.085219707057257, + "grad_norm": 0.2352302932330538, + "learning_rate": 1.6847826086956524e-05, + "loss": 0.2696, + "step": 784 + }, + { + "epoch": 2.087882822902796, + "grad_norm": 0.2396805580902733, + "learning_rate": 1.6798418972332018e-05, + "loss": 0.2687, + "step": 785 + }, + { + "epoch": 2.0905459387483356, + "grad_norm": 0.22897484277870242, + "learning_rate": 1.6749011857707512e-05, + "loss": 0.2678, + "step": 786 + }, + { + "epoch": 2.0932090545938746, + "grad_norm": 0.224891214268194, + "learning_rate": 1.6699604743083007e-05, + "loss": 0.2729, + "step": 787 + }, + { + "epoch": 2.095872170439414, + "grad_norm": 0.26860270920114504, + "learning_rate": 1.66501976284585e-05, + "loss": 0.2581, + "step": 788 + }, + { + "epoch": 2.0985352862849536, + "grad_norm": 0.24961552358211944, + "learning_rate": 1.6600790513833996e-05, + "loss": 0.2624, + "step": 789 + }, + { + "epoch": 2.1011984021304926, + "grad_norm": 0.22308364748740767, + "learning_rate": 1.6551383399209487e-05, + "loss": 0.2647, + "step": 790 + }, + { + "epoch": 2.103861517976032, + "grad_norm": 0.2380839364570976, + "learning_rate": 1.650197628458498e-05, + "loss": 0.271, + "step": 791 + }, + { + "epoch": 2.106524633821571, + "grad_norm": 0.24381955578610937, + "learning_rate": 1.6452569169960475e-05, + "loss": 0.2694, + "step": 792 + }, + { + "epoch": 2.1091877496671105, + "grad_norm": 0.23758646142710013, + "learning_rate": 1.640316205533597e-05, + "loss": 0.2775, + "step": 793 + }, + { + "epoch": 2.11185086551265, + "grad_norm": 0.23538198400085814, + "learning_rate": 1.6353754940711464e-05, + "loss": 0.2814, + "step": 794 + }, + { + "epoch": 2.114513981358189, + "grad_norm": 0.21674748879871775, + "learning_rate": 1.630434782608696e-05, + "loss": 0.2548, + "step": 795 + }, + { + "epoch": 2.1171770972037285, + "grad_norm": 0.24105445224605443, + "learning_rate": 1.6254940711462453e-05, + "loss": 0.2641, + "step": 796 + }, + { + "epoch": 2.1198402130492675, + "grad_norm": 0.23753067329213304, + "learning_rate": 1.6205533596837947e-05, + "loss": 0.2709, + "step": 797 + }, + { + "epoch": 2.122503328894807, + "grad_norm": 0.23404194217010732, + "learning_rate": 1.615612648221344e-05, + "loss": 0.271, + "step": 798 + }, + { + "epoch": 2.125166444740346, + "grad_norm": 0.2121069651623829, + "learning_rate": 1.6106719367588936e-05, + "loss": 0.2627, + "step": 799 + }, + { + "epoch": 2.1278295605858855, + "grad_norm": 0.22624703639894228, + "learning_rate": 1.6057312252964427e-05, + "loss": 0.2538, + "step": 800 + }, + { + "epoch": 2.130492676431425, + "grad_norm": 0.2386292992012449, + "learning_rate": 1.600790513833992e-05, + "loss": 0.2576, + "step": 801 + }, + { + "epoch": 2.133155792276964, + "grad_norm": 0.22877737188756703, + "learning_rate": 1.5958498023715416e-05, + "loss": 0.2727, + "step": 802 + }, + { + "epoch": 2.1358189081225034, + "grad_norm": 0.27117813021650006, + "learning_rate": 1.590909090909091e-05, + "loss": 0.2895, + "step": 803 + }, + { + "epoch": 2.1384820239680424, + "grad_norm": 0.22867337217751538, + "learning_rate": 1.5859683794466405e-05, + "loss": 0.2734, + "step": 804 + }, + { + "epoch": 2.141145139813582, + "grad_norm": 0.24512337588151054, + "learning_rate": 1.58102766798419e-05, + "loss": 0.273, + "step": 805 + }, + { + "epoch": 2.1438082556591214, + "grad_norm": 0.2727608695581687, + "learning_rate": 1.5760869565217393e-05, + "loss": 0.2901, + "step": 806 + }, + { + "epoch": 2.1464713715046604, + "grad_norm": 0.2387866974014394, + "learning_rate": 1.5711462450592888e-05, + "loss": 0.2643, + "step": 807 + }, + { + "epoch": 2.1491344873502, + "grad_norm": 0.22440460077720992, + "learning_rate": 1.5662055335968382e-05, + "loss": 0.2653, + "step": 808 + }, + { + "epoch": 2.151797603195739, + "grad_norm": 0.248288295680679, + "learning_rate": 1.5612648221343873e-05, + "loss": 0.2549, + "step": 809 + }, + { + "epoch": 2.1544607190412783, + "grad_norm": 0.24110717758110342, + "learning_rate": 1.5563241106719367e-05, + "loss": 0.2748, + "step": 810 + }, + { + "epoch": 2.157123834886818, + "grad_norm": 0.23171730936199766, + "learning_rate": 1.5513833992094862e-05, + "loss": 0.2709, + "step": 811 + }, + { + "epoch": 2.159786950732357, + "grad_norm": 0.22345452374040276, + "learning_rate": 1.5464426877470356e-05, + "loss": 0.2688, + "step": 812 + }, + { + "epoch": 2.1624500665778963, + "grad_norm": 0.26551342546130663, + "learning_rate": 1.541501976284585e-05, + "loss": 0.2709, + "step": 813 + }, + { + "epoch": 2.1651131824234353, + "grad_norm": 0.2375754285218798, + "learning_rate": 1.5365612648221345e-05, + "loss": 0.259, + "step": 814 + }, + { + "epoch": 2.1677762982689748, + "grad_norm": 0.2115542246448785, + "learning_rate": 1.531620553359684e-05, + "loss": 0.2684, + "step": 815 + }, + { + "epoch": 2.170439414114514, + "grad_norm": 0.2447171773393202, + "learning_rate": 1.5266798418972334e-05, + "loss": 0.2762, + "step": 816 + }, + { + "epoch": 2.1731025299600533, + "grad_norm": 0.22704904523049146, + "learning_rate": 1.5217391304347828e-05, + "loss": 0.2587, + "step": 817 + }, + { + "epoch": 2.1757656458055927, + "grad_norm": 0.2103985476952429, + "learning_rate": 1.5167984189723323e-05, + "loss": 0.2706, + "step": 818 + }, + { + "epoch": 2.1784287616511318, + "grad_norm": 0.25159263014889965, + "learning_rate": 1.5118577075098814e-05, + "loss": 0.2584, + "step": 819 + }, + { + "epoch": 2.181091877496671, + "grad_norm": 0.24458443995501622, + "learning_rate": 1.5069169960474308e-05, + "loss": 0.2704, + "step": 820 + }, + { + "epoch": 2.1837549933422102, + "grad_norm": 0.22057301940141671, + "learning_rate": 1.5019762845849802e-05, + "loss": 0.2719, + "step": 821 + }, + { + "epoch": 2.1864181091877497, + "grad_norm": 0.267519780973077, + "learning_rate": 1.4970355731225297e-05, + "loss": 0.2716, + "step": 822 + }, + { + "epoch": 2.1890812250332887, + "grad_norm": 0.22154250046870252, + "learning_rate": 1.4920948616600791e-05, + "loss": 0.2591, + "step": 823 + }, + { + "epoch": 2.191744340878828, + "grad_norm": 0.21165234414085649, + "learning_rate": 1.4871541501976285e-05, + "loss": 0.2655, + "step": 824 + }, + { + "epoch": 2.1944074567243677, + "grad_norm": 0.24374815251314244, + "learning_rate": 1.482213438735178e-05, + "loss": 0.2655, + "step": 825 + }, + { + "epoch": 2.1970705725699067, + "grad_norm": 0.2455699195489871, + "learning_rate": 1.4772727272727274e-05, + "loss": 0.2665, + "step": 826 + }, + { + "epoch": 2.199733688415446, + "grad_norm": 0.22958103222280501, + "learning_rate": 1.4723320158102769e-05, + "loss": 0.266, + "step": 827 + }, + { + "epoch": 2.202396804260985, + "grad_norm": 0.22203196516766327, + "learning_rate": 1.4673913043478263e-05, + "loss": 0.2646, + "step": 828 + }, + { + "epoch": 2.2050599201065246, + "grad_norm": 0.24608492700980994, + "learning_rate": 1.4624505928853754e-05, + "loss": 0.2794, + "step": 829 + }, + { + "epoch": 2.207723035952064, + "grad_norm": 0.21991565592070453, + "learning_rate": 1.4575098814229248e-05, + "loss": 0.2721, + "step": 830 + }, + { + "epoch": 2.210386151797603, + "grad_norm": 0.21684224263000038, + "learning_rate": 1.4525691699604743e-05, + "loss": 0.2584, + "step": 831 + }, + { + "epoch": 2.2130492676431426, + "grad_norm": 0.25977569519470245, + "learning_rate": 1.4476284584980237e-05, + "loss": 0.2726, + "step": 832 + }, + { + "epoch": 2.2157123834886816, + "grad_norm": 0.2386084151402447, + "learning_rate": 1.4426877470355732e-05, + "loss": 0.2852, + "step": 833 + }, + { + "epoch": 2.218375499334221, + "grad_norm": 0.21986693449971093, + "learning_rate": 1.4377470355731226e-05, + "loss": 0.2626, + "step": 834 + }, + { + "epoch": 2.2210386151797605, + "grad_norm": 0.21749065277576188, + "learning_rate": 1.432806324110672e-05, + "loss": 0.2602, + "step": 835 + }, + { + "epoch": 2.2237017310252996, + "grad_norm": 0.23989512729814974, + "learning_rate": 1.4278656126482215e-05, + "loss": 0.2692, + "step": 836 + }, + { + "epoch": 2.226364846870839, + "grad_norm": 0.23832582321216103, + "learning_rate": 1.4229249011857709e-05, + "loss": 0.2635, + "step": 837 + }, + { + "epoch": 2.229027962716378, + "grad_norm": 0.2426811597238821, + "learning_rate": 1.4179841897233202e-05, + "loss": 0.2668, + "step": 838 + }, + { + "epoch": 2.2316910785619175, + "grad_norm": 0.22741820303496693, + "learning_rate": 1.4130434782608694e-05, + "loss": 0.2687, + "step": 839 + }, + { + "epoch": 2.2343541944074565, + "grad_norm": 0.2193731262262756, + "learning_rate": 1.4081027667984189e-05, + "loss": 0.2707, + "step": 840 + }, + { + "epoch": 2.237017310252996, + "grad_norm": 0.22566921822696567, + "learning_rate": 1.4031620553359683e-05, + "loss": 0.2676, + "step": 841 + }, + { + "epoch": 2.2396804260985355, + "grad_norm": 0.22383415671065598, + "learning_rate": 1.3982213438735178e-05, + "loss": 0.2652, + "step": 842 + }, + { + "epoch": 2.2423435419440745, + "grad_norm": 0.20320657711674117, + "learning_rate": 1.3932806324110672e-05, + "loss": 0.2595, + "step": 843 + }, + { + "epoch": 2.245006657789614, + "grad_norm": 0.2333067790520279, + "learning_rate": 1.3883399209486166e-05, + "loss": 0.2584, + "step": 844 + }, + { + "epoch": 2.247669773635153, + "grad_norm": 0.2198492093260434, + "learning_rate": 1.383399209486166e-05, + "loss": 0.2787, + "step": 845 + }, + { + "epoch": 2.2503328894806924, + "grad_norm": 0.20578959481390344, + "learning_rate": 1.3784584980237155e-05, + "loss": 0.2717, + "step": 846 + }, + { + "epoch": 2.2529960053262315, + "grad_norm": 0.23821537591362393, + "learning_rate": 1.373517786561265e-05, + "loss": 0.2699, + "step": 847 + }, + { + "epoch": 2.255659121171771, + "grad_norm": 0.22087113735109618, + "learning_rate": 1.3685770750988142e-05, + "loss": 0.2643, + "step": 848 + }, + { + "epoch": 2.2583222370173104, + "grad_norm": 0.21122229854050678, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.2724, + "step": 849 + }, + { + "epoch": 2.2609853528628494, + "grad_norm": 0.21706856754708864, + "learning_rate": 1.3586956521739131e-05, + "loss": 0.2726, + "step": 850 + }, + { + "epoch": 2.263648468708389, + "grad_norm": 0.21623723691120003, + "learning_rate": 1.3537549407114625e-05, + "loss": 0.2551, + "step": 851 + }, + { + "epoch": 2.266311584553928, + "grad_norm": 0.2271100658389757, + "learning_rate": 1.348814229249012e-05, + "loss": 0.2586, + "step": 852 + }, + { + "epoch": 2.2689747003994674, + "grad_norm": 0.2209764109681619, + "learning_rate": 1.3438735177865614e-05, + "loss": 0.2716, + "step": 853 + }, + { + "epoch": 2.271637816245007, + "grad_norm": 0.2178701412614265, + "learning_rate": 1.3389328063241108e-05, + "loss": 0.2891, + "step": 854 + }, + { + "epoch": 2.274300932090546, + "grad_norm": 0.2661642988662999, + "learning_rate": 1.3339920948616603e-05, + "loss": 0.2564, + "step": 855 + }, + { + "epoch": 2.2769640479360853, + "grad_norm": 0.21388446109096484, + "learning_rate": 1.3290513833992097e-05, + "loss": 0.2529, + "step": 856 + }, + { + "epoch": 2.2796271637816243, + "grad_norm": 0.2216576992935052, + "learning_rate": 1.3241106719367592e-05, + "loss": 0.2636, + "step": 857 + }, + { + "epoch": 2.282290279627164, + "grad_norm": 0.23210662511306396, + "learning_rate": 1.3191699604743083e-05, + "loss": 0.2589, + "step": 858 + }, + { + "epoch": 2.2849533954727033, + "grad_norm": 0.2392108261983096, + "learning_rate": 1.3142292490118577e-05, + "loss": 0.265, + "step": 859 + }, + { + "epoch": 2.2876165113182423, + "grad_norm": 0.21786440972478727, + "learning_rate": 1.3092885375494071e-05, + "loss": 0.2793, + "step": 860 + }, + { + "epoch": 2.2902796271637818, + "grad_norm": 0.260403587668551, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.2777, + "step": 861 + }, + { + "epoch": 2.2929427430093208, + "grad_norm": 0.2430960989806936, + "learning_rate": 1.299407114624506e-05, + "loss": 0.2572, + "step": 862 + }, + { + "epoch": 2.2956058588548602, + "grad_norm": 0.21752051573777517, + "learning_rate": 1.2944664031620555e-05, + "loss": 0.2803, + "step": 863 + }, + { + "epoch": 2.2982689747003997, + "grad_norm": 0.2573344766515025, + "learning_rate": 1.2895256916996049e-05, + "loss": 0.2803, + "step": 864 + }, + { + "epoch": 2.3009320905459387, + "grad_norm": 0.24369267722963625, + "learning_rate": 1.2845849802371543e-05, + "loss": 0.2559, + "step": 865 + }, + { + "epoch": 2.303595206391478, + "grad_norm": 0.2676475243278646, + "learning_rate": 1.2796442687747038e-05, + "loss": 0.2634, + "step": 866 + }, + { + "epoch": 2.306258322237017, + "grad_norm": 0.21674298638149098, + "learning_rate": 1.2747035573122532e-05, + "loss": 0.2673, + "step": 867 + }, + { + "epoch": 2.3089214380825567, + "grad_norm": 0.23541545396380092, + "learning_rate": 1.2697628458498023e-05, + "loss": 0.2673, + "step": 868 + }, + { + "epoch": 2.3115845539280957, + "grad_norm": 0.22699711620607352, + "learning_rate": 1.2648221343873517e-05, + "loss": 0.2674, + "step": 869 + }, + { + "epoch": 2.314247669773635, + "grad_norm": 0.22613468537499234, + "learning_rate": 1.2598814229249012e-05, + "loss": 0.2655, + "step": 870 + }, + { + "epoch": 2.316910785619174, + "grad_norm": 0.2253665054481723, + "learning_rate": 1.2549407114624506e-05, + "loss": 0.2542, + "step": 871 + }, + { + "epoch": 2.3195739014647137, + "grad_norm": 0.2389905563347208, + "learning_rate": 1.25e-05, + "loss": 0.2642, + "step": 872 + }, + { + "epoch": 2.322237017310253, + "grad_norm": 0.1972800090188119, + "learning_rate": 1.2450592885375495e-05, + "loss": 0.2738, + "step": 873 + }, + { + "epoch": 2.324900133155792, + "grad_norm": 0.22018172948520282, + "learning_rate": 1.240118577075099e-05, + "loss": 0.2736, + "step": 874 + }, + { + "epoch": 2.3275632490013316, + "grad_norm": 0.22660897800754057, + "learning_rate": 1.2351778656126482e-05, + "loss": 0.2797, + "step": 875 + }, + { + "epoch": 2.3302263648468706, + "grad_norm": 0.22691181432819396, + "learning_rate": 1.2302371541501976e-05, + "loss": 0.2562, + "step": 876 + }, + { + "epoch": 2.33288948069241, + "grad_norm": 0.21367535241766863, + "learning_rate": 1.225296442687747e-05, + "loss": 0.2687, + "step": 877 + }, + { + "epoch": 2.3355525965379496, + "grad_norm": 0.23289737129114052, + "learning_rate": 1.2203557312252965e-05, + "loss": 0.2595, + "step": 878 + }, + { + "epoch": 2.3382157123834886, + "grad_norm": 0.21941025876118542, + "learning_rate": 1.215415019762846e-05, + "loss": 0.2785, + "step": 879 + }, + { + "epoch": 2.340878828229028, + "grad_norm": 0.23113074495001715, + "learning_rate": 1.2104743083003952e-05, + "loss": 0.283, + "step": 880 + }, + { + "epoch": 2.343541944074567, + "grad_norm": 0.21978182787011594, + "learning_rate": 1.2055335968379447e-05, + "loss": 0.2602, + "step": 881 + }, + { + "epoch": 2.3462050599201065, + "grad_norm": 0.22558732477437654, + "learning_rate": 1.2005928853754941e-05, + "loss": 0.2744, + "step": 882 + }, + { + "epoch": 2.348868175765646, + "grad_norm": 0.21761347406156886, + "learning_rate": 1.1956521739130435e-05, + "loss": 0.2702, + "step": 883 + }, + { + "epoch": 2.351531291611185, + "grad_norm": 0.5461188257601155, + "learning_rate": 1.190711462450593e-05, + "loss": 0.2894, + "step": 884 + }, + { + "epoch": 2.3541944074567245, + "grad_norm": 0.21406318400975563, + "learning_rate": 1.1857707509881423e-05, + "loss": 0.2661, + "step": 885 + }, + { + "epoch": 2.3568575233022635, + "grad_norm": 0.1984149911802996, + "learning_rate": 1.1808300395256917e-05, + "loss": 0.266, + "step": 886 + }, + { + "epoch": 2.359520639147803, + "grad_norm": 0.21968916065746072, + "learning_rate": 1.1758893280632411e-05, + "loss": 0.2635, + "step": 887 + }, + { + "epoch": 2.3621837549933424, + "grad_norm": 0.22188429396465353, + "learning_rate": 1.1709486166007906e-05, + "loss": 0.2729, + "step": 888 + }, + { + "epoch": 2.3648468708388815, + "grad_norm": 0.21019336767245783, + "learning_rate": 1.16600790513834e-05, + "loss": 0.2773, + "step": 889 + }, + { + "epoch": 2.367509986684421, + "grad_norm": 0.22711608967366953, + "learning_rate": 1.1610671936758893e-05, + "loss": 0.2714, + "step": 890 + }, + { + "epoch": 2.37017310252996, + "grad_norm": 0.2226773168313416, + "learning_rate": 1.1561264822134387e-05, + "loss": 0.264, + "step": 891 + }, + { + "epoch": 2.3728362183754994, + "grad_norm": 0.21211073663718902, + "learning_rate": 1.1511857707509881e-05, + "loss": 0.2623, + "step": 892 + }, + { + "epoch": 2.3754993342210384, + "grad_norm": 0.22155796804883984, + "learning_rate": 1.1462450592885376e-05, + "loss": 0.2786, + "step": 893 + }, + { + "epoch": 2.378162450066578, + "grad_norm": 0.21152104541352987, + "learning_rate": 1.141304347826087e-05, + "loss": 0.2754, + "step": 894 + }, + { + "epoch": 2.3808255659121174, + "grad_norm": 0.2436663825711812, + "learning_rate": 1.1363636363636365e-05, + "loss": 0.2646, + "step": 895 + }, + { + "epoch": 2.3834886817576564, + "grad_norm": 0.253729858596224, + "learning_rate": 1.1314229249011857e-05, + "loss": 0.2815, + "step": 896 + }, + { + "epoch": 2.386151797603196, + "grad_norm": 0.20642310572208497, + "learning_rate": 1.1264822134387352e-05, + "loss": 0.272, + "step": 897 + }, + { + "epoch": 2.388814913448735, + "grad_norm": 0.22828401957220001, + "learning_rate": 1.1215415019762846e-05, + "loss": 0.2679, + "step": 898 + }, + { + "epoch": 2.3914780292942743, + "grad_norm": 0.2226863403827293, + "learning_rate": 1.116600790513834e-05, + "loss": 0.2783, + "step": 899 + }, + { + "epoch": 2.3941411451398134, + "grad_norm": 0.2380848377629423, + "learning_rate": 1.1116600790513835e-05, + "loss": 0.2688, + "step": 900 + }, + { + "epoch": 2.396804260985353, + "grad_norm": 0.23278674245520006, + "learning_rate": 1.106719367588933e-05, + "loss": 0.271, + "step": 901 + }, + { + "epoch": 2.3994673768308923, + "grad_norm": 0.20419629462602493, + "learning_rate": 1.1017786561264824e-05, + "loss": 0.265, + "step": 902 + }, + { + "epoch": 2.4021304926764313, + "grad_norm": 0.2390569872958442, + "learning_rate": 1.0968379446640318e-05, + "loss": 0.2638, + "step": 903 + }, + { + "epoch": 2.4047936085219708, + "grad_norm": 0.2279702813171203, + "learning_rate": 1.091897233201581e-05, + "loss": 0.2428, + "step": 904 + }, + { + "epoch": 2.40745672436751, + "grad_norm": 0.21319204168497982, + "learning_rate": 1.0869565217391305e-05, + "loss": 0.2738, + "step": 905 + }, + { + "epoch": 2.4101198402130493, + "grad_norm": 0.21016965126306628, + "learning_rate": 1.08201581027668e-05, + "loss": 0.2591, + "step": 906 + }, + { + "epoch": 2.4127829560585887, + "grad_norm": 0.23241828917431315, + "learning_rate": 1.0770750988142294e-05, + "loss": 0.2691, + "step": 907 + }, + { + "epoch": 2.4154460719041277, + "grad_norm": 0.2508034557509808, + "learning_rate": 1.0721343873517788e-05, + "loss": 0.2716, + "step": 908 + }, + { + "epoch": 2.418109187749667, + "grad_norm": 0.24987214036836988, + "learning_rate": 1.0671936758893281e-05, + "loss": 0.2578, + "step": 909 + }, + { + "epoch": 2.4207723035952062, + "grad_norm": 0.2380445170755529, + "learning_rate": 1.0622529644268775e-05, + "loss": 0.264, + "step": 910 + }, + { + "epoch": 2.4234354194407457, + "grad_norm": 0.2201379804572699, + "learning_rate": 1.057312252964427e-05, + "loss": 0.274, + "step": 911 + }, + { + "epoch": 2.426098535286285, + "grad_norm": 0.250942745509917, + "learning_rate": 1.0523715415019764e-05, + "loss": 0.2648, + "step": 912 + }, + { + "epoch": 2.428761651131824, + "grad_norm": 0.22995097984900165, + "learning_rate": 1.0474308300395258e-05, + "loss": 0.2647, + "step": 913 + }, + { + "epoch": 2.4314247669773636, + "grad_norm": 0.23698141688133578, + "learning_rate": 1.0424901185770751e-05, + "loss": 0.2737, + "step": 914 + }, + { + "epoch": 2.4340878828229027, + "grad_norm": 0.21803776160842997, + "learning_rate": 1.0375494071146246e-05, + "loss": 0.272, + "step": 915 + }, + { + "epoch": 2.436750998668442, + "grad_norm": 0.24131490172282968, + "learning_rate": 1.032608695652174e-05, + "loss": 0.2695, + "step": 916 + }, + { + "epoch": 2.4394141145139816, + "grad_norm": 0.21919070590537304, + "learning_rate": 1.0276679841897234e-05, + "loss": 0.2642, + "step": 917 + }, + { + "epoch": 2.4420772303595206, + "grad_norm": 0.22130430229063322, + "learning_rate": 1.0227272727272729e-05, + "loss": 0.2648, + "step": 918 + }, + { + "epoch": 2.44474034620506, + "grad_norm": 0.2207950484316367, + "learning_rate": 1.0177865612648221e-05, + "loss": 0.2661, + "step": 919 + }, + { + "epoch": 2.447403462050599, + "grad_norm": 0.21836484864507769, + "learning_rate": 1.0128458498023716e-05, + "loss": 0.2674, + "step": 920 + }, + { + "epoch": 2.4500665778961386, + "grad_norm": 0.20744131254325618, + "learning_rate": 1.007905138339921e-05, + "loss": 0.2541, + "step": 921 + }, + { + "epoch": 2.4527296937416776, + "grad_norm": 0.21453675745306103, + "learning_rate": 1.0029644268774705e-05, + "loss": 0.2739, + "step": 922 + }, + { + "epoch": 2.455392809587217, + "grad_norm": 0.20834291358903456, + "learning_rate": 9.980237154150199e-06, + "loss": 0.2683, + "step": 923 + }, + { + "epoch": 2.458055925432756, + "grad_norm": 0.2218801415090961, + "learning_rate": 9.930830039525692e-06, + "loss": 0.2725, + "step": 924 + }, + { + "epoch": 2.4607190412782955, + "grad_norm": 0.22892525986093554, + "learning_rate": 9.881422924901186e-06, + "loss": 0.2736, + "step": 925 + }, + { + "epoch": 2.463382157123835, + "grad_norm": 0.21019735025511882, + "learning_rate": 9.83201581027668e-06, + "loss": 0.2667, + "step": 926 + }, + { + "epoch": 2.466045272969374, + "grad_norm": 0.22029826331712365, + "learning_rate": 9.782608695652175e-06, + "loss": 0.2685, + "step": 927 + }, + { + "epoch": 2.4687083888149135, + "grad_norm": 0.2048436758988922, + "learning_rate": 9.733201581027667e-06, + "loss": 0.2675, + "step": 928 + }, + { + "epoch": 2.4713715046604525, + "grad_norm": 0.22910504440789492, + "learning_rate": 9.683794466403162e-06, + "loss": 0.2769, + "step": 929 + }, + { + "epoch": 2.474034620505992, + "grad_norm": 0.22852762946943356, + "learning_rate": 9.634387351778656e-06, + "loss": 0.2834, + "step": 930 + }, + { + "epoch": 2.4766977363515315, + "grad_norm": 0.21897574663546826, + "learning_rate": 9.58498023715415e-06, + "loss": 0.2778, + "step": 931 + }, + { + "epoch": 2.4793608521970705, + "grad_norm": 0.2050794319936511, + "learning_rate": 9.535573122529645e-06, + "loss": 0.2715, + "step": 932 + }, + { + "epoch": 2.48202396804261, + "grad_norm": 0.21728652059101256, + "learning_rate": 9.486166007905138e-06, + "loss": 0.259, + "step": 933 + }, + { + "epoch": 2.484687083888149, + "grad_norm": 0.22845416533089977, + "learning_rate": 9.436758893280632e-06, + "loss": 0.2761, + "step": 934 + }, + { + "epoch": 2.4873501997336884, + "grad_norm": 0.21231590297088435, + "learning_rate": 9.387351778656126e-06, + "loss": 0.2677, + "step": 935 + }, + { + "epoch": 2.490013315579228, + "grad_norm": 0.1926400508160791, + "learning_rate": 9.33794466403162e-06, + "loss": 0.2575, + "step": 936 + }, + { + "epoch": 2.492676431424767, + "grad_norm": 0.22996010092008873, + "learning_rate": 9.288537549407115e-06, + "loss": 0.2548, + "step": 937 + }, + { + "epoch": 2.4953395472703064, + "grad_norm": 0.22619760087939098, + "learning_rate": 9.239130434782608e-06, + "loss": 0.2676, + "step": 938 + }, + { + "epoch": 2.4980026631158454, + "grad_norm": 0.20946128187824178, + "learning_rate": 9.189723320158102e-06, + "loss": 0.2649, + "step": 939 + }, + { + "epoch": 2.500665778961385, + "grad_norm": 0.21291901939824368, + "learning_rate": 9.140316205533597e-06, + "loss": 0.2794, + "step": 940 + }, + { + "epoch": 2.5033288948069243, + "grad_norm": 0.23983156472432737, + "learning_rate": 9.090909090909091e-06, + "loss": 0.2612, + "step": 941 + }, + { + "epoch": 2.5059920106524634, + "grad_norm": 0.21371558486466197, + "learning_rate": 9.041501976284585e-06, + "loss": 0.2715, + "step": 942 + }, + { + "epoch": 2.508655126498003, + "grad_norm": 0.20948609220977954, + "learning_rate": 8.992094861660078e-06, + "loss": 0.2685, + "step": 943 + }, + { + "epoch": 2.511318242343542, + "grad_norm": 0.20326902436416877, + "learning_rate": 8.942687747035572e-06, + "loss": 0.2646, + "step": 944 + }, + { + "epoch": 2.5139813581890813, + "grad_norm": 0.20716732265525145, + "learning_rate": 8.893280632411067e-06, + "loss": 0.2624, + "step": 945 + }, + { + "epoch": 2.5166444740346208, + "grad_norm": 0.21310454845084212, + "learning_rate": 8.843873517786561e-06, + "loss": 0.2666, + "step": 946 + }, + { + "epoch": 2.51930758988016, + "grad_norm": 0.2356341947109539, + "learning_rate": 8.794466403162056e-06, + "loss": 0.2607, + "step": 947 + }, + { + "epoch": 2.521970705725699, + "grad_norm": 0.206705458805249, + "learning_rate": 8.74505928853755e-06, + "loss": 0.2765, + "step": 948 + }, + { + "epoch": 2.5246338215712383, + "grad_norm": 0.19941570008688478, + "learning_rate": 8.695652173913044e-06, + "loss": 0.2774, + "step": 949 + }, + { + "epoch": 2.5272969374167777, + "grad_norm": 0.22426207188439748, + "learning_rate": 8.646245059288539e-06, + "loss": 0.2829, + "step": 950 + }, + { + "epoch": 2.5299600532623168, + "grad_norm": 0.25258528314600287, + "learning_rate": 8.596837944664033e-06, + "loss": 0.2646, + "step": 951 + }, + { + "epoch": 2.5326231691078562, + "grad_norm": 0.2145489784213885, + "learning_rate": 8.547430830039528e-06, + "loss": 0.2607, + "step": 952 + }, + { + "epoch": 2.5352862849533953, + "grad_norm": 0.19599385905462602, + "learning_rate": 8.49802371541502e-06, + "loss": 0.2543, + "step": 953 + }, + { + "epoch": 2.5379494007989347, + "grad_norm": 0.2480014218006241, + "learning_rate": 8.448616600790515e-06, + "loss": 0.2689, + "step": 954 + }, + { + "epoch": 2.540612516644474, + "grad_norm": 0.24788509439736134, + "learning_rate": 8.399209486166009e-06, + "loss": 0.2725, + "step": 955 + }, + { + "epoch": 2.543275632490013, + "grad_norm": 0.2267111546180155, + "learning_rate": 8.349802371541503e-06, + "loss": 0.2635, + "step": 956 + }, + { + "epoch": 2.5459387483355527, + "grad_norm": 0.21182851928367047, + "learning_rate": 8.300395256916998e-06, + "loss": 0.2638, + "step": 957 + }, + { + "epoch": 2.5486018641810917, + "grad_norm": 0.21455676194315262, + "learning_rate": 8.25098814229249e-06, + "loss": 0.2585, + "step": 958 + }, + { + "epoch": 2.551264980026631, + "grad_norm": 0.2169073571862216, + "learning_rate": 8.201581027667985e-06, + "loss": 0.2617, + "step": 959 + }, + { + "epoch": 2.5539280958721706, + "grad_norm": 0.22625888751011447, + "learning_rate": 8.15217391304348e-06, + "loss": 0.271, + "step": 960 + }, + { + "epoch": 2.5565912117177096, + "grad_norm": 0.20470193896466704, + "learning_rate": 8.102766798418974e-06, + "loss": 0.2662, + "step": 961 + }, + { + "epoch": 2.559254327563249, + "grad_norm": 0.21322007235950363, + "learning_rate": 8.053359683794468e-06, + "loss": 0.2556, + "step": 962 + }, + { + "epoch": 2.561917443408788, + "grad_norm": 0.20150617925679104, + "learning_rate": 8.00395256916996e-06, + "loss": 0.2582, + "step": 963 + }, + { + "epoch": 2.5645805592543276, + "grad_norm": 0.2286944491087834, + "learning_rate": 7.954545454545455e-06, + "loss": 0.2661, + "step": 964 + }, + { + "epoch": 2.567243675099867, + "grad_norm": 0.20708520844073464, + "learning_rate": 7.90513833992095e-06, + "loss": 0.2625, + "step": 965 + }, + { + "epoch": 2.569906790945406, + "grad_norm": 0.1993453778786671, + "learning_rate": 7.855731225296444e-06, + "loss": 0.2684, + "step": 966 + }, + { + "epoch": 2.5725699067909455, + "grad_norm": 0.19939625758599083, + "learning_rate": 7.806324110671937e-06, + "loss": 0.2658, + "step": 967 + }, + { + "epoch": 2.5752330226364846, + "grad_norm": 0.20007029899978518, + "learning_rate": 7.756916996047431e-06, + "loss": 0.2612, + "step": 968 + }, + { + "epoch": 2.577896138482024, + "grad_norm": 0.20768490453881108, + "learning_rate": 7.707509881422925e-06, + "loss": 0.2671, + "step": 969 + }, + { + "epoch": 2.5805592543275635, + "grad_norm": 0.21354810130953325, + "learning_rate": 7.65810276679842e-06, + "loss": 0.2578, + "step": 970 + }, + { + "epoch": 2.5832223701731025, + "grad_norm": 0.23174711166338519, + "learning_rate": 7.608695652173914e-06, + "loss": 0.2715, + "step": 971 + }, + { + "epoch": 2.5858854860186415, + "grad_norm": 0.21079000224350897, + "learning_rate": 7.559288537549407e-06, + "loss": 0.2658, + "step": 972 + }, + { + "epoch": 2.588548601864181, + "grad_norm": 0.2001035421079937, + "learning_rate": 7.509881422924901e-06, + "loss": 0.2569, + "step": 973 + }, + { + "epoch": 2.5912117177097205, + "grad_norm": 0.2021065412071498, + "learning_rate": 7.4604743083003955e-06, + "loss": 0.2608, + "step": 974 + }, + { + "epoch": 2.5938748335552595, + "grad_norm": 0.214158147452307, + "learning_rate": 7.41106719367589e-06, + "loss": 0.2779, + "step": 975 + }, + { + "epoch": 2.596537949400799, + "grad_norm": 0.20790431049928293, + "learning_rate": 7.361660079051384e-06, + "loss": 0.2733, + "step": 976 + }, + { + "epoch": 2.599201065246338, + "grad_norm": 0.20549750329181854, + "learning_rate": 7.312252964426877e-06, + "loss": 0.276, + "step": 977 + }, + { + "epoch": 2.6018641810918774, + "grad_norm": 0.20237657523764993, + "learning_rate": 7.262845849802371e-06, + "loss": 0.2735, + "step": 978 + }, + { + "epoch": 2.604527296937417, + "grad_norm": 0.20973877300015645, + "learning_rate": 7.213438735177866e-06, + "loss": 0.281, + "step": 979 + }, + { + "epoch": 2.607190412782956, + "grad_norm": 0.22017905718680691, + "learning_rate": 7.16403162055336e-06, + "loss": 0.2677, + "step": 980 + }, + { + "epoch": 2.6098535286284954, + "grad_norm": 0.2144342458050631, + "learning_rate": 7.1146245059288545e-06, + "loss": 0.2604, + "step": 981 + }, + { + "epoch": 2.6125166444740344, + "grad_norm": 0.2050156532271564, + "learning_rate": 7.065217391304347e-06, + "loss": 0.2701, + "step": 982 + }, + { + "epoch": 2.615179760319574, + "grad_norm": 0.1970203183942734, + "learning_rate": 7.015810276679842e-06, + "loss": 0.2505, + "step": 983 + }, + { + "epoch": 2.6178428761651134, + "grad_norm": 0.20402269570746995, + "learning_rate": 6.966403162055336e-06, + "loss": 0.2599, + "step": 984 + }, + { + "epoch": 2.6205059920106524, + "grad_norm": 0.20759868626386915, + "learning_rate": 6.91699604743083e-06, + "loss": 0.2733, + "step": 985 + }, + { + "epoch": 2.623169107856192, + "grad_norm": 0.22693920517209076, + "learning_rate": 6.867588932806325e-06, + "loss": 0.2627, + "step": 986 + }, + { + "epoch": 2.625832223701731, + "grad_norm": 0.20970122945185465, + "learning_rate": 6.818181818181818e-06, + "loss": 0.2704, + "step": 987 + }, + { + "epoch": 2.6284953395472703, + "grad_norm": 0.20332704992870704, + "learning_rate": 6.768774703557313e-06, + "loss": 0.2762, + "step": 988 + }, + { + "epoch": 2.63115845539281, + "grad_norm": 0.20966961639828544, + "learning_rate": 6.719367588932807e-06, + "loss": 0.2737, + "step": 989 + }, + { + "epoch": 2.633821571238349, + "grad_norm": 0.2392085498215163, + "learning_rate": 6.6699604743083014e-06, + "loss": 0.2639, + "step": 990 + }, + { + "epoch": 2.6364846870838883, + "grad_norm": 0.22069815282030755, + "learning_rate": 6.620553359683796e-06, + "loss": 0.2623, + "step": 991 + }, + { + "epoch": 2.6391478029294273, + "grad_norm": 0.2062130093620195, + "learning_rate": 6.5711462450592885e-06, + "loss": 0.2634, + "step": 992 + }, + { + "epoch": 2.6418109187749668, + "grad_norm": 0.21202212454473487, + "learning_rate": 6.521739130434783e-06, + "loss": 0.2732, + "step": 993 + }, + { + "epoch": 2.6444740346205062, + "grad_norm": 0.20742438691074003, + "learning_rate": 6.472332015810277e-06, + "loss": 0.2775, + "step": 994 + }, + { + "epoch": 2.6471371504660453, + "grad_norm": 0.20539419758832048, + "learning_rate": 6.422924901185772e-06, + "loss": 0.2786, + "step": 995 + }, + { + "epoch": 2.6498002663115847, + "grad_norm": 0.19871961616535505, + "learning_rate": 6.373517786561266e-06, + "loss": 0.2642, + "step": 996 + }, + { + "epoch": 2.6524633821571237, + "grad_norm": 0.2445459224085182, + "learning_rate": 6.324110671936759e-06, + "loss": 0.271, + "step": 997 + }, + { + "epoch": 2.655126498002663, + "grad_norm": 0.20294635449003665, + "learning_rate": 6.274703557312253e-06, + "loss": 0.272, + "step": 998 + }, + { + "epoch": 2.6577896138482027, + "grad_norm": 0.20711520929552674, + "learning_rate": 6.2252964426877475e-06, + "loss": 0.277, + "step": 999 + }, + { + "epoch": 2.6604527296937417, + "grad_norm": 0.19858451035812705, + "learning_rate": 6.175889328063241e-06, + "loss": 0.2781, + "step": 1000 + }, + { + "epoch": 2.6631158455392807, + "grad_norm": 0.2029933078164672, + "learning_rate": 6.126482213438735e-06, + "loss": 0.259, + "step": 1001 + }, + { + "epoch": 2.66577896138482, + "grad_norm": 0.21745287030160018, + "learning_rate": 6.07707509881423e-06, + "loss": 0.27, + "step": 1002 + }, + { + "epoch": 2.6684420772303596, + "grad_norm": 0.19345167090566057, + "learning_rate": 6.027667984189723e-06, + "loss": 0.268, + "step": 1003 + }, + { + "epoch": 2.6711051930758987, + "grad_norm": 0.21568939666641776, + "learning_rate": 5.978260869565218e-06, + "loss": 0.2643, + "step": 1004 + }, + { + "epoch": 2.673768308921438, + "grad_norm": 0.19296044607870885, + "learning_rate": 5.928853754940711e-06, + "loss": 0.2761, + "step": 1005 + }, + { + "epoch": 2.676431424766977, + "grad_norm": 0.20181257150105722, + "learning_rate": 5.879446640316206e-06, + "loss": 0.271, + "step": 1006 + }, + { + "epoch": 2.6790945406125166, + "grad_norm": 0.2073838164023787, + "learning_rate": 5.8300395256917e-06, + "loss": 0.2713, + "step": 1007 + }, + { + "epoch": 2.681757656458056, + "grad_norm": 0.20965825745167907, + "learning_rate": 5.7806324110671936e-06, + "loss": 0.2689, + "step": 1008 + }, + { + "epoch": 2.684420772303595, + "grad_norm": 0.20444583357709556, + "learning_rate": 5.731225296442688e-06, + "loss": 0.2831, + "step": 1009 + }, + { + "epoch": 2.6870838881491346, + "grad_norm": 0.20971896583727812, + "learning_rate": 5.681818181818182e-06, + "loss": 0.2626, + "step": 1010 + }, + { + "epoch": 2.6897470039946736, + "grad_norm": 0.2080555215910288, + "learning_rate": 5.632411067193676e-06, + "loss": 0.2602, + "step": 1011 + }, + { + "epoch": 2.692410119840213, + "grad_norm": 0.2013420667078693, + "learning_rate": 5.58300395256917e-06, + "loss": 0.2653, + "step": 1012 + }, + { + "epoch": 2.6950732356857525, + "grad_norm": 0.19614771328643982, + "learning_rate": 5.533596837944665e-06, + "loss": 0.2556, + "step": 1013 + }, + { + "epoch": 2.6977363515312915, + "grad_norm": 0.20085761642467498, + "learning_rate": 5.484189723320159e-06, + "loss": 0.2744, + "step": 1014 + }, + { + "epoch": 2.700399467376831, + "grad_norm": 0.21544774180757933, + "learning_rate": 5.4347826086956525e-06, + "loss": 0.2602, + "step": 1015 + }, + { + "epoch": 2.70306258322237, + "grad_norm": 0.19696825099825307, + "learning_rate": 5.385375494071147e-06, + "loss": 0.2595, + "step": 1016 + }, + { + "epoch": 2.7057256990679095, + "grad_norm": 0.1924176776922604, + "learning_rate": 5.3359683794466405e-06, + "loss": 0.2619, + "step": 1017 + }, + { + "epoch": 2.708388814913449, + "grad_norm": 0.22132480166121332, + "learning_rate": 5.286561264822135e-06, + "loss": 0.2697, + "step": 1018 + }, + { + "epoch": 2.711051930758988, + "grad_norm": 0.18691262036412767, + "learning_rate": 5.237154150197629e-06, + "loss": 0.2554, + "step": 1019 + }, + { + "epoch": 2.7137150466045274, + "grad_norm": 0.1938229034237995, + "learning_rate": 5.187747035573123e-06, + "loss": 0.2586, + "step": 1020 + }, + { + "epoch": 2.7163781624500665, + "grad_norm": 0.2129748283287826, + "learning_rate": 5.138339920948617e-06, + "loss": 0.2795, + "step": 1021 + }, + { + "epoch": 2.719041278295606, + "grad_norm": 0.20445583537089335, + "learning_rate": 5.088932806324111e-06, + "loss": 0.2658, + "step": 1022 + }, + { + "epoch": 2.7217043941411454, + "grad_norm": 0.1933528504807178, + "learning_rate": 5.039525691699605e-06, + "loss": 0.2621, + "step": 1023 + }, + { + "epoch": 2.7243675099866844, + "grad_norm": 0.21949852883334098, + "learning_rate": 4.9901185770750995e-06, + "loss": 0.2649, + "step": 1024 + }, + { + "epoch": 2.7270306258322234, + "grad_norm": 0.20152020359649447, + "learning_rate": 4.940711462450593e-06, + "loss": 0.265, + "step": 1025 + }, + { + "epoch": 2.729693741677763, + "grad_norm": 0.20583564086259545, + "learning_rate": 4.891304347826087e-06, + "loss": 0.2619, + "step": 1026 + }, + { + "epoch": 2.7323568575233024, + "grad_norm": 0.2007179587300372, + "learning_rate": 4.841897233201581e-06, + "loss": 0.2693, + "step": 1027 + }, + { + "epoch": 2.7350199733688414, + "grad_norm": 0.1998685679119499, + "learning_rate": 4.792490118577075e-06, + "loss": 0.2629, + "step": 1028 + }, + { + "epoch": 2.737683089214381, + "grad_norm": 0.21626697273734094, + "learning_rate": 4.743083003952569e-06, + "loss": 0.269, + "step": 1029 + }, + { + "epoch": 2.74034620505992, + "grad_norm": 0.19448387232242922, + "learning_rate": 4.693675889328063e-06, + "loss": 0.2761, + "step": 1030 + }, + { + "epoch": 2.7430093209054593, + "grad_norm": 0.19395208512967949, + "learning_rate": 4.644268774703558e-06, + "loss": 0.2653, + "step": 1031 + }, + { + "epoch": 2.745672436750999, + "grad_norm": 0.18925291663752578, + "learning_rate": 4.594861660079051e-06, + "loss": 0.2568, + "step": 1032 + }, + { + "epoch": 2.748335552596538, + "grad_norm": 0.20842012726728598, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.2689, + "step": 1033 + }, + { + "epoch": 2.7509986684420773, + "grad_norm": 0.20399895934870427, + "learning_rate": 4.496047430830039e-06, + "loss": 0.262, + "step": 1034 + }, + { + "epoch": 2.7536617842876163, + "grad_norm": 0.21638718896911208, + "learning_rate": 4.4466403162055334e-06, + "loss": 0.2589, + "step": 1035 + }, + { + "epoch": 2.756324900133156, + "grad_norm": 0.19757801710020018, + "learning_rate": 4.397233201581028e-06, + "loss": 0.2575, + "step": 1036 + }, + { + "epoch": 2.7589880159786953, + "grad_norm": 0.1930523815662032, + "learning_rate": 4.347826086956522e-06, + "loss": 0.2589, + "step": 1037 + }, + { + "epoch": 2.7616511318242343, + "grad_norm": 0.20093506678059855, + "learning_rate": 4.298418972332017e-06, + "loss": 0.2686, + "step": 1038 + }, + { + "epoch": 2.7643142476697737, + "grad_norm": 0.20051627815913756, + "learning_rate": 4.24901185770751e-06, + "loss": 0.2709, + "step": 1039 + }, + { + "epoch": 2.7669773635153128, + "grad_norm": 0.196594765327016, + "learning_rate": 4.1996047430830045e-06, + "loss": 0.2617, + "step": 1040 + }, + { + "epoch": 2.7696404793608522, + "grad_norm": 0.19314366189878793, + "learning_rate": 4.150197628458499e-06, + "loss": 0.2851, + "step": 1041 + }, + { + "epoch": 2.7723035952063917, + "grad_norm": 0.2161802526854043, + "learning_rate": 4.1007905138339924e-06, + "loss": 0.2674, + "step": 1042 + }, + { + "epoch": 2.7749667110519307, + "grad_norm": 0.18272700852758644, + "learning_rate": 4.051383399209487e-06, + "loss": 0.2523, + "step": 1043 + }, + { + "epoch": 2.77762982689747, + "grad_norm": 0.1914267001454524, + "learning_rate": 4.00197628458498e-06, + "loss": 0.271, + "step": 1044 + }, + { + "epoch": 2.780292942743009, + "grad_norm": 0.20563053341844564, + "learning_rate": 3.952569169960475e-06, + "loss": 0.2588, + "step": 1045 + }, + { + "epoch": 2.7829560585885487, + "grad_norm": 0.19474283827667518, + "learning_rate": 3.903162055335968e-06, + "loss": 0.259, + "step": 1046 + }, + { + "epoch": 2.785619174434088, + "grad_norm": 0.199541546086498, + "learning_rate": 3.853754940711463e-06, + "loss": 0.2766, + "step": 1047 + }, + { + "epoch": 2.788282290279627, + "grad_norm": 0.1962650749461456, + "learning_rate": 3.804347826086957e-06, + "loss": 0.275, + "step": 1048 + }, + { + "epoch": 2.790945406125166, + "grad_norm": 0.19771877806493995, + "learning_rate": 3.7549407114624506e-06, + "loss": 0.2651, + "step": 1049 + }, + { + "epoch": 2.7936085219707056, + "grad_norm": 0.25769379294942607, + "learning_rate": 3.705533596837945e-06, + "loss": 0.2792, + "step": 1050 + }, + { + "epoch": 2.796271637816245, + "grad_norm": 0.2095398170946154, + "learning_rate": 3.6561264822134385e-06, + "loss": 0.2671, + "step": 1051 + }, + { + "epoch": 2.798934753661784, + "grad_norm": 0.1929871299001819, + "learning_rate": 3.606719367588933e-06, + "loss": 0.2571, + "step": 1052 + }, + { + "epoch": 2.8015978695073236, + "grad_norm": 0.19854196709504868, + "learning_rate": 3.5573122529644273e-06, + "loss": 0.2734, + "step": 1053 + }, + { + "epoch": 2.8042609853528626, + "grad_norm": 0.20342959087962045, + "learning_rate": 3.507905138339921e-06, + "loss": 0.2675, + "step": 1054 + }, + { + "epoch": 2.806924101198402, + "grad_norm": 0.19566813473730155, + "learning_rate": 3.458498023715415e-06, + "loss": 0.2636, + "step": 1055 + }, + { + "epoch": 2.8095872170439415, + "grad_norm": 0.19394868609732532, + "learning_rate": 3.409090909090909e-06, + "loss": 0.2582, + "step": 1056 + }, + { + "epoch": 2.8122503328894806, + "grad_norm": 0.19315741666740258, + "learning_rate": 3.3596837944664035e-06, + "loss": 0.2744, + "step": 1057 + }, + { + "epoch": 2.81491344873502, + "grad_norm": 0.19500591092508857, + "learning_rate": 3.310276679841898e-06, + "loss": 0.2664, + "step": 1058 + }, + { + "epoch": 2.817576564580559, + "grad_norm": 0.20369824754516933, + "learning_rate": 3.2608695652173914e-06, + "loss": 0.2753, + "step": 1059 + }, + { + "epoch": 2.8202396804260985, + "grad_norm": 0.21679482311751339, + "learning_rate": 3.211462450592886e-06, + "loss": 0.2668, + "step": 1060 + }, + { + "epoch": 2.822902796271638, + "grad_norm": 0.19207166020188257, + "learning_rate": 3.1620553359683794e-06, + "loss": 0.2714, + "step": 1061 + }, + { + "epoch": 2.825565912117177, + "grad_norm": 0.18576307265975345, + "learning_rate": 3.1126482213438737e-06, + "loss": 0.2641, + "step": 1062 + }, + { + "epoch": 2.8282290279627165, + "grad_norm": 0.19216814433561258, + "learning_rate": 3.0632411067193677e-06, + "loss": 0.2686, + "step": 1063 + }, + { + "epoch": 2.8308921438082555, + "grad_norm": 0.20223820044568933, + "learning_rate": 3.0138339920948617e-06, + "loss": 0.2681, + "step": 1064 + }, + { + "epoch": 2.833555259653795, + "grad_norm": 0.2025068882484355, + "learning_rate": 2.9644268774703556e-06, + "loss": 0.2671, + "step": 1065 + }, + { + "epoch": 2.8362183754993344, + "grad_norm": 0.19192928047405172, + "learning_rate": 2.91501976284585e-06, + "loss": 0.2738, + "step": 1066 + }, + { + "epoch": 2.8388814913448734, + "grad_norm": 0.18813387022576608, + "learning_rate": 2.865612648221344e-06, + "loss": 0.2555, + "step": 1067 + }, + { + "epoch": 2.841544607190413, + "grad_norm": 0.17981642336035955, + "learning_rate": 2.816205533596838e-06, + "loss": 0.2649, + "step": 1068 + }, + { + "epoch": 2.844207723035952, + "grad_norm": 0.19082585501925517, + "learning_rate": 2.7667984189723323e-06, + "loss": 0.2717, + "step": 1069 + }, + { + "epoch": 2.8468708388814914, + "grad_norm": 0.1934715160744257, + "learning_rate": 2.7173913043478263e-06, + "loss": 0.2588, + "step": 1070 + }, + { + "epoch": 2.849533954727031, + "grad_norm": 0.1943027368827162, + "learning_rate": 2.6679841897233202e-06, + "loss": 0.2612, + "step": 1071 + }, + { + "epoch": 2.85219707057257, + "grad_norm": 0.20463059754180915, + "learning_rate": 2.6185770750988146e-06, + "loss": 0.2654, + "step": 1072 + }, + { + "epoch": 2.8548601864181093, + "grad_norm": 0.21078399413940485, + "learning_rate": 2.5691699604743086e-06, + "loss": 0.2671, + "step": 1073 + }, + { + "epoch": 2.8575233022636484, + "grad_norm": 0.20725181291345451, + "learning_rate": 2.5197628458498025e-06, + "loss": 0.2658, + "step": 1074 + }, + { + "epoch": 2.860186418109188, + "grad_norm": 0.19210859826009163, + "learning_rate": 2.4703557312252965e-06, + "loss": 0.2749, + "step": 1075 + }, + { + "epoch": 2.8628495339547273, + "grad_norm": 0.21087703729971102, + "learning_rate": 2.4209486166007905e-06, + "loss": 0.2565, + "step": 1076 + }, + { + "epoch": 2.8655126498002663, + "grad_norm": 0.1932869202958659, + "learning_rate": 2.3715415019762844e-06, + "loss": 0.2761, + "step": 1077 + }, + { + "epoch": 2.8681757656458053, + "grad_norm": 0.212098446975856, + "learning_rate": 2.322134387351779e-06, + "loss": 0.2739, + "step": 1078 + }, + { + "epoch": 2.870838881491345, + "grad_norm": 0.1907847773078055, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.2645, + "step": 1079 + }, + { + "epoch": 2.8735019973368843, + "grad_norm": 0.2138904901003034, + "learning_rate": 2.2233201581027667e-06, + "loss": 0.266, + "step": 1080 + }, + { + "epoch": 2.8761651131824233, + "grad_norm": 0.19201411133409543, + "learning_rate": 2.173913043478261e-06, + "loss": 0.2589, + "step": 1081 + }, + { + "epoch": 2.8788282290279628, + "grad_norm": 0.1845739978063396, + "learning_rate": 2.124505928853755e-06, + "loss": 0.2597, + "step": 1082 + }, + { + "epoch": 2.881491344873502, + "grad_norm": 0.19569151053283082, + "learning_rate": 2.0750988142292494e-06, + "loss": 0.2617, + "step": 1083 + }, + { + "epoch": 2.8841544607190412, + "grad_norm": 0.19194512760322638, + "learning_rate": 2.0256916996047434e-06, + "loss": 0.2741, + "step": 1084 + }, + { + "epoch": 2.8868175765645807, + "grad_norm": 0.19164700223613637, + "learning_rate": 1.9762845849802374e-06, + "loss": 0.2557, + "step": 1085 + }, + { + "epoch": 2.8894806924101197, + "grad_norm": 0.20722349213232807, + "learning_rate": 1.9268774703557313e-06, + "loss": 0.2811, + "step": 1086 + }, + { + "epoch": 2.892143808255659, + "grad_norm": 0.21395903599582983, + "learning_rate": 1.8774703557312253e-06, + "loss": 0.2697, + "step": 1087 + }, + { + "epoch": 2.894806924101198, + "grad_norm": 0.19932722434475636, + "learning_rate": 1.8280632411067192e-06, + "loss": 0.2848, + "step": 1088 + }, + { + "epoch": 2.8974700399467377, + "grad_norm": 0.19719366657115883, + "learning_rate": 1.7786561264822136e-06, + "loss": 0.2715, + "step": 1089 + }, + { + "epoch": 2.900133155792277, + "grad_norm": 0.1975588211380889, + "learning_rate": 1.7292490118577076e-06, + "loss": 0.263, + "step": 1090 + }, + { + "epoch": 2.902796271637816, + "grad_norm": 0.1939515446139924, + "learning_rate": 1.6798418972332018e-06, + "loss": 0.2576, + "step": 1091 + }, + { + "epoch": 2.9054593874833556, + "grad_norm": 0.21461670844381095, + "learning_rate": 1.6304347826086957e-06, + "loss": 0.2622, + "step": 1092 + }, + { + "epoch": 2.9081225033288947, + "grad_norm": 0.18141714157708164, + "learning_rate": 1.5810276679841897e-06, + "loss": 0.2602, + "step": 1093 + }, + { + "epoch": 2.910785619174434, + "grad_norm": 0.18648909903146674, + "learning_rate": 1.5316205533596839e-06, + "loss": 0.2544, + "step": 1094 + }, + { + "epoch": 2.9134487350199736, + "grad_norm": 0.19749530453878072, + "learning_rate": 1.4822134387351778e-06, + "loss": 0.2511, + "step": 1095 + }, + { + "epoch": 2.9161118508655126, + "grad_norm": 0.2008025174676635, + "learning_rate": 1.432806324110672e-06, + "loss": 0.2621, + "step": 1096 + }, + { + "epoch": 2.918774966711052, + "grad_norm": 0.1926237458483956, + "learning_rate": 1.3833992094861662e-06, + "loss": 0.2584, + "step": 1097 + }, + { + "epoch": 2.921438082556591, + "grad_norm": 0.1917953810867646, + "learning_rate": 1.3339920948616601e-06, + "loss": 0.2696, + "step": 1098 + }, + { + "epoch": 2.9241011984021306, + "grad_norm": 0.18863387793323863, + "learning_rate": 1.2845849802371543e-06, + "loss": 0.269, + "step": 1099 + }, + { + "epoch": 2.92676431424767, + "grad_norm": 0.18859923936820897, + "learning_rate": 1.2351778656126482e-06, + "loss": 0.2629, + "step": 1100 + }, + { + "epoch": 2.929427430093209, + "grad_norm": 0.18918722042687142, + "learning_rate": 1.1857707509881422e-06, + "loss": 0.2659, + "step": 1101 + }, + { + "epoch": 2.932090545938748, + "grad_norm": 0.1909436486504395, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.279, + "step": 1102 + }, + { + "epoch": 2.9347536617842875, + "grad_norm": 0.215394252478964, + "learning_rate": 1.0869565217391306e-06, + "loss": 0.2771, + "step": 1103 + }, + { + "epoch": 2.937416777629827, + "grad_norm": 0.1868050430391036, + "learning_rate": 1.0375494071146247e-06, + "loss": 0.255, + "step": 1104 + }, + { + "epoch": 2.940079893475366, + "grad_norm": 0.18705337019297927, + "learning_rate": 9.881422924901187e-07, + "loss": 0.2472, + "step": 1105 + }, + { + "epoch": 2.9427430093209055, + "grad_norm": 0.1935007995659731, + "learning_rate": 9.387351778656126e-07, + "loss": 0.2713, + "step": 1106 + }, + { + "epoch": 2.9454061251664445, + "grad_norm": 0.18412759277611498, + "learning_rate": 8.893280632411068e-07, + "loss": 0.2653, + "step": 1107 + }, + { + "epoch": 2.948069241011984, + "grad_norm": 0.18330377570006776, + "learning_rate": 8.399209486166009e-07, + "loss": 0.256, + "step": 1108 + }, + { + "epoch": 2.9507323568575234, + "grad_norm": 0.19950543771973236, + "learning_rate": 7.905138339920948e-07, + "loss": 0.2732, + "step": 1109 + }, + { + "epoch": 2.9533954727030625, + "grad_norm": 0.18701751210436693, + "learning_rate": 7.411067193675889e-07, + "loss": 0.2634, + "step": 1110 + }, + { + "epoch": 2.956058588548602, + "grad_norm": 0.18889807484399168, + "learning_rate": 6.916996047430831e-07, + "loss": 0.2519, + "step": 1111 + }, + { + "epoch": 2.958721704394141, + "grad_norm": 0.1898035633014786, + "learning_rate": 6.422924901185771e-07, + "loss": 0.2658, + "step": 1112 + }, + { + "epoch": 2.9613848202396804, + "grad_norm": 0.1864905294817814, + "learning_rate": 5.928853754940711e-07, + "loss": 0.2562, + "step": 1113 + }, + { + "epoch": 2.96404793608522, + "grad_norm": 0.18976880996630371, + "learning_rate": 5.434782608695653e-07, + "loss": 0.264, + "step": 1114 + }, + { + "epoch": 2.966711051930759, + "grad_norm": 0.19331420232956223, + "learning_rate": 4.940711462450593e-07, + "loss": 0.273, + "step": 1115 + }, + { + "epoch": 2.9693741677762984, + "grad_norm": 0.1930205378531215, + "learning_rate": 4.446640316205534e-07, + "loss": 0.2592, + "step": 1116 + }, + { + "epoch": 2.9720372836218374, + "grad_norm": 0.19028897264532088, + "learning_rate": 3.952569169960474e-07, + "loss": 0.2654, + "step": 1117 + }, + { + "epoch": 2.974700399467377, + "grad_norm": 0.19156481816748225, + "learning_rate": 3.4584980237154154e-07, + "loss": 0.261, + "step": 1118 + }, + { + "epoch": 2.9773635153129163, + "grad_norm": 0.1889476580235995, + "learning_rate": 2.9644268774703555e-07, + "loss": 0.2566, + "step": 1119 + }, + { + "epoch": 2.9800266311584553, + "grad_norm": 0.19663277621172817, + "learning_rate": 2.4703557312252967e-07, + "loss": 0.2751, + "step": 1120 + }, + { + "epoch": 2.982689747003995, + "grad_norm": 0.1848208372611624, + "learning_rate": 1.976284584980237e-07, + "loss": 0.2633, + "step": 1121 + }, + { + "epoch": 2.985352862849534, + "grad_norm": 0.18259691758877614, + "learning_rate": 1.4822134387351778e-07, + "loss": 0.2696, + "step": 1122 + }, + { + "epoch": 2.9880159786950733, + "grad_norm": 0.1849664900149779, + "learning_rate": 9.881422924901186e-08, + "loss": 0.2704, + "step": 1123 + }, + { + "epoch": 2.9906790945406128, + "grad_norm": 0.1854714711613864, + "learning_rate": 4.940711462450593e-08, + "loss": 0.2613, + "step": 1124 + }, + { + "epoch": 2.993342210386152, + "grad_norm": 0.18380044771707796, + "learning_rate": 0.0, + "loss": 0.2614, + "step": 1125 + }, + { + "epoch": 2.993342210386152, + "step": 1125, + "total_flos": 9.575573608085586e+17, + "train_loss": 0.4614936934842004, + "train_runtime": 99022.1208, + "train_samples_per_second": 0.182, + "train_steps_per_second": 0.011 + } + ], + "logging_steps": 1, + "max_steps": 1125, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.575573608085586e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}