{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9893190921228303, "eval_steps": 500, "global_step": 747, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004005340453938585, "grad_norm": 12.274865344598787, "learning_rate": 0.0, "loss": 0.8646, "step": 1 }, { "epoch": 0.00801068090787717, "grad_norm": 12.535290073417938, "learning_rate": 1.3333333333333336e-07, "loss": 0.8676, "step": 2 }, { "epoch": 0.012016021361815754, "grad_norm": 13.07061199884287, "learning_rate": 2.666666666666667e-07, "loss": 0.8902, "step": 3 }, { "epoch": 0.01602136181575434, "grad_norm": 12.267049803439043, "learning_rate": 4.0000000000000003e-07, "loss": 0.8587, "step": 4 }, { "epoch": 0.020026702269692925, "grad_norm": 11.897699285802622, "learning_rate": 5.333333333333335e-07, "loss": 0.8435, "step": 5 }, { "epoch": 0.02403204272363151, "grad_norm": 13.076081077633422, "learning_rate": 6.666666666666667e-07, "loss": 0.848, "step": 6 }, { "epoch": 0.028037383177570093, "grad_norm": 12.354188617098337, "learning_rate": 8.000000000000001e-07, "loss": 0.8607, "step": 7 }, { "epoch": 0.03204272363150868, "grad_norm": 11.607524627207871, "learning_rate": 9.333333333333334e-07, "loss": 0.8296, "step": 8 }, { "epoch": 0.036048064085447265, "grad_norm": 11.787571525976068, "learning_rate": 1.066666666666667e-06, "loss": 0.8279, "step": 9 }, { "epoch": 0.04005340453938585, "grad_norm": 10.854034800032643, "learning_rate": 1.2000000000000002e-06, "loss": 0.8255, "step": 10 }, { "epoch": 0.044058744993324434, "grad_norm": 10.331302172366403, "learning_rate": 1.3333333333333334e-06, "loss": 0.8452, "step": 11 }, { "epoch": 0.04806408544726302, "grad_norm": 10.51455049392686, "learning_rate": 1.4666666666666669e-06, "loss": 0.8477, "step": 12 }, { "epoch": 0.0520694259012016, "grad_norm": 6.804734157169537, "learning_rate": 1.6000000000000001e-06, "loss": 0.702, "step": 13 }, { "epoch": 0.056074766355140186, "grad_norm": 6.630143554129149, "learning_rate": 1.7333333333333336e-06, "loss": 0.716, "step": 14 }, { "epoch": 0.06008010680907877, "grad_norm": 5.650387053647136, "learning_rate": 1.8666666666666669e-06, "loss": 0.6831, "step": 15 }, { "epoch": 0.06408544726301736, "grad_norm": 5.847220094115504, "learning_rate": 2.0000000000000003e-06, "loss": 0.7188, "step": 16 }, { "epoch": 0.06809078771695594, "grad_norm": 2.7400023117365913, "learning_rate": 2.133333333333334e-06, "loss": 0.6409, "step": 17 }, { "epoch": 0.07209612817089453, "grad_norm": 2.2018255790223518, "learning_rate": 2.266666666666667e-06, "loss": 0.539, "step": 18 }, { "epoch": 0.07610146862483311, "grad_norm": 2.392641868923764, "learning_rate": 2.4000000000000003e-06, "loss": 0.6092, "step": 19 }, { "epoch": 0.0801068090787717, "grad_norm": 2.1787060198095847, "learning_rate": 2.5333333333333338e-06, "loss": 0.6169, "step": 20 }, { "epoch": 0.08411214953271028, "grad_norm": 1.8580036959151014, "learning_rate": 2.666666666666667e-06, "loss": 0.5581, "step": 21 }, { "epoch": 0.08811748998664887, "grad_norm": 1.8125640693516234, "learning_rate": 2.8000000000000003e-06, "loss": 0.5566, "step": 22 }, { "epoch": 0.09212283044058744, "grad_norm": 1.7280121022360342, "learning_rate": 2.9333333333333338e-06, "loss": 0.5672, "step": 23 }, { "epoch": 0.09612817089452604, "grad_norm": 1.8991117514168228, "learning_rate": 3.066666666666667e-06, "loss": 0.5354, "step": 24 }, { "epoch": 0.10013351134846461, "grad_norm": 1.6356492105183125, "learning_rate": 3.2000000000000003e-06, "loss": 0.5275, "step": 25 }, { "epoch": 0.1041388518024032, "grad_norm": 1.6267128477825465, "learning_rate": 3.3333333333333333e-06, "loss": 0.5338, "step": 26 }, { "epoch": 0.1081441922563418, "grad_norm": 1.404686847432176, "learning_rate": 3.4666666666666672e-06, "loss": 0.5197, "step": 27 }, { "epoch": 0.11214953271028037, "grad_norm": 1.303663556401033, "learning_rate": 3.6000000000000003e-06, "loss": 0.5308, "step": 28 }, { "epoch": 0.11615487316421896, "grad_norm": 1.1046386263286005, "learning_rate": 3.7333333333333337e-06, "loss": 0.5012, "step": 29 }, { "epoch": 0.12016021361815754, "grad_norm": 1.0758105382558327, "learning_rate": 3.866666666666667e-06, "loss": 0.4804, "step": 30 }, { "epoch": 0.12416555407209613, "grad_norm": 0.889059378144954, "learning_rate": 4.000000000000001e-06, "loss": 0.4571, "step": 31 }, { "epoch": 0.12817089452603472, "grad_norm": 0.9541992345873649, "learning_rate": 4.133333333333333e-06, "loss": 0.4188, "step": 32 }, { "epoch": 0.1321762349799733, "grad_norm": 0.9939649352643045, "learning_rate": 4.266666666666668e-06, "loss": 0.4638, "step": 33 }, { "epoch": 0.13618157543391188, "grad_norm": 1.0368182385408335, "learning_rate": 4.4e-06, "loss": 0.4594, "step": 34 }, { "epoch": 0.14018691588785046, "grad_norm": 1.0252643282112182, "learning_rate": 4.533333333333334e-06, "loss": 0.4349, "step": 35 }, { "epoch": 0.14419225634178906, "grad_norm": 0.8930734240919034, "learning_rate": 4.666666666666667e-06, "loss": 0.4105, "step": 36 }, { "epoch": 0.14819759679572764, "grad_norm": 0.8638620093928763, "learning_rate": 4.800000000000001e-06, "loss": 0.444, "step": 37 }, { "epoch": 0.15220293724966621, "grad_norm": 0.8266472764867793, "learning_rate": 4.933333333333334e-06, "loss": 0.4264, "step": 38 }, { "epoch": 0.15620827770360482, "grad_norm": 0.7587973597324337, "learning_rate": 5.0666666666666676e-06, "loss": 0.4153, "step": 39 }, { "epoch": 0.1602136181575434, "grad_norm": 0.7046790303627571, "learning_rate": 5.2e-06, "loss": 0.3968, "step": 40 }, { "epoch": 0.16421895861148197, "grad_norm": 0.7828655737674856, "learning_rate": 5.333333333333334e-06, "loss": 0.425, "step": 41 }, { "epoch": 0.16822429906542055, "grad_norm": 0.7970939916520573, "learning_rate": 5.466666666666667e-06, "loss": 0.4055, "step": 42 }, { "epoch": 0.17222963951935916, "grad_norm": 0.6788619839771596, "learning_rate": 5.600000000000001e-06, "loss": 0.3942, "step": 43 }, { "epoch": 0.17623497997329773, "grad_norm": 0.6213101486173681, "learning_rate": 5.733333333333334e-06, "loss": 0.4008, "step": 44 }, { "epoch": 0.1802403204272363, "grad_norm": 0.5942610468735896, "learning_rate": 5.8666666666666675e-06, "loss": 0.3918, "step": 45 }, { "epoch": 0.1842456608811749, "grad_norm": 0.661249969118244, "learning_rate": 6e-06, "loss": 0.3713, "step": 46 }, { "epoch": 0.1882510013351135, "grad_norm": 0.6165605112645042, "learning_rate": 6.133333333333334e-06, "loss": 0.3695, "step": 47 }, { "epoch": 0.19225634178905207, "grad_norm": 0.6418004850122087, "learning_rate": 6.266666666666668e-06, "loss": 0.3855, "step": 48 }, { "epoch": 0.19626168224299065, "grad_norm": 0.671209019626683, "learning_rate": 6.4000000000000006e-06, "loss": 0.4228, "step": 49 }, { "epoch": 0.20026702269692923, "grad_norm": 0.6303030288370243, "learning_rate": 6.533333333333334e-06, "loss": 0.3711, "step": 50 }, { "epoch": 0.20427236315086783, "grad_norm": 0.6417652044922048, "learning_rate": 6.666666666666667e-06, "loss": 0.389, "step": 51 }, { "epoch": 0.2082777036048064, "grad_norm": 0.572170970965431, "learning_rate": 6.800000000000001e-06, "loss": 0.3795, "step": 52 }, { "epoch": 0.21228304405874499, "grad_norm": 0.5711694232400057, "learning_rate": 6.9333333333333344e-06, "loss": 0.3689, "step": 53 }, { "epoch": 0.2162883845126836, "grad_norm": 0.5910040436075836, "learning_rate": 7.066666666666667e-06, "loss": 0.365, "step": 54 }, { "epoch": 0.22029372496662217, "grad_norm": 0.6284207342849625, "learning_rate": 7.2000000000000005e-06, "loss": 0.4132, "step": 55 }, { "epoch": 0.22429906542056074, "grad_norm": 0.5849289722490485, "learning_rate": 7.333333333333333e-06, "loss": 0.373, "step": 56 }, { "epoch": 0.22830440587449932, "grad_norm": 0.6341921136746668, "learning_rate": 7.4666666666666675e-06, "loss": 0.3918, "step": 57 }, { "epoch": 0.23230974632843793, "grad_norm": 0.5938896188604564, "learning_rate": 7.600000000000001e-06, "loss": 0.3663, "step": 58 }, { "epoch": 0.2363150867823765, "grad_norm": 0.5821270563686713, "learning_rate": 7.733333333333334e-06, "loss": 0.3465, "step": 59 }, { "epoch": 0.24032042723631508, "grad_norm": 0.5958193467288128, "learning_rate": 7.866666666666667e-06, "loss": 0.3619, "step": 60 }, { "epoch": 0.24432576769025366, "grad_norm": 0.5778869298012563, "learning_rate": 8.000000000000001e-06, "loss": 0.3501, "step": 61 }, { "epoch": 0.24833110814419226, "grad_norm": 0.5809265935247063, "learning_rate": 8.133333333333334e-06, "loss": 0.3593, "step": 62 }, { "epoch": 0.2523364485981308, "grad_norm": 0.5301298964648872, "learning_rate": 8.266666666666667e-06, "loss": 0.3262, "step": 63 }, { "epoch": 0.25634178905206945, "grad_norm": 0.6073029142771318, "learning_rate": 8.400000000000001e-06, "loss": 0.3458, "step": 64 }, { "epoch": 0.260347129506008, "grad_norm": 0.5862505044336555, "learning_rate": 8.533333333333335e-06, "loss": 0.3444, "step": 65 }, { "epoch": 0.2643524699599466, "grad_norm": 0.5545311112728927, "learning_rate": 8.666666666666668e-06, "loss": 0.379, "step": 66 }, { "epoch": 0.2683578104138852, "grad_norm": 0.5979912535625811, "learning_rate": 8.8e-06, "loss": 0.3582, "step": 67 }, { "epoch": 0.27236315086782376, "grad_norm": 0.5727967554965969, "learning_rate": 8.933333333333333e-06, "loss": 0.3428, "step": 68 }, { "epoch": 0.27636849132176233, "grad_norm": 0.6017340630111007, "learning_rate": 9.066666666666667e-06, "loss": 0.3587, "step": 69 }, { "epoch": 0.2803738317757009, "grad_norm": 0.5498581806098397, "learning_rate": 9.200000000000002e-06, "loss": 0.3567, "step": 70 }, { "epoch": 0.28437917222963954, "grad_norm": 0.5526640416700183, "learning_rate": 9.333333333333334e-06, "loss": 0.3337, "step": 71 }, { "epoch": 0.2883845126835781, "grad_norm": 0.5492315682122837, "learning_rate": 9.466666666666667e-06, "loss": 0.3486, "step": 72 }, { "epoch": 0.2923898531375167, "grad_norm": 0.5494699596828775, "learning_rate": 9.600000000000001e-06, "loss": 0.3374, "step": 73 }, { "epoch": 0.2963951935914553, "grad_norm": 0.630131268447689, "learning_rate": 9.733333333333334e-06, "loss": 0.3568, "step": 74 }, { "epoch": 0.30040053404539385, "grad_norm": 0.6336383497338373, "learning_rate": 9.866666666666668e-06, "loss": 0.3616, "step": 75 }, { "epoch": 0.30440587449933243, "grad_norm": 0.5624776217319135, "learning_rate": 1e-05, "loss": 0.3505, "step": 76 }, { "epoch": 0.308411214953271, "grad_norm": 0.5899336003315098, "learning_rate": 9.999945361292553e-06, "loss": 0.3576, "step": 77 }, { "epoch": 0.31241655540720964, "grad_norm": 0.6756783302903452, "learning_rate": 9.999781446364366e-06, "loss": 0.3519, "step": 78 }, { "epoch": 0.3164218958611482, "grad_norm": 0.5644425121126243, "learning_rate": 9.999508258797876e-06, "loss": 0.3164, "step": 79 }, { "epoch": 0.3204272363150868, "grad_norm": 0.587830154954018, "learning_rate": 9.999125804563732e-06, "loss": 0.3268, "step": 80 }, { "epoch": 0.32443257676902537, "grad_norm": 0.5884963318825209, "learning_rate": 9.998634092020659e-06, "loss": 0.345, "step": 81 }, { "epoch": 0.32843791722296395, "grad_norm": 0.6887604600916913, "learning_rate": 9.998033131915266e-06, "loss": 0.3803, "step": 82 }, { "epoch": 0.3324432576769025, "grad_norm": 0.5384174093778301, "learning_rate": 9.997322937381829e-06, "loss": 0.3344, "step": 83 }, { "epoch": 0.3364485981308411, "grad_norm": 0.5281542891218585, "learning_rate": 9.996503523941994e-06, "loss": 0.32, "step": 84 }, { "epoch": 0.3404539385847797, "grad_norm": 0.5164890027100179, "learning_rate": 9.995574909504434e-06, "loss": 0.3204, "step": 85 }, { "epoch": 0.3444592790387183, "grad_norm": 0.6252952243404047, "learning_rate": 9.994537114364471e-06, "loss": 0.3335, "step": 86 }, { "epoch": 0.3484646194926569, "grad_norm": 0.5558865473599024, "learning_rate": 9.993390161203615e-06, "loss": 0.3311, "step": 87 }, { "epoch": 0.35246995994659547, "grad_norm": 0.5761999832501623, "learning_rate": 9.992134075089085e-06, "loss": 0.3429, "step": 88 }, { "epoch": 0.35647530040053405, "grad_norm": 0.5374104324302127, "learning_rate": 9.990768883473243e-06, "loss": 0.3302, "step": 89 }, { "epoch": 0.3604806408544726, "grad_norm": 0.5310051502544871, "learning_rate": 9.989294616193018e-06, "loss": 0.345, "step": 90 }, { "epoch": 0.3644859813084112, "grad_norm": 0.6006990772867254, "learning_rate": 9.987711305469232e-06, "loss": 0.3351, "step": 91 }, { "epoch": 0.3684913217623498, "grad_norm": 0.559674099597398, "learning_rate": 9.986018985905901e-06, "loss": 0.3423, "step": 92 }, { "epoch": 0.3724966622162884, "grad_norm": 0.5126707083736739, "learning_rate": 9.984217694489493e-06, "loss": 0.344, "step": 93 }, { "epoch": 0.376502002670227, "grad_norm": 0.6357067005494667, "learning_rate": 9.982307470588097e-06, "loss": 0.3356, "step": 94 }, { "epoch": 0.38050734312416556, "grad_norm": 0.5258363366242368, "learning_rate": 9.98028835595058e-06, "loss": 0.3405, "step": 95 }, { "epoch": 0.38451268357810414, "grad_norm": 0.6022837706168479, "learning_rate": 9.978160394705669e-06, "loss": 0.3451, "step": 96 }, { "epoch": 0.3885180240320427, "grad_norm": 0.5617885913949726, "learning_rate": 9.975923633360985e-06, "loss": 0.3141, "step": 97 }, { "epoch": 0.3925233644859813, "grad_norm": 0.6322761944732146, "learning_rate": 9.973578120802025e-06, "loss": 0.3225, "step": 98 }, { "epoch": 0.3965287049399199, "grad_norm": 0.5513939521450553, "learning_rate": 9.971123908291103e-06, "loss": 0.3269, "step": 99 }, { "epoch": 0.40053404539385845, "grad_norm": 0.5692799860975164, "learning_rate": 9.968561049466214e-06, "loss": 0.337, "step": 100 }, { "epoch": 0.4045393858477971, "grad_norm": 0.552465429973677, "learning_rate": 9.965889600339877e-06, "loss": 0.3256, "step": 101 }, { "epoch": 0.40854472630173566, "grad_norm": 0.5542585997979107, "learning_rate": 9.963109619297905e-06, "loss": 0.3147, "step": 102 }, { "epoch": 0.41255006675567424, "grad_norm": 0.5724996614177005, "learning_rate": 9.960221167098124e-06, "loss": 0.3034, "step": 103 }, { "epoch": 0.4165554072096128, "grad_norm": 0.5546269037589538, "learning_rate": 9.957224306869053e-06, "loss": 0.3283, "step": 104 }, { "epoch": 0.4205607476635514, "grad_norm": 0.5445864929651966, "learning_rate": 9.95411910410852e-06, "loss": 0.3161, "step": 105 }, { "epoch": 0.42456608811748997, "grad_norm": 0.5679124498352474, "learning_rate": 9.950905626682229e-06, "loss": 0.3205, "step": 106 }, { "epoch": 0.42857142857142855, "grad_norm": 0.5746860342884514, "learning_rate": 9.947583944822284e-06, "loss": 0.3087, "step": 107 }, { "epoch": 0.4325767690253672, "grad_norm": 0.6293503344651058, "learning_rate": 9.944154131125643e-06, "loss": 0.3481, "step": 108 }, { "epoch": 0.43658210947930576, "grad_norm": 0.5733682700314644, "learning_rate": 9.940616260552545e-06, "loss": 0.3292, "step": 109 }, { "epoch": 0.44058744993324434, "grad_norm": 0.5593414000264296, "learning_rate": 9.936970410424857e-06, "loss": 0.3282, "step": 110 }, { "epoch": 0.4445927903871829, "grad_norm": 0.5928528566284356, "learning_rate": 9.933216660424396e-06, "loss": 0.3305, "step": 111 }, { "epoch": 0.4485981308411215, "grad_norm": 0.5958761955618564, "learning_rate": 9.92935509259118e-06, "loss": 0.3372, "step": 112 }, { "epoch": 0.45260347129506007, "grad_norm": 0.5974132983016888, "learning_rate": 9.92538579132164e-06, "loss": 0.3258, "step": 113 }, { "epoch": 0.45660881174899864, "grad_norm": 0.576878904820484, "learning_rate": 9.921308843366773e-06, "loss": 0.3223, "step": 114 }, { "epoch": 0.4606141522029373, "grad_norm": 0.5705071294697854, "learning_rate": 9.917124337830242e-06, "loss": 0.3078, "step": 115 }, { "epoch": 0.46461949265687585, "grad_norm": 0.630779206880613, "learning_rate": 9.912832366166443e-06, "loss": 0.3405, "step": 116 }, { "epoch": 0.46862483311081443, "grad_norm": 0.612175246698219, "learning_rate": 9.908433022178484e-06, "loss": 0.3247, "step": 117 }, { "epoch": 0.472630173564753, "grad_norm": 0.5888384503018512, "learning_rate": 9.903926402016153e-06, "loss": 0.3237, "step": 118 }, { "epoch": 0.4766355140186916, "grad_norm": 0.584706307851654, "learning_rate": 9.899312604173814e-06, "loss": 0.3289, "step": 119 }, { "epoch": 0.48064085447263016, "grad_norm": 0.5850586724735871, "learning_rate": 9.894591729488243e-06, "loss": 0.3103, "step": 120 }, { "epoch": 0.48464619492656874, "grad_norm": 0.5676874241983956, "learning_rate": 9.889763881136439e-06, "loss": 0.3416, "step": 121 }, { "epoch": 0.4886515353805073, "grad_norm": 0.5507142647058878, "learning_rate": 9.884829164633359e-06, "loss": 0.332, "step": 122 }, { "epoch": 0.49265687583444595, "grad_norm": 0.5883422169944877, "learning_rate": 9.879787687829616e-06, "loss": 0.341, "step": 123 }, { "epoch": 0.49666221628838453, "grad_norm": 0.6052974016126247, "learning_rate": 9.874639560909118e-06, "loss": 0.3145, "step": 124 }, { "epoch": 0.5006675567423231, "grad_norm": 0.5623497695712161, "learning_rate": 9.869384896386669e-06, "loss": 0.324, "step": 125 }, { "epoch": 0.5046728971962616, "grad_norm": 0.637548494608964, "learning_rate": 9.864023809105497e-06, "loss": 0.3512, "step": 126 }, { "epoch": 0.5086782376502003, "grad_norm": 0.5948450995539613, "learning_rate": 9.858556416234755e-06, "loss": 0.3323, "step": 127 }, { "epoch": 0.5126835781041389, "grad_norm": 0.6071663583544622, "learning_rate": 9.852982837266955e-06, "loss": 0.3106, "step": 128 }, { "epoch": 0.5166889185580774, "grad_norm": 0.5702598727693834, "learning_rate": 9.847303194015358e-06, "loss": 0.2964, "step": 129 }, { "epoch": 0.520694259012016, "grad_norm": 0.5427287917310376, "learning_rate": 9.841517610611309e-06, "loss": 0.3146, "step": 130 }, { "epoch": 0.5246995994659546, "grad_norm": 0.5757835163942887, "learning_rate": 9.835626213501526e-06, "loss": 0.2962, "step": 131 }, { "epoch": 0.5287049399198932, "grad_norm": 0.5698018296606896, "learning_rate": 9.829629131445342e-06, "loss": 0.3193, "step": 132 }, { "epoch": 0.5327102803738317, "grad_norm": 0.5410707033561443, "learning_rate": 9.82352649551188e-06, "loss": 0.3106, "step": 133 }, { "epoch": 0.5367156208277704, "grad_norm": 0.5502177437943316, "learning_rate": 9.817318439077197e-06, "loss": 0.3085, "step": 134 }, { "epoch": 0.540720961281709, "grad_norm": 0.5582555787315889, "learning_rate": 9.811005097821362e-06, "loss": 0.3151, "step": 135 }, { "epoch": 0.5447263017356475, "grad_norm": 0.530130894795682, "learning_rate": 9.804586609725499e-06, "loss": 0.3144, "step": 136 }, { "epoch": 0.5487316421895861, "grad_norm": 0.5740015673727541, "learning_rate": 9.798063115068766e-06, "loss": 0.3306, "step": 137 }, { "epoch": 0.5527369826435247, "grad_norm": 0.5191944650205522, "learning_rate": 9.791434756425288e-06, "loss": 0.3084, "step": 138 }, { "epoch": 0.5567423230974633, "grad_norm": 0.5972892802378095, "learning_rate": 9.784701678661045e-06, "loss": 0.3163, "step": 139 }, { "epoch": 0.5607476635514018, "grad_norm": 0.5504117854519875, "learning_rate": 9.777864028930705e-06, "loss": 0.3167, "step": 140 }, { "epoch": 0.5647530040053405, "grad_norm": 0.537223650006093, "learning_rate": 9.770921956674402e-06, "loss": 0.3006, "step": 141 }, { "epoch": 0.5687583444592791, "grad_norm": 0.5781957843358095, "learning_rate": 9.763875613614482e-06, "loss": 0.3123, "step": 142 }, { "epoch": 0.5727636849132176, "grad_norm": 0.5957683062334633, "learning_rate": 9.756725153752173e-06, "loss": 0.3154, "step": 143 }, { "epoch": 0.5767690253671562, "grad_norm": 0.5368008525982312, "learning_rate": 9.749470733364231e-06, "loss": 0.3108, "step": 144 }, { "epoch": 0.5807743658210948, "grad_norm": 0.5390147399817238, "learning_rate": 9.742112510999516e-06, "loss": 0.3267, "step": 145 }, { "epoch": 0.5847797062750334, "grad_norm": 0.538592138527249, "learning_rate": 9.73465064747553e-06, "loss": 0.3034, "step": 146 }, { "epoch": 0.5887850467289719, "grad_norm": 0.5909802194217371, "learning_rate": 9.727085305874906e-06, "loss": 0.3273, "step": 147 }, { "epoch": 0.5927903871829105, "grad_norm": 0.5625936142604115, "learning_rate": 9.719416651541839e-06, "loss": 0.3229, "step": 148 }, { "epoch": 0.5967957276368492, "grad_norm": 0.5606718697695303, "learning_rate": 9.711644852078472e-06, "loss": 0.3107, "step": 149 }, { "epoch": 0.6008010680907877, "grad_norm": 0.6078858789603624, "learning_rate": 9.703770077341236e-06, "loss": 0.3229, "step": 150 }, { "epoch": 0.6048064085447263, "grad_norm": 0.5549984708342697, "learning_rate": 9.69579249943714e-06, "loss": 0.3129, "step": 151 }, { "epoch": 0.6088117489986649, "grad_norm": 0.507347145487873, "learning_rate": 9.687712292719997e-06, "loss": 0.3002, "step": 152 }, { "epoch": 0.6128170894526035, "grad_norm": 0.6833991975396279, "learning_rate": 9.67952963378663e-06, "loss": 0.3087, "step": 153 }, { "epoch": 0.616822429906542, "grad_norm": 0.5915486951914966, "learning_rate": 9.671244701472999e-06, "loss": 0.3393, "step": 154 }, { "epoch": 0.6208277703604806, "grad_norm": 0.5643268444514835, "learning_rate": 9.662857676850306e-06, "loss": 0.2944, "step": 155 }, { "epoch": 0.6248331108144193, "grad_norm": 0.6409184914823735, "learning_rate": 9.654368743221022e-06, "loss": 0.3247, "step": 156 }, { "epoch": 0.6288384512683578, "grad_norm": 0.625767486890822, "learning_rate": 9.645778086114892e-06, "loss": 0.3134, "step": 157 }, { "epoch": 0.6328437917222964, "grad_norm": 0.5700497840235004, "learning_rate": 9.637085893284875e-06, "loss": 0.3023, "step": 158 }, { "epoch": 0.636849132176235, "grad_norm": 0.5790034198291902, "learning_rate": 9.628292354703046e-06, "loss": 0.2933, "step": 159 }, { "epoch": 0.6408544726301736, "grad_norm": 0.5713100840362291, "learning_rate": 9.619397662556434e-06, "loss": 0.3042, "step": 160 }, { "epoch": 0.6448598130841121, "grad_norm": 0.5741995306695465, "learning_rate": 9.610402011242837e-06, "loss": 0.3196, "step": 161 }, { "epoch": 0.6488651535380507, "grad_norm": 0.5674889795972151, "learning_rate": 9.601305597366553e-06, "loss": 0.3071, "step": 162 }, { "epoch": 0.6528704939919893, "grad_norm": 0.5369636049566915, "learning_rate": 9.592108619734107e-06, "loss": 0.3247, "step": 163 }, { "epoch": 0.6568758344459279, "grad_norm": 0.5443809471875736, "learning_rate": 9.582811279349881e-06, "loss": 0.3072, "step": 164 }, { "epoch": 0.6608811748998665, "grad_norm": 0.5953124014685344, "learning_rate": 9.573413779411745e-06, "loss": 0.3085, "step": 165 }, { "epoch": 0.664886515353805, "grad_norm": 0.5401734564217464, "learning_rate": 9.563916325306595e-06, "loss": 0.29, "step": 166 }, { "epoch": 0.6688918558077437, "grad_norm": 0.5444349651712469, "learning_rate": 9.55431912460588e-06, "loss": 0.3054, "step": 167 }, { "epoch": 0.6728971962616822, "grad_norm": 0.510267722435052, "learning_rate": 9.544622387061055e-06, "loss": 0.28, "step": 168 }, { "epoch": 0.6769025367156208, "grad_norm": 0.5184264543864224, "learning_rate": 9.534826324599002e-06, "loss": 0.2955, "step": 169 }, { "epoch": 0.6809078771695594, "grad_norm": 0.5637636626391551, "learning_rate": 9.5249311513174e-06, "loss": 0.2792, "step": 170 }, { "epoch": 0.684913217623498, "grad_norm": 0.5428313722322577, "learning_rate": 9.514937083480037e-06, "loss": 0.2945, "step": 171 }, { "epoch": 0.6889185580774366, "grad_norm": 0.5561412606219924, "learning_rate": 9.504844339512096e-06, "loss": 0.315, "step": 172 }, { "epoch": 0.6929238985313751, "grad_norm": 0.5081631254602269, "learning_rate": 9.494653139995368e-06, "loss": 0.3066, "step": 173 }, { "epoch": 0.6969292389853138, "grad_norm": 0.5856758014884262, "learning_rate": 9.484363707663443e-06, "loss": 0.2801, "step": 174 }, { "epoch": 0.7009345794392523, "grad_norm": 0.5414939052665023, "learning_rate": 9.473976267396831e-06, "loss": 0.2894, "step": 175 }, { "epoch": 0.7049399198931909, "grad_norm": 0.5188788468344311, "learning_rate": 9.463491046218058e-06, "loss": 0.2917, "step": 176 }, { "epoch": 0.7089452603471295, "grad_norm": 0.6208165205427856, "learning_rate": 9.452908273286699e-06, "loss": 0.3124, "step": 177 }, { "epoch": 0.7129506008010681, "grad_norm": 0.4892884392964166, "learning_rate": 9.442228179894362e-06, "loss": 0.2937, "step": 178 }, { "epoch": 0.7169559412550067, "grad_norm": 0.5126422005865922, "learning_rate": 9.431450999459653e-06, "loss": 0.2902, "step": 179 }, { "epoch": 0.7209612817089452, "grad_norm": 0.5578472838688182, "learning_rate": 9.420576967523049e-06, "loss": 0.2886, "step": 180 }, { "epoch": 0.7249666221628839, "grad_norm": 0.5457635354712708, "learning_rate": 9.409606321741776e-06, "loss": 0.299, "step": 181 }, { "epoch": 0.7289719626168224, "grad_norm": 0.5665035398169768, "learning_rate": 9.398539301884592e-06, "loss": 0.2975, "step": 182 }, { "epoch": 0.732977303070761, "grad_norm": 0.5286590948141091, "learning_rate": 9.387376149826564e-06, "loss": 0.2767, "step": 183 }, { "epoch": 0.7369826435246996, "grad_norm": 0.5743317468381327, "learning_rate": 9.376117109543769e-06, "loss": 0.2909, "step": 184 }, { "epoch": 0.7409879839786382, "grad_norm": 0.6431953313269012, "learning_rate": 9.364762427107971e-06, "loss": 0.3004, "step": 185 }, { "epoch": 0.7449933244325768, "grad_norm": 0.6117784289038739, "learning_rate": 9.353312350681242e-06, "loss": 0.3062, "step": 186 }, { "epoch": 0.7489986648865153, "grad_norm": 0.5466166236913528, "learning_rate": 9.341767130510529e-06, "loss": 0.3047, "step": 187 }, { "epoch": 0.753004005340454, "grad_norm": 0.5672388678846847, "learning_rate": 9.330127018922195e-06, "loss": 0.3099, "step": 188 }, { "epoch": 0.7570093457943925, "grad_norm": 0.5854324070547063, "learning_rate": 9.318392270316501e-06, "loss": 0.3097, "step": 189 }, { "epoch": 0.7610146862483311, "grad_norm": 0.5582358269319914, "learning_rate": 9.306563141162046e-06, "loss": 0.3061, "step": 190 }, { "epoch": 0.7650200267022697, "grad_norm": 0.5807552655282949, "learning_rate": 9.29463988999016e-06, "loss": 0.3004, "step": 191 }, { "epoch": 0.7690253671562083, "grad_norm": 0.5445709450895333, "learning_rate": 9.282622777389258e-06, "loss": 0.2864, "step": 192 }, { "epoch": 0.7730307076101469, "grad_norm": 0.6479747482171502, "learning_rate": 9.270512065999139e-06, "loss": 0.2979, "step": 193 }, { "epoch": 0.7770360480640854, "grad_norm": 0.5604311181657256, "learning_rate": 9.258308020505247e-06, "loss": 0.2997, "step": 194 }, { "epoch": 0.7810413885180241, "grad_norm": 0.5829812198586952, "learning_rate": 9.246010907632894e-06, "loss": 0.3233, "step": 195 }, { "epoch": 0.7850467289719626, "grad_norm": 0.580492515245855, "learning_rate": 9.233620996141421e-06, "loss": 0.299, "step": 196 }, { "epoch": 0.7890520694259012, "grad_norm": 0.592467400441266, "learning_rate": 9.221138556818327e-06, "loss": 0.2967, "step": 197 }, { "epoch": 0.7930574098798397, "grad_norm": 0.5997985716991547, "learning_rate": 9.20856386247335e-06, "loss": 0.3123, "step": 198 }, { "epoch": 0.7970627503337784, "grad_norm": 0.5397407737667249, "learning_rate": 9.195897187932513e-06, "loss": 0.2953, "step": 199 }, { "epoch": 0.8010680907877169, "grad_norm": 0.5362871237537865, "learning_rate": 9.1831388100321e-06, "loss": 0.283, "step": 200 }, { "epoch": 0.8050734312416555, "grad_norm": 0.5737882959212091, "learning_rate": 9.170289007612625e-06, "loss": 0.2922, "step": 201 }, { "epoch": 0.8090787716955942, "grad_norm": 0.5932417559868806, "learning_rate": 9.157348061512728e-06, "loss": 0.2955, "step": 202 }, { "epoch": 0.8130841121495327, "grad_norm": 0.5072437104528961, "learning_rate": 9.144316254563032e-06, "loss": 0.2696, "step": 203 }, { "epoch": 0.8170894526034713, "grad_norm": 0.557818245925382, "learning_rate": 9.131193871579975e-06, "loss": 0.2994, "step": 204 }, { "epoch": 0.8210947930574098, "grad_norm": 0.5973919395531655, "learning_rate": 9.117981199359575e-06, "loss": 0.3008, "step": 205 }, { "epoch": 0.8251001335113485, "grad_norm": 0.574306055152991, "learning_rate": 9.104678526671162e-06, "loss": 0.3086, "step": 206 }, { "epoch": 0.829105473965287, "grad_norm": 0.6115940443338315, "learning_rate": 9.091286144251077e-06, "loss": 0.2893, "step": 207 }, { "epoch": 0.8331108144192256, "grad_norm": 0.4939307262823331, "learning_rate": 9.077804344796302e-06, "loss": 0.2758, "step": 208 }, { "epoch": 0.8371161548731643, "grad_norm": 0.5523854629181829, "learning_rate": 9.064233422958078e-06, "loss": 0.2761, "step": 209 }, { "epoch": 0.8411214953271028, "grad_norm": 0.5641559704847691, "learning_rate": 9.050573675335453e-06, "loss": 0.2702, "step": 210 }, { "epoch": 0.8451268357810414, "grad_norm": 0.538525400332704, "learning_rate": 9.036825400468814e-06, "loss": 0.2625, "step": 211 }, { "epoch": 0.8491321762349799, "grad_norm": 0.5571857402527918, "learning_rate": 9.022988898833342e-06, "loss": 0.2812, "step": 212 }, { "epoch": 0.8531375166889186, "grad_norm": 0.5875899405650873, "learning_rate": 9.009064472832468e-06, "loss": 0.3085, "step": 213 }, { "epoch": 0.8571428571428571, "grad_norm": 0.5955430126526817, "learning_rate": 8.995052426791247e-06, "loss": 0.2921, "step": 214 }, { "epoch": 0.8611481975967957, "grad_norm": 0.5826373250067038, "learning_rate": 8.980953066949708e-06, "loss": 0.2912, "step": 215 }, { "epoch": 0.8651535380507344, "grad_norm": 0.5537935549036811, "learning_rate": 8.966766701456177e-06, "loss": 0.2809, "step": 216 }, { "epoch": 0.8691588785046729, "grad_norm": 0.5838980864534432, "learning_rate": 8.952493640360518e-06, "loss": 0.2909, "step": 217 }, { "epoch": 0.8731642189586115, "grad_norm": 0.5875405818886061, "learning_rate": 8.938134195607378e-06, "loss": 0.2952, "step": 218 }, { "epoch": 0.87716955941255, "grad_norm": 0.6226962389915714, "learning_rate": 8.923688681029356e-06, "loss": 0.325, "step": 219 }, { "epoch": 0.8811748998664887, "grad_norm": 0.5984817454130974, "learning_rate": 8.90915741234015e-06, "loss": 0.3124, "step": 220 }, { "epoch": 0.8851802403204272, "grad_norm": 0.5487459145803628, "learning_rate": 8.894540707127655e-06, "loss": 0.2926, "step": 221 }, { "epoch": 0.8891855807743658, "grad_norm": 0.5437039065539668, "learning_rate": 8.879838884847025e-06, "loss": 0.2769, "step": 222 }, { "epoch": 0.8931909212283045, "grad_norm": 0.5361919135525014, "learning_rate": 8.865052266813686e-06, "loss": 0.2565, "step": 223 }, { "epoch": 0.897196261682243, "grad_norm": 0.6022317598018883, "learning_rate": 8.850181176196316e-06, "loss": 0.2904, "step": 224 }, { "epoch": 0.9012016021361816, "grad_norm": 0.5777383647207497, "learning_rate": 8.835225938009781e-06, "loss": 0.2942, "step": 225 }, { "epoch": 0.9052069425901201, "grad_norm": 0.5517455644071223, "learning_rate": 8.820186879108038e-06, "loss": 0.2827, "step": 226 }, { "epoch": 0.9092122830440588, "grad_norm": 0.5746060945697256, "learning_rate": 8.80506432817698e-06, "loss": 0.2901, "step": 227 }, { "epoch": 0.9132176234979973, "grad_norm": 0.5678185790220811, "learning_rate": 8.789858615727266e-06, "loss": 0.277, "step": 228 }, { "epoch": 0.9172229639519359, "grad_norm": 0.5818515518798564, "learning_rate": 8.77457007408708e-06, "loss": 0.2805, "step": 229 }, { "epoch": 0.9212283044058746, "grad_norm": 0.5828788812029032, "learning_rate": 8.759199037394888e-06, "loss": 0.3054, "step": 230 }, { "epoch": 0.9252336448598131, "grad_norm": 0.5578305381732657, "learning_rate": 8.743745841592118e-06, "loss": 0.279, "step": 231 }, { "epoch": 0.9292389853137517, "grad_norm": 0.5710661098406483, "learning_rate": 8.728210824415829e-06, "loss": 0.2734, "step": 232 }, { "epoch": 0.9332443257676902, "grad_norm": 0.5767939333864601, "learning_rate": 8.712594325391324e-06, "loss": 0.2699, "step": 233 }, { "epoch": 0.9372496662216289, "grad_norm": 0.574270861342953, "learning_rate": 8.69689668582473e-06, "loss": 0.2766, "step": 234 }, { "epoch": 0.9412550066755674, "grad_norm": 0.5757498082823792, "learning_rate": 8.681118248795548e-06, "loss": 0.2818, "step": 235 }, { "epoch": 0.945260347129506, "grad_norm": 0.6077724733956369, "learning_rate": 8.665259359149132e-06, "loss": 0.2969, "step": 236 }, { "epoch": 0.9492656875834445, "grad_norm": 0.549944516647202, "learning_rate": 8.649320363489178e-06, "loss": 0.2609, "step": 237 }, { "epoch": 0.9532710280373832, "grad_norm": 0.5456975844935816, "learning_rate": 8.633301610170136e-06, "loss": 0.287, "step": 238 }, { "epoch": 0.9572763684913218, "grad_norm": 0.5280058829694398, "learning_rate": 8.617203449289593e-06, "loss": 0.2644, "step": 239 }, { "epoch": 0.9612817089452603, "grad_norm": 0.5920277658059444, "learning_rate": 8.601026232680634e-06, "loss": 0.291, "step": 240 }, { "epoch": 0.965287049399199, "grad_norm": 0.6029720289705192, "learning_rate": 8.584770313904138e-06, "loss": 0.2883, "step": 241 }, { "epoch": 0.9692923898531375, "grad_norm": 0.5726602829217651, "learning_rate": 8.568436048241062e-06, "loss": 0.265, "step": 242 }, { "epoch": 0.9732977303070761, "grad_norm": 0.5825245425360942, "learning_rate": 8.552023792684672e-06, "loss": 0.2868, "step": 243 }, { "epoch": 0.9773030707610146, "grad_norm": 0.6673081291871541, "learning_rate": 8.535533905932739e-06, "loss": 0.3054, "step": 244 }, { "epoch": 0.9813084112149533, "grad_norm": 0.6031779392976561, "learning_rate": 8.518966748379702e-06, "loss": 0.2851, "step": 245 }, { "epoch": 0.9853137516688919, "grad_norm": 0.6325037575700174, "learning_rate": 8.502322682108792e-06, "loss": 0.269, "step": 246 }, { "epoch": 0.9893190921228304, "grad_norm": 0.630286272644598, "learning_rate": 8.485602070884118e-06, "loss": 0.2835, "step": 247 }, { "epoch": 0.9933244325767691, "grad_norm": 0.5550792941097276, "learning_rate": 8.46880528014271e-06, "loss": 0.2716, "step": 248 }, { "epoch": 0.9973297730307076, "grad_norm": 0.5955656163999381, "learning_rate": 8.451932676986543e-06, "loss": 0.2919, "step": 249 }, { "epoch": 1.0, "grad_norm": 0.5955656163999381, "learning_rate": 8.43498463017451e-06, "loss": 0.2708, "step": 250 }, { "epoch": 1.0040053404539386, "grad_norm": 0.762897141846903, "learning_rate": 8.417961510114357e-06, "loss": 0.2589, "step": 251 }, { "epoch": 1.0080106809078773, "grad_norm": 0.5667256244239827, "learning_rate": 8.400863688854598e-06, "loss": 0.2347, "step": 252 }, { "epoch": 1.0120160213618157, "grad_norm": 0.5182073558824543, "learning_rate": 8.383691540076372e-06, "loss": 0.2473, "step": 253 }, { "epoch": 1.0160213618157543, "grad_norm": 0.5411427558297038, "learning_rate": 8.366445439085286e-06, "loss": 0.239, "step": 254 }, { "epoch": 1.020026702269693, "grad_norm": 0.5411386326348857, "learning_rate": 8.349125762803204e-06, "loss": 0.255, "step": 255 }, { "epoch": 1.0240320427236316, "grad_norm": 0.5768680583207191, "learning_rate": 8.331732889760021e-06, "loss": 0.2304, "step": 256 }, { "epoch": 1.02803738317757, "grad_norm": 0.5663192339105902, "learning_rate": 8.314267200085373e-06, "loss": 0.2364, "step": 257 }, { "epoch": 1.0320427236315086, "grad_norm": 0.5782681146396127, "learning_rate": 8.296729075500345e-06, "loss": 0.2554, "step": 258 }, { "epoch": 1.0360480640854473, "grad_norm": 0.5924117511990626, "learning_rate": 8.279118899309121e-06, "loss": 0.2381, "step": 259 }, { "epoch": 1.0400534045393859, "grad_norm": 0.5849704627036197, "learning_rate": 8.261437056390607e-06, "loss": 0.2592, "step": 260 }, { "epoch": 1.0440587449933245, "grad_norm": 0.5462578431369289, "learning_rate": 8.243683933190019e-06, "loss": 0.2481, "step": 261 }, { "epoch": 1.048064085447263, "grad_norm": 0.5687469203522241, "learning_rate": 8.22585991771044e-06, "loss": 0.2406, "step": 262 }, { "epoch": 1.0520694259012016, "grad_norm": 0.5764520043363477, "learning_rate": 8.207965399504334e-06, "loss": 0.2435, "step": 263 }, { "epoch": 1.0560747663551402, "grad_norm": 0.6130154606997985, "learning_rate": 8.190000769665044e-06, "loss": 0.2494, "step": 264 }, { "epoch": 1.0600801068090788, "grad_norm": 0.5421995984684055, "learning_rate": 8.171966420818227e-06, "loss": 0.2435, "step": 265 }, { "epoch": 1.0640854472630175, "grad_norm": 0.5828640036968468, "learning_rate": 8.153862747113293e-06, "loss": 0.2353, "step": 266 }, { "epoch": 1.0680907877169559, "grad_norm": 0.5148839059504708, "learning_rate": 8.135690144214767e-06, "loss": 0.2318, "step": 267 }, { "epoch": 1.0720961281708945, "grad_norm": 0.5486187246706559, "learning_rate": 8.117449009293668e-06, "loss": 0.2416, "step": 268 }, { "epoch": 1.0761014686248331, "grad_norm": 0.564502169912709, "learning_rate": 8.099139741018809e-06, "loss": 0.2364, "step": 269 }, { "epoch": 1.0801068090787718, "grad_norm": 0.6097314278041118, "learning_rate": 8.08076273954809e-06, "loss": 0.2598, "step": 270 }, { "epoch": 1.0841121495327102, "grad_norm": 0.6059107746858474, "learning_rate": 8.062318406519751e-06, "loss": 0.2507, "step": 271 }, { "epoch": 1.0881174899866488, "grad_norm": 0.6241821796107588, "learning_rate": 8.043807145043604e-06, "loss": 0.2479, "step": 272 }, { "epoch": 1.0921228304405874, "grad_norm": 0.5883002088770041, "learning_rate": 8.025229359692206e-06, "loss": 0.2504, "step": 273 }, { "epoch": 1.096128170894526, "grad_norm": 0.5888253144437603, "learning_rate": 8.00658545649203e-06, "loss": 0.2346, "step": 274 }, { "epoch": 1.1001335113484647, "grad_norm": 0.5409284658955128, "learning_rate": 7.987875842914583e-06, "loss": 0.2357, "step": 275 }, { "epoch": 1.1041388518024031, "grad_norm": 0.5648850017659398, "learning_rate": 7.969100927867508e-06, "loss": 0.2479, "step": 276 }, { "epoch": 1.1081441922563418, "grad_norm": 0.6139375755294754, "learning_rate": 7.950261121685642e-06, "loss": 0.2452, "step": 277 }, { "epoch": 1.1121495327102804, "grad_norm": 0.6246425570636841, "learning_rate": 7.931356836122046e-06, "loss": 0.2404, "step": 278 }, { "epoch": 1.116154873164219, "grad_norm": 0.5298624506016548, "learning_rate": 7.912388484339012e-06, "loss": 0.2318, "step": 279 }, { "epoch": 1.1201602136181577, "grad_norm": 0.5727259445791012, "learning_rate": 7.89335648089903e-06, "loss": 0.2444, "step": 280 }, { "epoch": 1.124165554072096, "grad_norm": 0.568496396477039, "learning_rate": 7.874261241755726e-06, "loss": 0.2361, "step": 281 }, { "epoch": 1.1281708945260347, "grad_norm": 0.5698858845026502, "learning_rate": 7.855103184244777e-06, "loss": 0.2475, "step": 282 }, { "epoch": 1.1321762349799733, "grad_norm": 0.6033437616235542, "learning_rate": 7.835882727074779e-06, "loss": 0.2483, "step": 283 }, { "epoch": 1.136181575433912, "grad_norm": 0.6137682287341324, "learning_rate": 7.81660029031811e-06, "loss": 0.2485, "step": 284 }, { "epoch": 1.1401869158878504, "grad_norm": 0.5389251730544439, "learning_rate": 7.797256295401738e-06, "loss": 0.2287, "step": 285 }, { "epoch": 1.144192256341789, "grad_norm": 0.582366745214894, "learning_rate": 7.777851165098012e-06, "loss": 0.247, "step": 286 }, { "epoch": 1.1481975967957276, "grad_norm": 0.5617439076162762, "learning_rate": 7.75838532351543e-06, "loss": 0.223, "step": 287 }, { "epoch": 1.1522029372496663, "grad_norm": 0.5457463297035726, "learning_rate": 7.738859196089358e-06, "loss": 0.2481, "step": 288 }, { "epoch": 1.156208277703605, "grad_norm": 0.6026062150338968, "learning_rate": 7.719273209572745e-06, "loss": 0.2602, "step": 289 }, { "epoch": 1.1602136181575433, "grad_norm": 0.595754963300469, "learning_rate": 7.699627792026784e-06, "loss": 0.2388, "step": 290 }, { "epoch": 1.164218958611482, "grad_norm": 0.5245236864467587, "learning_rate": 7.679923372811564e-06, "loss": 0.2353, "step": 291 }, { "epoch": 1.1682242990654206, "grad_norm": 0.6296844005130243, "learning_rate": 7.660160382576683e-06, "loss": 0.2342, "step": 292 }, { "epoch": 1.1722296395193592, "grad_norm": 0.5981183888141479, "learning_rate": 7.64033925325184e-06, "loss": 0.2416, "step": 293 }, { "epoch": 1.1762349799732976, "grad_norm": 0.584776287003421, "learning_rate": 7.620460418037388e-06, "loss": 0.228, "step": 294 }, { "epoch": 1.1802403204272363, "grad_norm": 0.5906417247227626, "learning_rate": 7.600524311394873e-06, "loss": 0.2323, "step": 295 }, { "epoch": 1.1842456608811749, "grad_norm": 0.5834730538207583, "learning_rate": 7.580531369037534e-06, "loss": 0.2428, "step": 296 }, { "epoch": 1.1882510013351135, "grad_norm": 0.592770510365303, "learning_rate": 7.5604820279207816e-06, "loss": 0.2311, "step": 297 }, { "epoch": 1.1922563417890522, "grad_norm": 0.5932974539859142, "learning_rate": 7.540376726232648e-06, "loss": 0.2456, "step": 298 }, { "epoch": 1.1962616822429906, "grad_norm": 0.599679016601175, "learning_rate": 7.520215903384215e-06, "loss": 0.2319, "step": 299 }, { "epoch": 1.2002670226969292, "grad_norm": 0.543968089573662, "learning_rate": 7.500000000000001e-06, "loss": 0.2451, "step": 300 }, { "epoch": 1.2042723631508678, "grad_norm": 0.5983969022372734, "learning_rate": 7.4797294579083405e-06, "loss": 0.2491, "step": 301 }, { "epoch": 1.2082777036048065, "grad_norm": 0.5538744153921799, "learning_rate": 7.459404720131717e-06, "loss": 0.233, "step": 302 }, { "epoch": 1.2122830440587449, "grad_norm": 0.5900495952853351, "learning_rate": 7.439026230877096e-06, "loss": 0.2297, "step": 303 }, { "epoch": 1.2162883845126835, "grad_norm": 0.5465667743441658, "learning_rate": 7.4185944355261996e-06, "loss": 0.2528, "step": 304 }, { "epoch": 1.2202937249666221, "grad_norm": 0.6269598978844211, "learning_rate": 7.398109780625784e-06, "loss": 0.2501, "step": 305 }, { "epoch": 1.2242990654205608, "grad_norm": 0.5806451562215877, "learning_rate": 7.3775727138778776e-06, "loss": 0.2391, "step": 306 }, { "epoch": 1.2283044058744994, "grad_norm": 0.6320385790504774, "learning_rate": 7.3569836841299905e-06, "loss": 0.2464, "step": 307 }, { "epoch": 1.232309746328438, "grad_norm": 0.5737559605551135, "learning_rate": 7.336343141365311e-06, "loss": 0.2441, "step": 308 }, { "epoch": 1.2363150867823764, "grad_norm": 0.5952972084149591, "learning_rate": 7.315651536692873e-06, "loss": 0.2564, "step": 309 }, { "epoch": 1.240320427236315, "grad_norm": 0.7146063936502873, "learning_rate": 7.294909322337689e-06, "loss": 0.2313, "step": 310 }, { "epoch": 1.2443257676902537, "grad_norm": 0.595320191460265, "learning_rate": 7.274116951630873e-06, "loss": 0.2368, "step": 311 }, { "epoch": 1.2483311081441923, "grad_norm": 0.5744358155705048, "learning_rate": 7.253274878999728e-06, "loss": 0.2282, "step": 312 }, { "epoch": 1.2523364485981308, "grad_norm": 0.6298803013176558, "learning_rate": 7.232383559957815e-06, "loss": 0.2418, "step": 313 }, { "epoch": 1.2563417890520694, "grad_norm": 0.6145313830867569, "learning_rate": 7.211443451095007e-06, "loss": 0.2365, "step": 314 }, { "epoch": 1.260347129506008, "grad_norm": 0.6285532186169481, "learning_rate": 7.190455010067494e-06, "loss": 0.2347, "step": 315 }, { "epoch": 1.2643524699599467, "grad_norm": 0.590793052150211, "learning_rate": 7.169418695587791e-06, "loss": 0.2303, "step": 316 }, { "epoch": 1.2683578104138853, "grad_norm": 0.5713241346393119, "learning_rate": 7.1483349674147125e-06, "loss": 0.2242, "step": 317 }, { "epoch": 1.2723631508678237, "grad_norm": 0.5490796477452554, "learning_rate": 7.127204286343321e-06, "loss": 0.2338, "step": 318 }, { "epoch": 1.2763684913217623, "grad_norm": 0.6071154182227954, "learning_rate": 7.106027114194856e-06, "loss": 0.225, "step": 319 }, { "epoch": 1.280373831775701, "grad_norm": 0.5963758796133684, "learning_rate": 7.084803913806642e-06, "loss": 0.2369, "step": 320 }, { "epoch": 1.2843791722296396, "grad_norm": 0.6339033842544861, "learning_rate": 7.063535149021974e-06, "loss": 0.2441, "step": 321 }, { "epoch": 1.288384512683578, "grad_norm": 0.6284003653179433, "learning_rate": 7.042221284679982e-06, "loss": 0.2402, "step": 322 }, { "epoch": 1.2923898531375166, "grad_norm": 0.6593560684745596, "learning_rate": 7.02086278660546e-06, "loss": 0.2535, "step": 323 }, { "epoch": 1.2963951935914553, "grad_norm": 0.6387070843334016, "learning_rate": 6.999460121598704e-06, "loss": 0.2297, "step": 324 }, { "epoch": 1.300400534045394, "grad_norm": 0.5750425275519615, "learning_rate": 6.978013757425295e-06, "loss": 0.2355, "step": 325 }, { "epoch": 1.3044058744993325, "grad_norm": 0.5586774593218413, "learning_rate": 6.956524162805875e-06, "loss": 0.2384, "step": 326 }, { "epoch": 1.308411214953271, "grad_norm": 0.5596782830604753, "learning_rate": 6.934991807405919e-06, "loss": 0.2305, "step": 327 }, { "epoch": 1.3124165554072096, "grad_norm": 0.5665505846964202, "learning_rate": 6.913417161825449e-06, "loss": 0.2239, "step": 328 }, { "epoch": 1.3164218958611482, "grad_norm": 0.5958541676468069, "learning_rate": 6.8918006975887685e-06, "loss": 0.2441, "step": 329 }, { "epoch": 1.3204272363150868, "grad_norm": 0.5843413853980698, "learning_rate": 6.870142887134141e-06, "loss": 0.2221, "step": 330 }, { "epoch": 1.3244325767690253, "grad_norm": 0.5533441366477334, "learning_rate": 6.848444203803476e-06, "loss": 0.224, "step": 331 }, { "epoch": 1.328437917222964, "grad_norm": 0.5944276005227449, "learning_rate": 6.8267051218319766e-06, "loss": 0.2333, "step": 332 }, { "epoch": 1.3324432576769025, "grad_norm": 0.5816039732327815, "learning_rate": 6.804926116337779e-06, "loss": 0.2332, "step": 333 }, { "epoch": 1.3364485981308412, "grad_norm": 0.5997442007990729, "learning_rate": 6.783107663311566e-06, "loss": 0.2288, "step": 334 }, { "epoch": 1.3404539385847798, "grad_norm": 0.5758289065119726, "learning_rate": 6.7612502396061685e-06, "loss": 0.238, "step": 335 }, { "epoch": 1.3444592790387184, "grad_norm": 0.578849426349599, "learning_rate": 6.739354322926136e-06, "loss": 0.2382, "step": 336 }, { "epoch": 1.3484646194926568, "grad_norm": 0.571090431459051, "learning_rate": 6.717420391817306e-06, "loss": 0.2663, "step": 337 }, { "epoch": 1.3524699599465955, "grad_norm": 0.6196058792645048, "learning_rate": 6.6954489256563334e-06, "loss": 0.2274, "step": 338 }, { "epoch": 1.356475300400534, "grad_norm": 0.5922325724177396, "learning_rate": 6.6734404046402256e-06, "loss": 0.2199, "step": 339 }, { "epoch": 1.3604806408544725, "grad_norm": 0.5523613961098914, "learning_rate": 6.651395309775837e-06, "loss": 0.2352, "step": 340 }, { "epoch": 1.3644859813084111, "grad_norm": 0.6615232115067652, "learning_rate": 6.629314122869363e-06, "loss": 0.2259, "step": 341 }, { "epoch": 1.3684913217623498, "grad_norm": 0.5749887582077661, "learning_rate": 6.607197326515808e-06, "loss": 0.2515, "step": 342 }, { "epoch": 1.3724966622162884, "grad_norm": 0.6229806856360468, "learning_rate": 6.585045404088442e-06, "loss": 0.2446, "step": 343 }, { "epoch": 1.376502002670227, "grad_norm": 0.6535943167246338, "learning_rate": 6.562858839728224e-06, "loss": 0.233, "step": 344 }, { "epoch": 1.3805073431241657, "grad_norm": 0.564932235491322, "learning_rate": 6.540638118333235e-06, "loss": 0.2377, "step": 345 }, { "epoch": 1.384512683578104, "grad_norm": 0.5864382063135621, "learning_rate": 6.518383725548074e-06, "loss": 0.2351, "step": 346 }, { "epoch": 1.3885180240320427, "grad_norm": 0.5719846231002432, "learning_rate": 6.4960961477532444e-06, "loss": 0.2213, "step": 347 }, { "epoch": 1.3925233644859814, "grad_norm": 0.5996971644613003, "learning_rate": 6.473775872054522e-06, "loss": 0.2315, "step": 348 }, { "epoch": 1.3965287049399198, "grad_norm": 0.6164036127115975, "learning_rate": 6.451423386272312e-06, "loss": 0.233, "step": 349 }, { "epoch": 1.4005340453938584, "grad_norm": 0.6375735882940162, "learning_rate": 6.429039178930989e-06, "loss": 0.2303, "step": 350 }, { "epoch": 1.404539385847797, "grad_norm": 0.6098688935758428, "learning_rate": 6.406623739248214e-06, "loss": 0.2337, "step": 351 }, { "epoch": 1.4085447263017357, "grad_norm": 0.5853193211453952, "learning_rate": 6.384177557124247e-06, "loss": 0.2317, "step": 352 }, { "epoch": 1.4125500667556743, "grad_norm": 0.5660416243135848, "learning_rate": 6.361701123131242e-06, "loss": 0.2399, "step": 353 }, { "epoch": 1.416555407209613, "grad_norm": 0.6522270889233022, "learning_rate": 6.339194928502516e-06, "loss": 0.2438, "step": 354 }, { "epoch": 1.4205607476635513, "grad_norm": 0.6108617575895426, "learning_rate": 6.3166594651218235e-06, "loss": 0.2273, "step": 355 }, { "epoch": 1.42456608811749, "grad_norm": 0.5025445202572053, "learning_rate": 6.294095225512604e-06, "loss": 0.2134, "step": 356 }, { "epoch": 1.4285714285714286, "grad_norm": 0.5765280021465139, "learning_rate": 6.271502702827209e-06, "loss": 0.2249, "step": 357 }, { "epoch": 1.4325767690253672, "grad_norm": 0.5862375523852222, "learning_rate": 6.248882390836135e-06, "loss": 0.2326, "step": 358 }, { "epoch": 1.4365821094793056, "grad_norm": 0.6482330755264025, "learning_rate": 6.226234783917224e-06, "loss": 0.2264, "step": 359 }, { "epoch": 1.4405874499332443, "grad_norm": 0.5899710814731541, "learning_rate": 6.2035603770448664e-06, "loss": 0.2261, "step": 360 }, { "epoch": 1.444592790387183, "grad_norm": 0.5866295112067526, "learning_rate": 6.180859665779173e-06, "loss": 0.2523, "step": 361 }, { "epoch": 1.4485981308411215, "grad_norm": 0.6837319847065889, "learning_rate": 6.158133146255153e-06, "loss": 0.2423, "step": 362 }, { "epoch": 1.4526034712950602, "grad_norm": 0.6307432382267119, "learning_rate": 6.135381315171867e-06, "loss": 0.2425, "step": 363 }, { "epoch": 1.4566088117489986, "grad_norm": 0.5774555129513689, "learning_rate": 6.112604669781572e-06, "loss": 0.244, "step": 364 }, { "epoch": 1.4606141522029372, "grad_norm": 0.5910862302024886, "learning_rate": 6.089803707878855e-06, "loss": 0.2466, "step": 365 }, { "epoch": 1.4646194926568759, "grad_norm": 0.5816261166399118, "learning_rate": 6.066978927789751e-06, "loss": 0.2166, "step": 366 }, { "epoch": 1.4686248331108145, "grad_norm": 0.5583031010374657, "learning_rate": 6.04413082836085e-06, "loss": 0.2274, "step": 367 }, { "epoch": 1.472630173564753, "grad_norm": 0.6020357565888714, "learning_rate": 6.0212599089484026e-06, "loss": 0.2423, "step": 368 }, { "epoch": 1.4766355140186915, "grad_norm": 0.6086024579173414, "learning_rate": 5.998366669407398e-06, "loss": 0.2347, "step": 369 }, { "epoch": 1.4806408544726302, "grad_norm": 0.6639055801702823, "learning_rate": 5.975451610080643e-06, "loss": 0.2257, "step": 370 }, { "epoch": 1.4846461949265688, "grad_norm": 0.5765943328550973, "learning_rate": 5.952515231787825e-06, "loss": 0.2299, "step": 371 }, { "epoch": 1.4886515353805074, "grad_norm": 0.6523998837222308, "learning_rate": 5.929558035814574e-06, "loss": 0.232, "step": 372 }, { "epoch": 1.492656875834446, "grad_norm": 0.6525256233306673, "learning_rate": 5.906580523901493e-06, "loss": 0.2249, "step": 373 }, { "epoch": 1.4966622162883845, "grad_norm": 0.6002500523113792, "learning_rate": 5.883583198233202e-06, "loss": 0.2296, "step": 374 }, { "epoch": 1.500667556742323, "grad_norm": 0.5859819020795045, "learning_rate": 5.86056656142736e-06, "loss": 0.245, "step": 375 }, { "epoch": 1.5046728971962615, "grad_norm": 0.6224565671534654, "learning_rate": 5.837531116523683e-06, "loss": 0.2144, "step": 376 }, { "epoch": 1.5086782376502001, "grad_norm": 0.5645624857480281, "learning_rate": 5.814477366972945e-06, "loss": 0.2379, "step": 377 }, { "epoch": 1.5126835781041388, "grad_norm": 0.6412786483722962, "learning_rate": 5.791405816625974e-06, "loss": 0.2307, "step": 378 }, { "epoch": 1.5166889185580774, "grad_norm": 0.6304941089005965, "learning_rate": 5.768316969722651e-06, "loss": 0.2225, "step": 379 }, { "epoch": 1.520694259012016, "grad_norm": 0.5988757758307962, "learning_rate": 5.745211330880872e-06, "loss": 0.2258, "step": 380 }, { "epoch": 1.5246995994659547, "grad_norm": 0.5815105109581388, "learning_rate": 5.722089405085537e-06, "loss": 0.2414, "step": 381 }, { "epoch": 1.5287049399198933, "grad_norm": 0.6249599659350047, "learning_rate": 5.698951697677498e-06, "loss": 0.2174, "step": 382 }, { "epoch": 1.5327102803738317, "grad_norm": 0.6137726726584709, "learning_rate": 5.6757987143425276e-06, "loss": 0.2236, "step": 383 }, { "epoch": 1.5367156208277704, "grad_norm": 0.5347431576806394, "learning_rate": 5.65263096110026e-06, "loss": 0.2269, "step": 384 }, { "epoch": 1.540720961281709, "grad_norm": 0.5727634021907735, "learning_rate": 5.629448944293128e-06, "loss": 0.2005, "step": 385 }, { "epoch": 1.5447263017356474, "grad_norm": 0.5965865418656213, "learning_rate": 5.6062531705753075e-06, "loss": 0.2424, "step": 386 }, { "epoch": 1.548731642189586, "grad_norm": 0.6169279262529003, "learning_rate": 5.583044146901638e-06, "loss": 0.2232, "step": 387 }, { "epoch": 1.5527369826435247, "grad_norm": 0.6333814417689827, "learning_rate": 5.559822380516539e-06, "loss": 0.2353, "step": 388 }, { "epoch": 1.5567423230974633, "grad_norm": 0.5578859188906384, "learning_rate": 5.536588378942933e-06, "loss": 0.2279, "step": 389 }, { "epoch": 1.560747663551402, "grad_norm": 0.5725028706674725, "learning_rate": 5.513342649971143e-06, "loss": 0.2166, "step": 390 }, { "epoch": 1.5647530040053406, "grad_norm": 0.5718602933152237, "learning_rate": 5.490085701647805e-06, "loss": 0.2248, "step": 391 }, { "epoch": 1.5687583444592792, "grad_norm": 0.5624960499027032, "learning_rate": 5.466818042264754e-06, "loss": 0.2218, "step": 392 }, { "epoch": 1.5727636849132176, "grad_norm": 0.5783959440608054, "learning_rate": 5.443540180347927e-06, "loss": 0.2271, "step": 393 }, { "epoch": 1.5767690253671562, "grad_norm": 0.6463973620860086, "learning_rate": 5.420252624646238e-06, "loss": 0.2297, "step": 394 }, { "epoch": 1.5807743658210947, "grad_norm": 0.5796227960968064, "learning_rate": 5.396955884120465e-06, "loss": 0.2249, "step": 395 }, { "epoch": 1.5847797062750333, "grad_norm": 0.6378570148256715, "learning_rate": 5.373650467932122e-06, "loss": 0.1968, "step": 396 }, { "epoch": 1.588785046728972, "grad_norm": 0.537428645891956, "learning_rate": 5.350336885432337e-06, "loss": 0.2161, "step": 397 }, { "epoch": 1.5927903871829105, "grad_norm": 0.6322802889879201, "learning_rate": 5.327015646150716e-06, "loss": 0.2342, "step": 398 }, { "epoch": 1.5967957276368492, "grad_norm": 0.6773017918561144, "learning_rate": 5.303687259784206e-06, "loss": 0.2234, "step": 399 }, { "epoch": 1.6008010680907878, "grad_norm": 0.5862814494934032, "learning_rate": 5.2803522361859596e-06, "loss": 0.2101, "step": 400 }, { "epoch": 1.6048064085447264, "grad_norm": 0.5824962855214252, "learning_rate": 5.257011085354187e-06, "loss": 0.2432, "step": 401 }, { "epoch": 1.6088117489986649, "grad_norm": 0.6031958185300317, "learning_rate": 5.233664317421012e-06, "loss": 0.232, "step": 402 }, { "epoch": 1.6128170894526035, "grad_norm": 0.5862841084106785, "learning_rate": 5.210312442641327e-06, "loss": 0.2216, "step": 403 }, { "epoch": 1.616822429906542, "grad_norm": 0.6566275421066564, "learning_rate": 5.18695597138163e-06, "loss": 0.2285, "step": 404 }, { "epoch": 1.6208277703604805, "grad_norm": 0.6361225120156005, "learning_rate": 5.1635954141088815e-06, "loss": 0.2243, "step": 405 }, { "epoch": 1.6248331108144192, "grad_norm": 0.6538742068020424, "learning_rate": 5.140231281379345e-06, "loss": 0.2301, "step": 406 }, { "epoch": 1.6288384512683578, "grad_norm": 0.6553343367565488, "learning_rate": 5.116864083827425e-06, "loss": 0.2437, "step": 407 }, { "epoch": 1.6328437917222964, "grad_norm": 0.5611348158619709, "learning_rate": 5.093494332154511e-06, "loss": 0.2146, "step": 408 }, { "epoch": 1.636849132176235, "grad_norm": 0.549466753270084, "learning_rate": 5.070122537117812e-06, "loss": 0.2496, "step": 409 }, { "epoch": 1.6408544726301737, "grad_norm": 0.636480400336947, "learning_rate": 5.046749209519197e-06, "loss": 0.2032, "step": 410 }, { "epoch": 1.644859813084112, "grad_norm": 0.5984681247401569, "learning_rate": 5.023374860194028e-06, "loss": 0.204, "step": 411 }, { "epoch": 1.6488651535380507, "grad_norm": 0.5956484635411822, "learning_rate": 5e-06, "loss": 0.2227, "step": 412 }, { "epoch": 1.6528704939919892, "grad_norm": 0.5495043593104894, "learning_rate": 4.976625139805974e-06, "loss": 0.2201, "step": 413 }, { "epoch": 1.6568758344459278, "grad_norm": 0.5546136194425144, "learning_rate": 4.953250790480805e-06, "loss": 0.2139, "step": 414 }, { "epoch": 1.6608811748998664, "grad_norm": 0.5761321975039401, "learning_rate": 4.92987746288219e-06, "loss": 0.2164, "step": 415 }, { "epoch": 1.664886515353805, "grad_norm": 0.5842428190534428, "learning_rate": 4.90650566784549e-06, "loss": 0.2367, "step": 416 }, { "epoch": 1.6688918558077437, "grad_norm": 0.6113684961889835, "learning_rate": 4.883135916172576e-06, "loss": 0.2367, "step": 417 }, { "epoch": 1.6728971962616823, "grad_norm": 0.6680898005899061, "learning_rate": 4.859768718620656e-06, "loss": 0.2132, "step": 418 }, { "epoch": 1.676902536715621, "grad_norm": 0.5445125151437461, "learning_rate": 4.83640458589112e-06, "loss": 0.2138, "step": 419 }, { "epoch": 1.6809078771695594, "grad_norm": 0.5559680738599577, "learning_rate": 4.8130440286183725e-06, "loss": 0.2267, "step": 420 }, { "epoch": 1.684913217623498, "grad_norm": 0.6144551876120194, "learning_rate": 4.789687557358676e-06, "loss": 0.2182, "step": 421 }, { "epoch": 1.6889185580774366, "grad_norm": 0.5897924033640597, "learning_rate": 4.7663356825789894e-06, "loss": 0.2122, "step": 422 }, { "epoch": 1.692923898531375, "grad_norm": 0.5598416977353012, "learning_rate": 4.742988914645814e-06, "loss": 0.2216, "step": 423 }, { "epoch": 1.6969292389853137, "grad_norm": 0.5601120553987341, "learning_rate": 4.719647763814041e-06, "loss": 0.2177, "step": 424 }, { "epoch": 1.7009345794392523, "grad_norm": 0.5891529754303583, "learning_rate": 4.696312740215794e-06, "loss": 0.2005, "step": 425 }, { "epoch": 1.704939919893191, "grad_norm": 0.5680866327716454, "learning_rate": 4.672984353849285e-06, "loss": 0.2326, "step": 426 }, { "epoch": 1.7089452603471296, "grad_norm": 0.5657360719382496, "learning_rate": 4.649663114567663e-06, "loss": 0.2131, "step": 427 }, { "epoch": 1.7129506008010682, "grad_norm": 0.5740029198869598, "learning_rate": 4.626349532067879e-06, "loss": 0.2138, "step": 428 }, { "epoch": 1.7169559412550068, "grad_norm": 0.6388774843029362, "learning_rate": 4.603044115879536e-06, "loss": 0.2251, "step": 429 }, { "epoch": 1.7209612817089452, "grad_norm": 0.6877647851263269, "learning_rate": 4.579747375353763e-06, "loss": 0.2212, "step": 430 }, { "epoch": 1.7249666221628839, "grad_norm": 0.6414623377958295, "learning_rate": 4.556459819652074e-06, "loss": 0.2414, "step": 431 }, { "epoch": 1.7289719626168223, "grad_norm": 0.6323839747789576, "learning_rate": 4.533181957735247e-06, "loss": 0.2339, "step": 432 }, { "epoch": 1.732977303070761, "grad_norm": 0.639653221825577, "learning_rate": 4.509914298352197e-06, "loss": 0.2215, "step": 433 }, { "epoch": 1.7369826435246996, "grad_norm": 0.6284764820090778, "learning_rate": 4.486657350028859e-06, "loss": 0.2119, "step": 434 }, { "epoch": 1.7409879839786382, "grad_norm": 0.6067872660667347, "learning_rate": 4.463411621057068e-06, "loss": 0.2177, "step": 435 }, { "epoch": 1.7449933244325768, "grad_norm": 0.6234859050465342, "learning_rate": 4.4401776194834615e-06, "loss": 0.207, "step": 436 }, { "epoch": 1.7489986648865155, "grad_norm": 0.6562941478198795, "learning_rate": 4.4169558530983635e-06, "loss": 0.2043, "step": 437 }, { "epoch": 1.753004005340454, "grad_norm": 0.5909532065510827, "learning_rate": 4.393746829424693e-06, "loss": 0.2124, "step": 438 }, { "epoch": 1.7570093457943925, "grad_norm": 0.6266064150386942, "learning_rate": 4.3705510557068746e-06, "loss": 0.2156, "step": 439 }, { "epoch": 1.7610146862483311, "grad_norm": 0.5569141516837799, "learning_rate": 4.347369038899744e-06, "loss": 0.1997, "step": 440 }, { "epoch": 1.7650200267022695, "grad_norm": 0.6592779504723204, "learning_rate": 4.324201285657474e-06, "loss": 0.215, "step": 441 }, { "epoch": 1.7690253671562082, "grad_norm": 0.6388494836347968, "learning_rate": 4.3010483023225045e-06, "loss": 0.2293, "step": 442 }, { "epoch": 1.7730307076101468, "grad_norm": 0.691528320994546, "learning_rate": 4.277910594914466e-06, "loss": 0.1971, "step": 443 }, { "epoch": 1.7770360480640854, "grad_norm": 0.6052938531570694, "learning_rate": 4.254788669119127e-06, "loss": 0.215, "step": 444 }, { "epoch": 1.781041388518024, "grad_norm": 0.5915787822047736, "learning_rate": 4.231683030277349e-06, "loss": 0.22, "step": 445 }, { "epoch": 1.7850467289719627, "grad_norm": 0.669779074015281, "learning_rate": 4.208594183374026e-06, "loss": 0.2139, "step": 446 }, { "epoch": 1.7890520694259013, "grad_norm": 0.5637355181902335, "learning_rate": 4.185522633027057e-06, "loss": 0.2074, "step": 447 }, { "epoch": 1.7930574098798397, "grad_norm": 0.6354293774014055, "learning_rate": 4.162468883476319e-06, "loss": 0.2029, "step": 448 }, { "epoch": 1.7970627503337784, "grad_norm": 0.5585098291406209, "learning_rate": 4.139433438572641e-06, "loss": 0.2149, "step": 449 }, { "epoch": 1.8010680907877168, "grad_norm": 0.5982104585865691, "learning_rate": 4.116416801766801e-06, "loss": 0.2272, "step": 450 }, { "epoch": 1.8050734312416554, "grad_norm": 0.5958872678061956, "learning_rate": 4.0934194760985095e-06, "loss": 0.2033, "step": 451 }, { "epoch": 1.809078771695594, "grad_norm": 0.6149274909600425, "learning_rate": 4.070441964185428e-06, "loss": 0.2047, "step": 452 }, { "epoch": 1.8130841121495327, "grad_norm": 0.649920066288117, "learning_rate": 4.047484768212175e-06, "loss": 0.2127, "step": 453 }, { "epoch": 1.8170894526034713, "grad_norm": 0.6231664353854522, "learning_rate": 4.02454838991936e-06, "loss": 0.214, "step": 454 }, { "epoch": 1.82109479305741, "grad_norm": 0.6324772858655361, "learning_rate": 4.001633330592604e-06, "loss": 0.2279, "step": 455 }, { "epoch": 1.8251001335113486, "grad_norm": 0.6387099548525226, "learning_rate": 3.978740091051599e-06, "loss": 0.231, "step": 456 }, { "epoch": 1.829105473965287, "grad_norm": 0.5888035825906338, "learning_rate": 3.955869171639151e-06, "loss": 0.2043, "step": 457 }, { "epoch": 1.8331108144192256, "grad_norm": 0.5900537278953912, "learning_rate": 3.933021072210251e-06, "loss": 0.2189, "step": 458 }, { "epoch": 1.8371161548731643, "grad_norm": 0.6155751843123176, "learning_rate": 3.910196292121147e-06, "loss": 0.2172, "step": 459 }, { "epoch": 1.8411214953271027, "grad_norm": 0.6185704225951081, "learning_rate": 3.887395330218429e-06, "loss": 0.2433, "step": 460 }, { "epoch": 1.8451268357810413, "grad_norm": 0.5757793970809637, "learning_rate": 3.864618684828135e-06, "loss": 0.2086, "step": 461 }, { "epoch": 1.84913217623498, "grad_norm": 0.5715122189230345, "learning_rate": 3.84186685374485e-06, "loss": 0.2174, "step": 462 }, { "epoch": 1.8531375166889186, "grad_norm": 0.562934776821966, "learning_rate": 3.81914033422083e-06, "loss": 0.1969, "step": 463 }, { "epoch": 1.8571428571428572, "grad_norm": 0.584422709822903, "learning_rate": 3.7964396229551365e-06, "loss": 0.2264, "step": 464 }, { "epoch": 1.8611481975967958, "grad_norm": 0.5887703309285004, "learning_rate": 3.7737652160827752e-06, "loss": 0.2118, "step": 465 }, { "epoch": 1.8651535380507345, "grad_norm": 0.5978955576130686, "learning_rate": 3.751117609163865e-06, "loss": 0.2162, "step": 466 }, { "epoch": 1.8691588785046729, "grad_norm": 0.6119642861090717, "learning_rate": 3.7284972971727907e-06, "loss": 0.225, "step": 467 }, { "epoch": 1.8731642189586115, "grad_norm": 0.6210379401883211, "learning_rate": 3.705904774487396e-06, "loss": 0.2056, "step": 468 }, { "epoch": 1.87716955941255, "grad_norm": 0.6081726738147702, "learning_rate": 3.683340534878176e-06, "loss": 0.2046, "step": 469 }, { "epoch": 1.8811748998664886, "grad_norm": 0.6467889875621615, "learning_rate": 3.6608050714974854e-06, "loss": 0.2503, "step": 470 }, { "epoch": 1.8851802403204272, "grad_norm": 0.6021455582975629, "learning_rate": 3.63829887686876e-06, "loss": 0.2244, "step": 471 }, { "epoch": 1.8891855807743658, "grad_norm": 0.634859275238841, "learning_rate": 3.6158224428757538e-06, "loss": 0.2208, "step": 472 }, { "epoch": 1.8931909212283045, "grad_norm": 0.6211383819455257, "learning_rate": 3.5933762607517875e-06, "loss": 0.2201, "step": 473 }, { "epoch": 1.897196261682243, "grad_norm": 0.5805284245558593, "learning_rate": 3.5709608210690127e-06, "loss": 0.2171, "step": 474 }, { "epoch": 1.9012016021361817, "grad_norm": 0.5969539880951342, "learning_rate": 3.5485766137276894e-06, "loss": 0.1989, "step": 475 }, { "epoch": 1.9052069425901201, "grad_norm": 0.5763877271910841, "learning_rate": 3.526224127945479e-06, "loss": 0.2051, "step": 476 }, { "epoch": 1.9092122830440588, "grad_norm": 0.573639466833024, "learning_rate": 3.5039038522467572e-06, "loss": 0.2216, "step": 477 }, { "epoch": 1.9132176234979972, "grad_norm": 0.6187640637863968, "learning_rate": 3.4816162744519266e-06, "loss": 0.2207, "step": 478 }, { "epoch": 1.9172229639519358, "grad_norm": 0.6135054921454743, "learning_rate": 3.459361881666766e-06, "loss": 0.216, "step": 479 }, { "epoch": 1.9212283044058744, "grad_norm": 0.5975760329649652, "learning_rate": 3.4371411602717785e-06, "loss": 0.1997, "step": 480 }, { "epoch": 1.925233644859813, "grad_norm": 0.5864424343197752, "learning_rate": 3.4149545959115604e-06, "loss": 0.1997, "step": 481 }, { "epoch": 1.9292389853137517, "grad_norm": 0.6206284942820284, "learning_rate": 3.3928026734841935e-06, "loss": 0.1958, "step": 482 }, { "epoch": 1.9332443257676903, "grad_norm": 0.627723357086264, "learning_rate": 3.3706858771306393e-06, "loss": 0.2099, "step": 483 }, { "epoch": 1.937249666221629, "grad_norm": 0.6391449489733926, "learning_rate": 3.3486046902241663e-06, "loss": 0.1946, "step": 484 }, { "epoch": 1.9412550066755674, "grad_norm": 0.6232074962194952, "learning_rate": 3.3265595953597774e-06, "loss": 0.2167, "step": 485 }, { "epoch": 1.945260347129506, "grad_norm": 0.7178387576084606, "learning_rate": 3.3045510743436665e-06, "loss": 0.2052, "step": 486 }, { "epoch": 1.9492656875834444, "grad_norm": 0.66259069636869, "learning_rate": 3.2825796081826943e-06, "loss": 0.2103, "step": 487 }, { "epoch": 1.953271028037383, "grad_norm": 0.6319630042493534, "learning_rate": 3.2606456770738636e-06, "loss": 0.1899, "step": 488 }, { "epoch": 1.9572763684913217, "grad_norm": 0.6583941141635503, "learning_rate": 3.2387497603938327e-06, "loss": 0.2057, "step": 489 }, { "epoch": 1.9612817089452603, "grad_norm": 0.6583371328399131, "learning_rate": 3.216892336688435e-06, "loss": 0.208, "step": 490 }, { "epoch": 1.965287049399199, "grad_norm": 0.6626656334140933, "learning_rate": 3.1950738836622226e-06, "loss": 0.213, "step": 491 }, { "epoch": 1.9692923898531376, "grad_norm": 0.57636570851929, "learning_rate": 3.173294878168025e-06, "loss": 0.2123, "step": 492 }, { "epoch": 1.9732977303070762, "grad_norm": 0.628166413351637, "learning_rate": 3.1515557961965254e-06, "loss": 0.2082, "step": 493 }, { "epoch": 1.9773030707610146, "grad_norm": 0.6161007175331755, "learning_rate": 3.1298571128658593e-06, "loss": 0.2071, "step": 494 }, { "epoch": 1.9813084112149533, "grad_norm": 0.5578291746094638, "learning_rate": 3.1081993024112328e-06, "loss": 0.1953, "step": 495 }, { "epoch": 1.985313751668892, "grad_norm": 0.5742324867913953, "learning_rate": 3.0865828381745515e-06, "loss": 0.2183, "step": 496 }, { "epoch": 1.9893190921228303, "grad_norm": 0.5651971848567813, "learning_rate": 3.0650081925940834e-06, "loss": 0.1881, "step": 497 }, { "epoch": 1.993324432576769, "grad_norm": 0.5805509174084333, "learning_rate": 3.043475837194126e-06, "loss": 0.2148, "step": 498 }, { "epoch": 1.9973297730307076, "grad_norm": 0.753242305787025, "learning_rate": 3.021986242574707e-06, "loss": 0.2226, "step": 499 }, { "epoch": 2.0, "grad_norm": 0.7484446599170808, "learning_rate": 3.000539878401296e-06, "loss": 0.2156, "step": 500 }, { "epoch": 2.0040053404539386, "grad_norm": 0.7266923236604501, "learning_rate": 2.9791372133945405e-06, "loss": 0.1868, "step": 501 }, { "epoch": 2.0080106809078773, "grad_norm": 0.6838990546286972, "learning_rate": 2.95777871532002e-06, "loss": 0.1693, "step": 502 }, { "epoch": 2.012016021361816, "grad_norm": 0.6522038544426395, "learning_rate": 2.936464850978027e-06, "loss": 0.1648, "step": 503 }, { "epoch": 2.0160213618157545, "grad_norm": 0.6932425119304833, "learning_rate": 2.9151960861933616e-06, "loss": 0.1781, "step": 504 }, { "epoch": 2.0200267022696927, "grad_norm": 0.6187151564266777, "learning_rate": 2.893972885805148e-06, "loss": 0.1814, "step": 505 }, { "epoch": 2.0240320427236314, "grad_norm": 0.5852591736999052, "learning_rate": 2.8727957136566825e-06, "loss": 0.1616, "step": 506 }, { "epoch": 2.02803738317757, "grad_norm": 0.6216187678840704, "learning_rate": 2.8516650325852883e-06, "loss": 0.1776, "step": 507 }, { "epoch": 2.0320427236315086, "grad_norm": 0.6805660651282351, "learning_rate": 2.83058130441221e-06, "loss": 0.1848, "step": 508 }, { "epoch": 2.0360480640854473, "grad_norm": 0.8026841589589839, "learning_rate": 2.809544989932508e-06, "loss": 0.1905, "step": 509 }, { "epoch": 2.040053404539386, "grad_norm": 0.661638883853578, "learning_rate": 2.7885565489049948e-06, "loss": 0.1568, "step": 510 }, { "epoch": 2.0440587449933245, "grad_norm": 0.6472428396855733, "learning_rate": 2.7676164400421864e-06, "loss": 0.1767, "step": 511 }, { "epoch": 2.048064085447263, "grad_norm": 0.6917047834821735, "learning_rate": 2.746725121000273e-06, "loss": 0.1871, "step": 512 }, { "epoch": 2.052069425901202, "grad_norm": 0.6021301697708129, "learning_rate": 2.725883048369128e-06, "loss": 0.1695, "step": 513 }, { "epoch": 2.05607476635514, "grad_norm": 0.6089561158393074, "learning_rate": 2.705090677662311e-06, "loss": 0.1743, "step": 514 }, { "epoch": 2.0600801068090786, "grad_norm": 0.6433217615782736, "learning_rate": 2.684348463307128e-06, "loss": 0.1648, "step": 515 }, { "epoch": 2.0640854472630172, "grad_norm": 0.6675245713273296, "learning_rate": 2.66365685863469e-06, "loss": 0.1695, "step": 516 }, { "epoch": 2.068090787716956, "grad_norm": 0.5639272937571744, "learning_rate": 2.6430163158700116e-06, "loss": 0.1552, "step": 517 }, { "epoch": 2.0720961281708945, "grad_norm": 0.6756211271225167, "learning_rate": 2.6224272861221245e-06, "loss": 0.1871, "step": 518 }, { "epoch": 2.076101468624833, "grad_norm": 0.56951104964083, "learning_rate": 2.601890219374217e-06, "loss": 0.175, "step": 519 }, { "epoch": 2.0801068090787718, "grad_norm": 0.5849605152037826, "learning_rate": 2.5814055644738013e-06, "loss": 0.1577, "step": 520 }, { "epoch": 2.0841121495327104, "grad_norm": 0.6208978605303654, "learning_rate": 2.5609737691229055e-06, "loss": 0.1676, "step": 521 }, { "epoch": 2.088117489986649, "grad_norm": 0.6300535483875338, "learning_rate": 2.5405952798682844e-06, "loss": 0.1767, "step": 522 }, { "epoch": 2.0921228304405872, "grad_norm": 0.6515325771939616, "learning_rate": 2.520270542091663e-06, "loss": 0.1862, "step": 523 }, { "epoch": 2.096128170894526, "grad_norm": 0.6573962419086741, "learning_rate": 2.5000000000000015e-06, "loss": 0.1707, "step": 524 }, { "epoch": 2.1001335113484645, "grad_norm": 0.6008492264289961, "learning_rate": 2.4797840966157877e-06, "loss": 0.1562, "step": 525 }, { "epoch": 2.104138851802403, "grad_norm": 0.5554906783739959, "learning_rate": 2.4596232737673544e-06, "loss": 0.1753, "step": 526 }, { "epoch": 2.1081441922563418, "grad_norm": 0.5883215878515664, "learning_rate": 2.439517972079222e-06, "loss": 0.1739, "step": 527 }, { "epoch": 2.1121495327102804, "grad_norm": 0.6225552692825768, "learning_rate": 2.4194686309624664e-06, "loss": 0.1736, "step": 528 }, { "epoch": 2.116154873164219, "grad_norm": 0.6538525824437825, "learning_rate": 2.3994756886051267e-06, "loss": 0.1793, "step": 529 }, { "epoch": 2.1201602136181577, "grad_norm": 0.6426343380918219, "learning_rate": 2.3795395819626116e-06, "loss": 0.1636, "step": 530 }, { "epoch": 2.1241655540720963, "grad_norm": 0.6984876016877216, "learning_rate": 2.3596607467481602e-06, "loss": 0.1818, "step": 531 }, { "epoch": 2.128170894526035, "grad_norm": 0.5969581027123277, "learning_rate": 2.339839617423318e-06, "loss": 0.1761, "step": 532 }, { "epoch": 2.132176234979973, "grad_norm": 0.5965591945629233, "learning_rate": 2.320076627188438e-06, "loss": 0.1699, "step": 533 }, { "epoch": 2.1361815754339117, "grad_norm": 0.6646391923991977, "learning_rate": 2.300372207973219e-06, "loss": 0.1642, "step": 534 }, { "epoch": 2.1401869158878504, "grad_norm": 0.5996069077675478, "learning_rate": 2.280726790427258e-06, "loss": 0.1721, "step": 535 }, { "epoch": 2.144192256341789, "grad_norm": 0.6741481126361855, "learning_rate": 2.261140803910644e-06, "loss": 0.1727, "step": 536 }, { "epoch": 2.1481975967957276, "grad_norm": 0.6323694946147757, "learning_rate": 2.2416146764845733e-06, "loss": 0.1702, "step": 537 }, { "epoch": 2.1522029372496663, "grad_norm": 0.6862267704077283, "learning_rate": 2.2221488349019903e-06, "loss": 0.1729, "step": 538 }, { "epoch": 2.156208277703605, "grad_norm": 0.6070081128579359, "learning_rate": 2.202743704598263e-06, "loss": 0.1593, "step": 539 }, { "epoch": 2.1602136181575435, "grad_norm": 0.7278096682292641, "learning_rate": 2.1833997096818897e-06, "loss": 0.1836, "step": 540 }, { "epoch": 2.164218958611482, "grad_norm": 0.632259449291054, "learning_rate": 2.1641172729252206e-06, "loss": 0.1711, "step": 541 }, { "epoch": 2.1682242990654204, "grad_norm": 0.6117574742799209, "learning_rate": 2.1448968157552243e-06, "loss": 0.1632, "step": 542 }, { "epoch": 2.172229639519359, "grad_norm": 0.631961523767331, "learning_rate": 2.1257387582442746e-06, "loss": 0.1694, "step": 543 }, { "epoch": 2.1762349799732976, "grad_norm": 0.6156725197172384, "learning_rate": 2.1066435191009717e-06, "loss": 0.1643, "step": 544 }, { "epoch": 2.1802403204272363, "grad_norm": 0.6322664734036844, "learning_rate": 2.08761151566099e-06, "loss": 0.1798, "step": 545 }, { "epoch": 2.184245660881175, "grad_norm": 0.6464718958812395, "learning_rate": 2.0686431638779564e-06, "loss": 0.1731, "step": 546 }, { "epoch": 2.1882510013351135, "grad_norm": 0.6399447883731574, "learning_rate": 2.04973887831436e-06, "loss": 0.1835, "step": 547 }, { "epoch": 2.192256341789052, "grad_norm": 0.5720412483866, "learning_rate": 2.030899072132493e-06, "loss": 0.1574, "step": 548 }, { "epoch": 2.196261682242991, "grad_norm": 0.5860714566791593, "learning_rate": 2.0121241570854165e-06, "loss": 0.182, "step": 549 }, { "epoch": 2.2002670226969294, "grad_norm": 0.6083557073323276, "learning_rate": 1.9934145435079705e-06, "loss": 0.1661, "step": 550 }, { "epoch": 2.204272363150868, "grad_norm": 0.6104569659276108, "learning_rate": 1.9747706403077943e-06, "loss": 0.1979, "step": 551 }, { "epoch": 2.2082777036048062, "grad_norm": 0.62244894576839, "learning_rate": 1.956192854956397e-06, "loss": 0.1774, "step": 552 }, { "epoch": 2.212283044058745, "grad_norm": 0.630950626155233, "learning_rate": 1.9376815934802496e-06, "loss": 0.1758, "step": 553 }, { "epoch": 2.2162883845126835, "grad_norm": 0.6625191361259387, "learning_rate": 1.9192372604519127e-06, "loss": 0.1928, "step": 554 }, { "epoch": 2.220293724966622, "grad_norm": 0.5820926209462741, "learning_rate": 1.9008602589811931e-06, "loss": 0.1565, "step": 555 }, { "epoch": 2.2242990654205608, "grad_norm": 0.5567632552609313, "learning_rate": 1.8825509907063328e-06, "loss": 0.1756, "step": 556 }, { "epoch": 2.2283044058744994, "grad_norm": 0.5980702955281321, "learning_rate": 1.864309855785234e-06, "loss": 0.1594, "step": 557 }, { "epoch": 2.232309746328438, "grad_norm": 0.5820363892907623, "learning_rate": 1.8461372528867095e-06, "loss": 0.1768, "step": 558 }, { "epoch": 2.2363150867823767, "grad_norm": 0.5967644083406489, "learning_rate": 1.8280335791817733e-06, "loss": 0.1689, "step": 559 }, { "epoch": 2.2403204272363153, "grad_norm": 0.6250075972601391, "learning_rate": 1.809999230334958e-06, "loss": 0.1748, "step": 560 }, { "epoch": 2.2443257676902535, "grad_norm": 0.606205688498456, "learning_rate": 1.7920346004956673e-06, "loss": 0.1834, "step": 561 }, { "epoch": 2.248331108144192, "grad_norm": 0.5940182864906453, "learning_rate": 1.7741400822895633e-06, "loss": 0.1691, "step": 562 }, { "epoch": 2.2523364485981308, "grad_norm": 0.5683156268730282, "learning_rate": 1.7563160668099838e-06, "loss": 0.1726, "step": 563 }, { "epoch": 2.2563417890520694, "grad_norm": 0.6031458823381572, "learning_rate": 1.7385629436093958e-06, "loss": 0.1618, "step": 564 }, { "epoch": 2.260347129506008, "grad_norm": 0.6310913458129014, "learning_rate": 1.7208811006908798e-06, "loss": 0.18, "step": 565 }, { "epoch": 2.2643524699599467, "grad_norm": 0.6148520423770238, "learning_rate": 1.7032709244996559e-06, "loss": 0.1699, "step": 566 }, { "epoch": 2.2683578104138853, "grad_norm": 0.6101595073175935, "learning_rate": 1.6857327999146284e-06, "loss": 0.1623, "step": 567 }, { "epoch": 2.272363150867824, "grad_norm": 0.5775767049567889, "learning_rate": 1.6682671102399806e-06, "loss": 0.1611, "step": 568 }, { "epoch": 2.2763684913217626, "grad_norm": 0.6215298766632447, "learning_rate": 1.6508742371967962e-06, "loss": 0.1708, "step": 569 }, { "epoch": 2.2803738317757007, "grad_norm": 0.6064011848210037, "learning_rate": 1.633554560914714e-06, "loss": 0.1793, "step": 570 }, { "epoch": 2.2843791722296394, "grad_norm": 0.5913150625405588, "learning_rate": 1.6163084599236278e-06, "loss": 0.1734, "step": 571 }, { "epoch": 2.288384512683578, "grad_norm": 0.6290184807128071, "learning_rate": 1.5991363111454023e-06, "loss": 0.1643, "step": 572 }, { "epoch": 2.2923898531375166, "grad_norm": 0.6150638457127714, "learning_rate": 1.5820384898856433e-06, "loss": 0.1662, "step": 573 }, { "epoch": 2.2963951935914553, "grad_norm": 0.6496821212159293, "learning_rate": 1.5650153698254916e-06, "loss": 0.1854, "step": 574 }, { "epoch": 2.300400534045394, "grad_norm": 0.6678529895051883, "learning_rate": 1.5480673230134585e-06, "loss": 0.1618, "step": 575 }, { "epoch": 2.3044058744993325, "grad_norm": 0.6202814553309792, "learning_rate": 1.5311947198572918e-06, "loss": 0.1669, "step": 576 }, { "epoch": 2.308411214953271, "grad_norm": 0.5462233451328915, "learning_rate": 1.514397929115884e-06, "loss": 0.1578, "step": 577 }, { "epoch": 2.31241655540721, "grad_norm": 0.6123799648363422, "learning_rate": 1.4976773178912085e-06, "loss": 0.1678, "step": 578 }, { "epoch": 2.316421895861148, "grad_norm": 0.6365385572090578, "learning_rate": 1.481033251620299e-06, "loss": 0.1686, "step": 579 }, { "epoch": 2.3204272363150866, "grad_norm": 0.6063441561728166, "learning_rate": 1.4644660940672628e-06, "loss": 0.1772, "step": 580 }, { "epoch": 2.3244325767690253, "grad_norm": 0.658171365772341, "learning_rate": 1.4479762073153304e-06, "loss": 0.1726, "step": 581 }, { "epoch": 2.328437917222964, "grad_norm": 0.6269724587094964, "learning_rate": 1.4315639517589398e-06, "loss": 0.1662, "step": 582 }, { "epoch": 2.3324432576769025, "grad_norm": 0.6133548152558246, "learning_rate": 1.4152296860958641e-06, "loss": 0.1702, "step": 583 }, { "epoch": 2.336448598130841, "grad_norm": 0.66729626203352, "learning_rate": 1.3989737673193682e-06, "loss": 0.1926, "step": 584 }, { "epoch": 2.34045393858478, "grad_norm": 0.6087596853512452, "learning_rate": 1.382796550710408e-06, "loss": 0.1886, "step": 585 }, { "epoch": 2.3444592790387184, "grad_norm": 0.5915578437600094, "learning_rate": 1.3666983898298659e-06, "loss": 0.1541, "step": 586 }, { "epoch": 2.348464619492657, "grad_norm": 0.6098604276701993, "learning_rate": 1.3506796365108232e-06, "loss": 0.1739, "step": 587 }, { "epoch": 2.3524699599465952, "grad_norm": 0.6102908288565849, "learning_rate": 1.3347406408508695e-06, "loss": 0.1715, "step": 588 }, { "epoch": 2.356475300400534, "grad_norm": 0.6133775478993069, "learning_rate": 1.3188817512044544e-06, "loss": 0.1646, "step": 589 }, { "epoch": 2.3604806408544725, "grad_norm": 0.6293540508319713, "learning_rate": 1.3031033141752702e-06, "loss": 0.1711, "step": 590 }, { "epoch": 2.364485981308411, "grad_norm": 0.7443933199698777, "learning_rate": 1.2874056746086772e-06, "loss": 0.152, "step": 591 }, { "epoch": 2.3684913217623498, "grad_norm": 0.5731726505456445, "learning_rate": 1.2717891755841722e-06, "loss": 0.1509, "step": 592 }, { "epoch": 2.3724966622162884, "grad_norm": 0.5987015572774188, "learning_rate": 1.2562541584078835e-06, "loss": 0.1664, "step": 593 }, { "epoch": 2.376502002670227, "grad_norm": 0.6300995737372567, "learning_rate": 1.2408009626051137e-06, "loss": 0.1744, "step": 594 }, { "epoch": 2.3805073431241657, "grad_norm": 0.6142455245728455, "learning_rate": 1.225429925912921e-06, "loss": 0.1563, "step": 595 }, { "epoch": 2.3845126835781043, "grad_norm": 0.6234917981255259, "learning_rate": 1.2101413842727345e-06, "loss": 0.1648, "step": 596 }, { "epoch": 2.3885180240320425, "grad_norm": 0.5825435971591906, "learning_rate": 1.1949356718230188e-06, "loss": 0.1602, "step": 597 }, { "epoch": 2.392523364485981, "grad_norm": 0.6402288566179174, "learning_rate": 1.1798131208919628e-06, "loss": 0.1851, "step": 598 }, { "epoch": 2.3965287049399198, "grad_norm": 0.6037042606359994, "learning_rate": 1.1647740619902193e-06, "loss": 0.1747, "step": 599 }, { "epoch": 2.4005340453938584, "grad_norm": 0.5948400476478892, "learning_rate": 1.1498188238036862e-06, "loss": 0.1554, "step": 600 }, { "epoch": 2.404539385847797, "grad_norm": 0.6966147020255966, "learning_rate": 1.134947733186315e-06, "loss": 0.1779, "step": 601 }, { "epoch": 2.4085447263017357, "grad_norm": 0.6005191719492351, "learning_rate": 1.1201611151529756e-06, "loss": 0.1607, "step": 602 }, { "epoch": 2.4125500667556743, "grad_norm": 0.5820830342726288, "learning_rate": 1.105459292872345e-06, "loss": 0.165, "step": 603 }, { "epoch": 2.416555407209613, "grad_norm": 0.5655842631268676, "learning_rate": 1.0908425876598512e-06, "loss": 0.1528, "step": 604 }, { "epoch": 2.4205607476635516, "grad_norm": 0.5964382967668805, "learning_rate": 1.0763113189706453e-06, "loss": 0.1694, "step": 605 }, { "epoch": 2.4245660881174897, "grad_norm": 0.6456074354041683, "learning_rate": 1.0618658043926233e-06, "loss": 0.1747, "step": 606 }, { "epoch": 2.4285714285714284, "grad_norm": 0.7700257765536643, "learning_rate": 1.047506359639483e-06, "loss": 0.1804, "step": 607 }, { "epoch": 2.432576769025367, "grad_norm": 0.6024820562838693, "learning_rate": 1.0332332985438248e-06, "loss": 0.1704, "step": 608 }, { "epoch": 2.4365821094793056, "grad_norm": 0.6345644986717863, "learning_rate": 1.0190469330502928e-06, "loss": 0.1782, "step": 609 }, { "epoch": 2.4405874499332443, "grad_norm": 0.5806972189010675, "learning_rate": 1.004947573208756e-06, "loss": 0.1803, "step": 610 }, { "epoch": 2.444592790387183, "grad_norm": 0.6158669549354134, "learning_rate": 9.909355271675335e-07, "loss": 0.1531, "step": 611 }, { "epoch": 2.4485981308411215, "grad_norm": 0.6579929468353193, "learning_rate": 9.770111011666582e-07, "loss": 0.1821, "step": 612 }, { "epoch": 2.45260347129506, "grad_norm": 0.5991038573483655, "learning_rate": 9.631745995311881e-07, "loss": 0.1658, "step": 613 }, { "epoch": 2.456608811748999, "grad_norm": 0.5782202975221776, "learning_rate": 9.494263246645474e-07, "loss": 0.1525, "step": 614 }, { "epoch": 2.4606141522029374, "grad_norm": 0.6234695823453394, "learning_rate": 9.357665770419244e-07, "loss": 0.1676, "step": 615 }, { "epoch": 2.464619492656876, "grad_norm": 0.6614765092596231, "learning_rate": 9.221956552036992e-07, "loss": 0.1809, "step": 616 }, { "epoch": 2.4686248331108143, "grad_norm": 0.5631502943362868, "learning_rate": 9.08713855748925e-07, "loss": 0.1511, "step": 617 }, { "epoch": 2.472630173564753, "grad_norm": 0.6001768275330218, "learning_rate": 8.953214733288384e-07, "loss": 0.1685, "step": 618 }, { "epoch": 2.4766355140186915, "grad_norm": 0.6128594108692941, "learning_rate": 8.820188006404268e-07, "loss": 0.167, "step": 619 }, { "epoch": 2.48064085447263, "grad_norm": 0.6003517878466088, "learning_rate": 8.688061284200266e-07, "loss": 0.16, "step": 620 }, { "epoch": 2.484646194926569, "grad_norm": 0.6125927993221344, "learning_rate": 8.556837454369698e-07, "loss": 0.1548, "step": 621 }, { "epoch": 2.4886515353805074, "grad_norm": 0.620566464502616, "learning_rate": 8.426519384872733e-07, "loss": 0.1753, "step": 622 }, { "epoch": 2.492656875834446, "grad_norm": 0.6137925843567528, "learning_rate": 8.297109923873753e-07, "loss": 0.1552, "step": 623 }, { "epoch": 2.4966622162883847, "grad_norm": 0.5783321494225999, "learning_rate": 8.168611899679013e-07, "loss": 0.1643, "step": 624 }, { "epoch": 2.5006675567423233, "grad_norm": 0.565759829851567, "learning_rate": 8.041028120674894e-07, "loss": 0.1568, "step": 625 }, { "epoch": 2.5046728971962615, "grad_norm": 0.6438983607349034, "learning_rate": 7.914361375266505e-07, "loss": 0.167, "step": 626 }, { "epoch": 2.5086782376502, "grad_norm": 0.7266085897712414, "learning_rate": 7.788614431816743e-07, "loss": 0.1775, "step": 627 }, { "epoch": 2.512683578104139, "grad_norm": 0.5951499553068635, "learning_rate": 7.663790038585794e-07, "loss": 0.1567, "step": 628 }, { "epoch": 2.5166889185580774, "grad_norm": 0.6024813088753026, "learning_rate": 7.539890923671061e-07, "loss": 0.1654, "step": 629 }, { "epoch": 2.520694259012016, "grad_norm": 0.5611519260516766, "learning_rate": 7.416919794947536e-07, "loss": 0.1666, "step": 630 }, { "epoch": 2.5246995994659547, "grad_norm": 0.6940972612783466, "learning_rate": 7.294879340008632e-07, "loss": 0.1745, "step": 631 }, { "epoch": 2.5287049399198933, "grad_norm": 0.5590592473739715, "learning_rate": 7.173772226107434e-07, "loss": 0.1698, "step": 632 }, { "epoch": 2.5327102803738315, "grad_norm": 0.591058116554628, "learning_rate": 7.053601100098401e-07, "loss": 0.1671, "step": 633 }, { "epoch": 2.5367156208277706, "grad_norm": 0.6077726003073021, "learning_rate": 6.934368588379553e-07, "loss": 0.1847, "step": 634 }, { "epoch": 2.5407209612817088, "grad_norm": 0.5900365979656913, "learning_rate": 6.816077296835006e-07, "loss": 0.1632, "step": 635 }, { "epoch": 2.5447263017356474, "grad_norm": 0.6291526388062625, "learning_rate": 6.698729810778065e-07, "loss": 0.1669, "step": 636 }, { "epoch": 2.548731642189586, "grad_norm": 0.6405676228401413, "learning_rate": 6.582328694894729e-07, "loss": 0.1678, "step": 637 }, { "epoch": 2.5527369826435247, "grad_norm": 0.5916007034338923, "learning_rate": 6.46687649318759e-07, "loss": 0.1687, "step": 638 }, { "epoch": 2.5567423230974633, "grad_norm": 0.6197597194779472, "learning_rate": 6.352375728920285e-07, "loss": 0.159, "step": 639 }, { "epoch": 2.560747663551402, "grad_norm": 0.6078888459723736, "learning_rate": 6.238828904562316e-07, "loss": 0.1818, "step": 640 }, { "epoch": 2.5647530040053406, "grad_norm": 0.5953442248473361, "learning_rate": 6.126238501734372e-07, "loss": 0.1747, "step": 641 }, { "epoch": 2.568758344459279, "grad_norm": 0.6197098462200006, "learning_rate": 6.014606981154086e-07, "loss": 0.1705, "step": 642 }, { "epoch": 2.572763684913218, "grad_norm": 0.6171410856963269, "learning_rate": 5.903936782582253e-07, "loss": 0.1771, "step": 643 }, { "epoch": 2.576769025367156, "grad_norm": 0.5991169552059425, "learning_rate": 5.794230324769518e-07, "loss": 0.1669, "step": 644 }, { "epoch": 2.5807743658210947, "grad_norm": 0.6103321073889593, "learning_rate": 5.685490005403499e-07, "loss": 0.1726, "step": 645 }, { "epoch": 2.5847797062750333, "grad_norm": 0.6256121075513487, "learning_rate": 5.577718201056392e-07, "loss": 0.1557, "step": 646 }, { "epoch": 2.588785046728972, "grad_norm": 0.6204867013864792, "learning_rate": 5.470917267133041e-07, "loss": 0.1603, "step": 647 }, { "epoch": 2.5927903871829105, "grad_norm": 0.5859779013723672, "learning_rate": 5.365089537819435e-07, "loss": 0.1717, "step": 648 }, { "epoch": 2.596795727636849, "grad_norm": 0.5950814615589916, "learning_rate": 5.260237326031698e-07, "loss": 0.1684, "step": 649 }, { "epoch": 2.600801068090788, "grad_norm": 0.5788284827989015, "learning_rate": 5.156362923365587e-07, "loss": 0.1748, "step": 650 }, { "epoch": 2.6048064085447264, "grad_norm": 0.6500848201090982, "learning_rate": 5.053468600046324e-07, "loss": 0.1551, "step": 651 }, { "epoch": 2.608811748998665, "grad_norm": 0.579677458355888, "learning_rate": 4.951556604879049e-07, "loss": 0.1561, "step": 652 }, { "epoch": 2.6128170894526033, "grad_norm": 0.5938327964966817, "learning_rate": 4.850629165199627e-07, "loss": 0.1748, "step": 653 }, { "epoch": 2.616822429906542, "grad_norm": 0.5844525078342523, "learning_rate": 4.7506884868259996e-07, "loss": 0.1565, "step": 654 }, { "epoch": 2.6208277703604805, "grad_norm": 0.5834653222280398, "learning_rate": 4.651736754009972e-07, "loss": 0.1631, "step": 655 }, { "epoch": 2.624833110814419, "grad_norm": 0.5977062556990842, "learning_rate": 4.5537761293894535e-07, "loss": 0.1555, "step": 656 }, { "epoch": 2.628838451268358, "grad_norm": 0.595817231192985, "learning_rate": 4.456808753941205e-07, "loss": 0.1881, "step": 657 }, { "epoch": 2.6328437917222964, "grad_norm": 0.6038863958838983, "learning_rate": 4.3608367469340553e-07, "loss": 0.1611, "step": 658 }, { "epoch": 2.636849132176235, "grad_norm": 0.5765506637261613, "learning_rate": 4.265862205882559e-07, "loss": 0.1669, "step": 659 }, { "epoch": 2.6408544726301737, "grad_norm": 0.6081323911406266, "learning_rate": 4.171887206501191e-07, "loss": 0.166, "step": 660 }, { "epoch": 2.6448598130841123, "grad_norm": 0.7942617005475607, "learning_rate": 4.078913802658946e-07, "loss": 0.1526, "step": 661 }, { "epoch": 2.6488651535380505, "grad_norm": 0.5745245099031036, "learning_rate": 3.9869440263344714e-07, "loss": 0.1865, "step": 662 }, { "epoch": 2.652870493991989, "grad_norm": 0.6362782671167186, "learning_rate": 3.895979887571649e-07, "loss": 0.1702, "step": 663 }, { "epoch": 2.656875834445928, "grad_norm": 0.6659260281530986, "learning_rate": 3.8060233744356634e-07, "loss": 0.1588, "step": 664 }, { "epoch": 2.6608811748998664, "grad_norm": 0.5821725085601599, "learning_rate": 3.717076452969559e-07, "loss": 0.1585, "step": 665 }, { "epoch": 2.664886515353805, "grad_norm": 0.5721973191552109, "learning_rate": 3.6291410671512597e-07, "loss": 0.1546, "step": 666 }, { "epoch": 2.6688918558077437, "grad_norm": 0.6108240231082995, "learning_rate": 3.542219138851094e-07, "loss": 0.165, "step": 667 }, { "epoch": 2.6728971962616823, "grad_norm": 0.5890590495394973, "learning_rate": 3.4563125677897936e-07, "loss": 0.1697, "step": 668 }, { "epoch": 2.676902536715621, "grad_norm": 0.5687849267564792, "learning_rate": 3.371423231496951e-07, "loss": 0.1737, "step": 669 }, { "epoch": 2.6809078771695596, "grad_norm": 0.6029561874285849, "learning_rate": 3.287552985270015e-07, "loss": 0.1618, "step": 670 }, { "epoch": 2.6849132176234978, "grad_norm": 0.6317046685851485, "learning_rate": 3.204703662133724e-07, "loss": 0.1761, "step": 671 }, { "epoch": 2.688918558077437, "grad_norm": 0.6617100618302314, "learning_rate": 3.122877072800046e-07, "loss": 0.1656, "step": 672 }, { "epoch": 2.692923898531375, "grad_norm": 0.5447463259022025, "learning_rate": 3.0420750056286195e-07, "loss": 0.1611, "step": 673 }, { "epoch": 2.6969292389853137, "grad_norm": 0.6232513934357247, "learning_rate": 2.962299226587639e-07, "loss": 0.1613, "step": 674 }, { "epoch": 2.7009345794392523, "grad_norm": 0.620106407560748, "learning_rate": 2.8835514792152854e-07, "loss": 0.1525, "step": 675 }, { "epoch": 2.704939919893191, "grad_norm": 0.6010185411049328, "learning_rate": 2.8058334845816214e-07, "loss": 0.1491, "step": 676 }, { "epoch": 2.7089452603471296, "grad_norm": 0.5685858072848753, "learning_rate": 2.729146941250954e-07, "loss": 0.1558, "step": 677 }, { "epoch": 2.712950600801068, "grad_norm": 0.605493206929418, "learning_rate": 2.653493525244721e-07, "loss": 0.167, "step": 678 }, { "epoch": 2.716955941255007, "grad_norm": 0.5867378767850222, "learning_rate": 2.5788748900048676e-07, "loss": 0.1622, "step": 679 }, { "epoch": 2.720961281708945, "grad_norm": 0.6167298470042248, "learning_rate": 2.5052926663577006e-07, "loss": 0.161, "step": 680 }, { "epoch": 2.724966622162884, "grad_norm": 0.6730247619234008, "learning_rate": 2.4327484624782684e-07, "loss": 0.159, "step": 681 }, { "epoch": 2.7289719626168223, "grad_norm": 0.5614764326727409, "learning_rate": 2.3612438638551837e-07, "loss": 0.173, "step": 682 }, { "epoch": 2.732977303070761, "grad_norm": 0.6302046681068347, "learning_rate": 2.290780433255979e-07, "loss": 0.1823, "step": 683 }, { "epoch": 2.7369826435246996, "grad_norm": 0.6339293472571709, "learning_rate": 2.2213597106929608e-07, "loss": 0.1653, "step": 684 }, { "epoch": 2.740987983978638, "grad_norm": 0.6169502312976588, "learning_rate": 2.152983213389559e-07, "loss": 0.1738, "step": 685 }, { "epoch": 2.744993324432577, "grad_norm": 0.59777733584903, "learning_rate": 2.085652435747132e-07, "loss": 0.1728, "step": 686 }, { "epoch": 2.7489986648865155, "grad_norm": 0.6587479139210242, "learning_rate": 2.0193688493123588e-07, "loss": 0.1748, "step": 687 }, { "epoch": 2.753004005340454, "grad_norm": 0.5962185804964122, "learning_rate": 1.9541339027450256e-07, "loss": 0.1617, "step": 688 }, { "epoch": 2.7570093457943923, "grad_norm": 0.7505303172182166, "learning_rate": 1.889949021786397e-07, "loss": 0.1722, "step": 689 }, { "epoch": 2.7610146862483314, "grad_norm": 0.5964493036331348, "learning_rate": 1.8268156092280498e-07, "loss": 0.1654, "step": 690 }, { "epoch": 2.7650200267022695, "grad_norm": 0.5762622321084703, "learning_rate": 1.7647350448812105e-07, "loss": 0.1583, "step": 691 }, { "epoch": 2.769025367156208, "grad_norm": 0.6801945089414425, "learning_rate": 1.7037086855465902e-07, "loss": 0.1768, "step": 692 }, { "epoch": 2.773030707610147, "grad_norm": 0.6911899110033116, "learning_rate": 1.6437378649847458e-07, "loss": 0.1732, "step": 693 }, { "epoch": 2.7770360480640854, "grad_norm": 0.6400292646247479, "learning_rate": 1.5848238938869332e-07, "loss": 0.1713, "step": 694 }, { "epoch": 2.781041388518024, "grad_norm": 0.6293782671500883, "learning_rate": 1.5269680598464342e-07, "loss": 0.1698, "step": 695 }, { "epoch": 2.7850467289719627, "grad_norm": 0.6326255144077011, "learning_rate": 1.4701716273304524e-07, "loss": 0.1617, "step": 696 }, { "epoch": 2.7890520694259013, "grad_norm": 0.5877129222871145, "learning_rate": 1.4144358376524504e-07, "loss": 0.1736, "step": 697 }, { "epoch": 2.7930574098798395, "grad_norm": 0.5768063266465919, "learning_rate": 1.3597619089450343e-07, "loss": 0.1678, "step": 698 }, { "epoch": 2.7970627503337786, "grad_norm": 0.6141881975935009, "learning_rate": 1.3061510361333186e-07, "loss": 0.174, "step": 699 }, { "epoch": 2.801068090787717, "grad_norm": 0.6135094476904235, "learning_rate": 1.253604390908819e-07, "loss": 0.1643, "step": 700 }, { "epoch": 2.8050734312416554, "grad_norm": 0.5963941238541673, "learning_rate": 1.2021231217038522e-07, "loss": 0.1719, "step": 701 }, { "epoch": 2.809078771695594, "grad_norm": 0.6188679804552468, "learning_rate": 1.1517083536664142e-07, "loss": 0.1732, "step": 702 }, { "epoch": 2.8130841121495327, "grad_norm": 0.6378524366699763, "learning_rate": 1.10236118863562e-07, "loss": 0.1657, "step": 703 }, { "epoch": 2.8170894526034713, "grad_norm": 0.6624926543396782, "learning_rate": 1.0540827051175817e-07, "loss": 0.163, "step": 704 }, { "epoch": 2.82109479305741, "grad_norm": 0.6241183789468542, "learning_rate": 1.0068739582618781e-07, "loss": 0.1738, "step": 705 }, { "epoch": 2.8251001335113486, "grad_norm": 0.5929380446422712, "learning_rate": 9.607359798384785e-08, "loss": 0.1698, "step": 706 }, { "epoch": 2.8291054739652868, "grad_norm": 0.6379777759896461, "learning_rate": 9.15669778215178e-08, "loss": 0.178, "step": 707 }, { "epoch": 2.833110814419226, "grad_norm": 0.5934788386620728, "learning_rate": 8.716763383355863e-08, "loss": 0.1781, "step": 708 }, { "epoch": 2.837116154873164, "grad_norm": 0.67169486389099, "learning_rate": 8.287566216975795e-08, "loss": 0.1468, "step": 709 }, { "epoch": 2.8411214953271027, "grad_norm": 0.5945998400168926, "learning_rate": 7.869115663322879e-08, "loss": 0.1677, "step": 710 }, { "epoch": 2.8451268357810413, "grad_norm": 0.6379381928228921, "learning_rate": 7.461420867836078e-08, "loss": 0.1596, "step": 711 }, { "epoch": 2.84913217623498, "grad_norm": 0.5694853594582754, "learning_rate": 7.064490740882057e-08, "loss": 0.1564, "step": 712 }, { "epoch": 2.8531375166889186, "grad_norm": 0.6431564447838445, "learning_rate": 6.678333957560513e-08, "loss": 0.1779, "step": 713 }, { "epoch": 2.857142857142857, "grad_norm": 0.603659494625915, "learning_rate": 6.302958957514372e-08, "loss": 0.1703, "step": 714 }, { "epoch": 2.861148197596796, "grad_norm": 0.5731990501391916, "learning_rate": 5.938373944745612e-08, "loss": 0.1687, "step": 715 }, { "epoch": 2.8651535380507345, "grad_norm": 0.6437255392307442, "learning_rate": 5.584586887435739e-08, "loss": 0.1727, "step": 716 }, { "epoch": 2.869158878504673, "grad_norm": 0.5868868712177298, "learning_rate": 5.241605517771753e-08, "loss": 0.1621, "step": 717 }, { "epoch": 2.8731642189586113, "grad_norm": 0.6333270306830352, "learning_rate": 4.909437331777178e-08, "loss": 0.1635, "step": 718 }, { "epoch": 2.87716955941255, "grad_norm": 0.6290385820686785, "learning_rate": 4.588089589148192e-08, "loss": 0.1642, "step": 719 }, { "epoch": 2.8811748998664886, "grad_norm": 0.6191579999515221, "learning_rate": 4.2775693130948094e-08, "loss": 0.1789, "step": 720 }, { "epoch": 2.885180240320427, "grad_norm": 0.5476898057957497, "learning_rate": 3.977883290187667e-08, "loss": 0.1506, "step": 721 }, { "epoch": 2.889185580774366, "grad_norm": 0.578059211687489, "learning_rate": 3.689038070209594e-08, "loss": 0.1592, "step": 722 }, { "epoch": 2.8931909212283045, "grad_norm": 0.5958692191590165, "learning_rate": 3.4110399660123306e-08, "loss": 0.1733, "step": 723 }, { "epoch": 2.897196261682243, "grad_norm": 0.5808682012621369, "learning_rate": 3.143895053378698e-08, "loss": 0.151, "step": 724 }, { "epoch": 2.9012016021361817, "grad_norm": 0.5943310418284603, "learning_rate": 2.8876091708898714e-08, "loss": 0.1733, "step": 725 }, { "epoch": 2.9052069425901204, "grad_norm": 0.5647474381435812, "learning_rate": 2.642187919797479e-08, "loss": 0.1621, "step": 726 }, { "epoch": 2.9092122830440585, "grad_norm": 0.5948502812847656, "learning_rate": 2.4076366639015914e-08, "loss": 0.158, "step": 727 }, { "epoch": 2.913217623497997, "grad_norm": 0.595088477171892, "learning_rate": 2.1839605294330935e-08, "loss": 0.1754, "step": 728 }, { "epoch": 2.917222963951936, "grad_norm": 0.5612839333804125, "learning_rate": 1.97116440494205e-08, "loss": 0.1539, "step": 729 }, { "epoch": 2.9212283044058744, "grad_norm": 0.6167465533535336, "learning_rate": 1.769252941190458e-08, "loss": 0.1708, "step": 730 }, { "epoch": 2.925233644859813, "grad_norm": 0.6811415193373171, "learning_rate": 1.5782305510508855e-08, "loss": 0.1712, "step": 731 }, { "epoch": 2.9292389853137517, "grad_norm": 0.6295676411628186, "learning_rate": 1.3981014094099354e-08, "loss": 0.1606, "step": 732 }, { "epoch": 2.9332443257676903, "grad_norm": 0.5883747534045934, "learning_rate": 1.2288694530769862e-08, "loss": 0.1713, "step": 733 }, { "epoch": 2.937249666221629, "grad_norm": 0.6467588103154122, "learning_rate": 1.0705383806982606e-08, "loss": 0.1882, "step": 734 }, { "epoch": 2.9412550066755676, "grad_norm": 0.7011732558711389, "learning_rate": 9.231116526757234e-09, "loss": 0.1704, "step": 735 }, { "epoch": 2.945260347129506, "grad_norm": 0.6116779172964563, "learning_rate": 7.865924910916977e-09, "loss": 0.1698, "step": 736 }, { "epoch": 2.9492656875834444, "grad_norm": 0.6138331601067519, "learning_rate": 6.609838796385326e-09, "loss": 0.1621, "step": 737 }, { "epoch": 2.953271028037383, "grad_norm": 0.5884709984985781, "learning_rate": 5.4628856355293245e-09, "loss": 0.1704, "step": 738 }, { "epoch": 2.9572763684913217, "grad_norm": 0.6040205984929831, "learning_rate": 4.4250904955656095e-09, "loss": 0.1665, "step": 739 }, { "epoch": 2.9612817089452603, "grad_norm": 0.5944128522570793, "learning_rate": 3.496476058006959e-09, "loss": 0.1696, "step": 740 }, { "epoch": 2.965287049399199, "grad_norm": 0.5824311819553499, "learning_rate": 2.6770626181715776e-09, "loss": 0.1878, "step": 741 }, { "epoch": 2.9692923898531376, "grad_norm": 0.69435894344096, "learning_rate": 1.9668680847356735e-09, "loss": 0.1689, "step": 742 }, { "epoch": 2.9732977303070762, "grad_norm": 0.6630378694543846, "learning_rate": 1.3659079793432173e-09, "loss": 0.1518, "step": 743 }, { "epoch": 2.977303070761015, "grad_norm": 0.5642162664981942, "learning_rate": 8.741954362678773e-10, "loss": 0.1691, "step": 744 }, { "epoch": 2.981308411214953, "grad_norm": 0.5752610766888291, "learning_rate": 4.91741202124918e-10, "loss": 0.1637, "step": 745 }, { "epoch": 2.985313751668892, "grad_norm": 0.6316838155198957, "learning_rate": 2.1855363563638708e-10, "loss": 0.188, "step": 746 }, { "epoch": 2.9893190921228303, "grad_norm": 0.5868868040589227, "learning_rate": 5.4638707447929315e-11, "loss": 0.1496, "step": 747 }, { "epoch": 2.9893190921228303, "step": 747, "total_flos": 194126978285568.0, "train_loss": 0.254960169514499, "train_runtime": 5438.7593, "train_samples_per_second": 13.217, "train_steps_per_second": 0.137 } ], "logging_steps": 1, "max_steps": 747, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": -747, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 194126978285568.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }