{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05755395683453238, "eval_steps": 50, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.755395683453237e-05, "grad_norm": 19.742076873779297, "learning_rate": 0.0, "loss": 1.1824, "step": 1 }, { "epoch": 0.00011510791366906474, "grad_norm": 14.090814590454102, "learning_rate": 2.0000000000000002e-07, "loss": 1.5161, "step": 2 }, { "epoch": 0.00017266187050359714, "grad_norm": 9.585816383361816, "learning_rate": 4.0000000000000003e-07, "loss": 0.4535, "step": 3 }, { "epoch": 0.0002302158273381295, "grad_norm": 13.000758171081543, "learning_rate": 6.000000000000001e-07, "loss": 1.1017, "step": 4 }, { "epoch": 0.00028776978417266187, "grad_norm": 7.664703369140625, "learning_rate": 8.000000000000001e-07, "loss": 0.7945, "step": 5 }, { "epoch": 0.00034532374100719427, "grad_norm": 12.25283145904541, "learning_rate": 1.0000000000000002e-06, "loss": 1.1652, "step": 6 }, { "epoch": 0.0004028776978417266, "grad_norm": 11.274255752563477, "learning_rate": 1.2000000000000002e-06, "loss": 0.719, "step": 7 }, { "epoch": 0.000460431654676259, "grad_norm": 7.998445510864258, "learning_rate": 1.4000000000000001e-06, "loss": 0.6316, "step": 8 }, { "epoch": 0.0005179856115107913, "grad_norm": 8.421154022216797, "learning_rate": 1.6000000000000001e-06, "loss": 0.8872, "step": 9 }, { "epoch": 0.0005755395683453237, "grad_norm": 9.403829574584961, "learning_rate": 1.8000000000000001e-06, "loss": 0.7519, "step": 10 }, { "epoch": 0.0006330935251798561, "grad_norm": 14.926182746887207, "learning_rate": 2.0000000000000003e-06, "loss": 0.8313, "step": 11 }, { "epoch": 0.0006906474820143885, "grad_norm": 5.170341491699219, "learning_rate": 2.2e-06, "loss": 0.4195, "step": 12 }, { "epoch": 0.0007482014388489208, "grad_norm": 4.143105506896973, "learning_rate": 2.4000000000000003e-06, "loss": 0.3894, "step": 13 }, { "epoch": 0.0008057553956834532, "grad_norm": 7.147073268890381, "learning_rate": 2.6e-06, "loss": 0.5323, "step": 14 }, { "epoch": 0.0008633093525179857, "grad_norm": 5.619192600250244, "learning_rate": 2.8000000000000003e-06, "loss": 0.3687, "step": 15 }, { "epoch": 0.000920863309352518, "grad_norm": 6.790902614593506, "learning_rate": 3e-06, "loss": 0.8943, "step": 16 }, { "epoch": 0.0009784172661870504, "grad_norm": 7.149299621582031, "learning_rate": 3.2000000000000003e-06, "loss": 0.5624, "step": 17 }, { "epoch": 0.0010359712230215827, "grad_norm": 6.0668439865112305, "learning_rate": 3.4000000000000005e-06, "loss": 0.5257, "step": 18 }, { "epoch": 0.0010935251798561152, "grad_norm": 7.927511692047119, "learning_rate": 3.6000000000000003e-06, "loss": 0.7319, "step": 19 }, { "epoch": 0.0011510791366906475, "grad_norm": 3.352524757385254, "learning_rate": 3.8000000000000005e-06, "loss": 0.3106, "step": 20 }, { "epoch": 0.0012086330935251798, "grad_norm": 17.440523147583008, "learning_rate": 4.000000000000001e-06, "loss": 1.4364, "step": 21 }, { "epoch": 0.0012661870503597123, "grad_norm": 3.9956955909729004, "learning_rate": 4.2000000000000004e-06, "loss": 0.4213, "step": 22 }, { "epoch": 0.0013237410071942446, "grad_norm": 8.989081382751465, "learning_rate": 4.4e-06, "loss": 0.552, "step": 23 }, { "epoch": 0.001381294964028777, "grad_norm": 8.471776008605957, "learning_rate": 4.600000000000001e-06, "loss": 0.8194, "step": 24 }, { "epoch": 0.0014388489208633094, "grad_norm": 9.24136734008789, "learning_rate": 4.800000000000001e-06, "loss": 0.7052, "step": 25 }, { "epoch": 0.0014964028776978417, "grad_norm": 4.3442158699035645, "learning_rate": 5e-06, "loss": 0.3331, "step": 26 }, { "epoch": 0.0015539568345323742, "grad_norm": 9.657572746276855, "learning_rate": 5.2e-06, "loss": 0.5311, "step": 27 }, { "epoch": 0.0016115107913669065, "grad_norm": 7.1384992599487305, "learning_rate": 5.400000000000001e-06, "loss": 0.3951, "step": 28 }, { "epoch": 0.0016690647482014388, "grad_norm": 16.853891372680664, "learning_rate": 5.600000000000001e-06, "loss": 1.3869, "step": 29 }, { "epoch": 0.0017266187050359713, "grad_norm": 9.793233871459961, "learning_rate": 5.8e-06, "loss": 0.8952, "step": 30 }, { "epoch": 0.0017841726618705036, "grad_norm": 7.212555408477783, "learning_rate": 6e-06, "loss": 0.7198, "step": 31 }, { "epoch": 0.001841726618705036, "grad_norm": 6.492921829223633, "learning_rate": 6.200000000000001e-06, "loss": 0.5619, "step": 32 }, { "epoch": 0.0018992805755395684, "grad_norm": 6.3283915519714355, "learning_rate": 6.4000000000000006e-06, "loss": 1.0968, "step": 33 }, { "epoch": 0.0019568345323741007, "grad_norm": 9.577136993408203, "learning_rate": 6.600000000000001e-06, "loss": 0.8173, "step": 34 }, { "epoch": 0.002014388489208633, "grad_norm": 9.083664894104004, "learning_rate": 6.800000000000001e-06, "loss": 0.8457, "step": 35 }, { "epoch": 0.0020719424460431653, "grad_norm": 4.176952362060547, "learning_rate": 7e-06, "loss": 0.3089, "step": 36 }, { "epoch": 0.002129496402877698, "grad_norm": 5.284554958343506, "learning_rate": 7.2000000000000005e-06, "loss": 0.3452, "step": 37 }, { "epoch": 0.0021870503597122303, "grad_norm": 12.007137298583984, "learning_rate": 7.4e-06, "loss": 1.4472, "step": 38 }, { "epoch": 0.0022446043165467626, "grad_norm": 9.785158157348633, "learning_rate": 7.600000000000001e-06, "loss": 1.2897, "step": 39 }, { "epoch": 0.002302158273381295, "grad_norm": 8.333564758300781, "learning_rate": 7.800000000000002e-06, "loss": 0.3833, "step": 40 }, { "epoch": 0.0023597122302158272, "grad_norm": 5.145090103149414, "learning_rate": 8.000000000000001e-06, "loss": 0.2996, "step": 41 }, { "epoch": 0.0024172661870503595, "grad_norm": 10.46713924407959, "learning_rate": 8.2e-06, "loss": 0.6667, "step": 42 }, { "epoch": 0.0024748201438848923, "grad_norm": 4.508591175079346, "learning_rate": 8.400000000000001e-06, "loss": 0.4044, "step": 43 }, { "epoch": 0.0025323741007194246, "grad_norm": 9.932887077331543, "learning_rate": 8.6e-06, "loss": 0.6707, "step": 44 }, { "epoch": 0.002589928057553957, "grad_norm": 4.800012588500977, "learning_rate": 8.8e-06, "loss": 0.5579, "step": 45 }, { "epoch": 0.002647482014388489, "grad_norm": 2.2920901775360107, "learning_rate": 9e-06, "loss": 0.2296, "step": 46 }, { "epoch": 0.0027050359712230214, "grad_norm": 7.941588878631592, "learning_rate": 9.200000000000002e-06, "loss": 0.5172, "step": 47 }, { "epoch": 0.002762589928057554, "grad_norm": 13.144063949584961, "learning_rate": 9.4e-06, "loss": 1.1074, "step": 48 }, { "epoch": 0.0028201438848920865, "grad_norm": 4.377220630645752, "learning_rate": 9.600000000000001e-06, "loss": 0.2969, "step": 49 }, { "epoch": 0.0028776978417266188, "grad_norm": 6.273579120635986, "learning_rate": 9.800000000000001e-06, "loss": 0.382, "step": 50 }, { "epoch": 0.002935251798561151, "grad_norm": 6.432647705078125, "learning_rate": 1e-05, "loss": 0.7747, "step": 51 }, { "epoch": 0.0029928057553956834, "grad_norm": 9.977968215942383, "learning_rate": 1.02e-05, "loss": 0.6861, "step": 52 }, { "epoch": 0.0030503597122302157, "grad_norm": 6.838860034942627, "learning_rate": 1.04e-05, "loss": 0.4142, "step": 53 }, { "epoch": 0.0031079136690647484, "grad_norm": 15.348876953125, "learning_rate": 1.0600000000000002e-05, "loss": 1.1043, "step": 54 }, { "epoch": 0.0031654676258992807, "grad_norm": 4.6307291984558105, "learning_rate": 1.0800000000000002e-05, "loss": 0.5295, "step": 55 }, { "epoch": 0.003223021582733813, "grad_norm": 13.125807762145996, "learning_rate": 1.1000000000000001e-05, "loss": 0.9708, "step": 56 }, { "epoch": 0.0032805755395683453, "grad_norm": 11.310308456420898, "learning_rate": 1.1200000000000001e-05, "loss": 1.0685, "step": 57 }, { "epoch": 0.0033381294964028776, "grad_norm": 4.394148826599121, "learning_rate": 1.14e-05, "loss": 0.274, "step": 58 }, { "epoch": 0.00339568345323741, "grad_norm": 2.017930030822754, "learning_rate": 1.16e-05, "loss": 0.1795, "step": 59 }, { "epoch": 0.0034532374100719426, "grad_norm": 5.994359970092773, "learning_rate": 1.18e-05, "loss": 0.5362, "step": 60 }, { "epoch": 0.003510791366906475, "grad_norm": 5.567146301269531, "learning_rate": 1.2e-05, "loss": 0.7449, "step": 61 }, { "epoch": 0.003568345323741007, "grad_norm": 4.293890476226807, "learning_rate": 1.22e-05, "loss": 0.5452, "step": 62 }, { "epoch": 0.0036258992805755395, "grad_norm": 8.327472686767578, "learning_rate": 1.2400000000000002e-05, "loss": 0.6844, "step": 63 }, { "epoch": 0.003683453237410072, "grad_norm": 4.723773002624512, "learning_rate": 1.2600000000000001e-05, "loss": 0.3286, "step": 64 }, { "epoch": 0.0037410071942446045, "grad_norm": 10.427044868469238, "learning_rate": 1.2800000000000001e-05, "loss": 0.7127, "step": 65 }, { "epoch": 0.003798561151079137, "grad_norm": 3.913362741470337, "learning_rate": 1.3000000000000001e-05, "loss": 0.4672, "step": 66 }, { "epoch": 0.003856115107913669, "grad_norm": 4.887617588043213, "learning_rate": 1.3200000000000002e-05, "loss": 0.5162, "step": 67 }, { "epoch": 0.003913669064748201, "grad_norm": 6.661849021911621, "learning_rate": 1.3400000000000002e-05, "loss": 0.6116, "step": 68 }, { "epoch": 0.003971223021582734, "grad_norm": 7.069644451141357, "learning_rate": 1.3600000000000002e-05, "loss": 0.637, "step": 69 }, { "epoch": 0.004028776978417266, "grad_norm": 4.68384313583374, "learning_rate": 1.38e-05, "loss": 0.4406, "step": 70 }, { "epoch": 0.004086330935251799, "grad_norm": 5.5008134841918945, "learning_rate": 1.4e-05, "loss": 0.3483, "step": 71 }, { "epoch": 0.004143884892086331, "grad_norm": 4.3354105949401855, "learning_rate": 1.4200000000000001e-05, "loss": 0.5613, "step": 72 }, { "epoch": 0.004201438848920863, "grad_norm": 5.105152130126953, "learning_rate": 1.4400000000000001e-05, "loss": 0.3011, "step": 73 }, { "epoch": 0.004258992805755396, "grad_norm": 2.7917816638946533, "learning_rate": 1.46e-05, "loss": 0.2686, "step": 74 }, { "epoch": 0.004316546762589928, "grad_norm": 4.981333255767822, "learning_rate": 1.48e-05, "loss": 0.3631, "step": 75 }, { "epoch": 0.004374100719424461, "grad_norm": 3.1867117881774902, "learning_rate": 1.5000000000000002e-05, "loss": 0.4055, "step": 76 }, { "epoch": 0.0044316546762589925, "grad_norm": 7.01395320892334, "learning_rate": 1.5200000000000002e-05, "loss": 0.5945, "step": 77 }, { "epoch": 0.004489208633093525, "grad_norm": 7.518090724945068, "learning_rate": 1.54e-05, "loss": 0.9833, "step": 78 }, { "epoch": 0.004546762589928057, "grad_norm": 3.8727071285247803, "learning_rate": 1.5600000000000003e-05, "loss": 0.3201, "step": 79 }, { "epoch": 0.00460431654676259, "grad_norm": 3.5626542568206787, "learning_rate": 1.58e-05, "loss": 0.4482, "step": 80 }, { "epoch": 0.004661870503597123, "grad_norm": 4.564299583435059, "learning_rate": 1.6000000000000003e-05, "loss": 0.3674, "step": 81 }, { "epoch": 0.0047194244604316545, "grad_norm": 3.150601387023926, "learning_rate": 1.62e-05, "loss": 0.4653, "step": 82 }, { "epoch": 0.004776978417266187, "grad_norm": 4.487133502960205, "learning_rate": 1.64e-05, "loss": 0.3553, "step": 83 }, { "epoch": 0.004834532374100719, "grad_norm": 9.726006507873535, "learning_rate": 1.66e-05, "loss": 0.7446, "step": 84 }, { "epoch": 0.004892086330935252, "grad_norm": 4.138358116149902, "learning_rate": 1.6800000000000002e-05, "loss": 0.2264, "step": 85 }, { "epoch": 0.0049496402877697845, "grad_norm": 4.193305015563965, "learning_rate": 1.7e-05, "loss": 0.3754, "step": 86 }, { "epoch": 0.005007194244604316, "grad_norm": 5.699273109436035, "learning_rate": 1.72e-05, "loss": 0.2857, "step": 87 }, { "epoch": 0.005064748201438849, "grad_norm": 3.635141134262085, "learning_rate": 1.7400000000000003e-05, "loss": 0.4759, "step": 88 }, { "epoch": 0.005122302158273381, "grad_norm": 4.9962592124938965, "learning_rate": 1.76e-05, "loss": 0.3983, "step": 89 }, { "epoch": 0.005179856115107914, "grad_norm": 9.600940704345703, "learning_rate": 1.7800000000000002e-05, "loss": 0.6997, "step": 90 }, { "epoch": 0.005237410071942446, "grad_norm": 5.990379333496094, "learning_rate": 1.8e-05, "loss": 0.325, "step": 91 }, { "epoch": 0.005294964028776978, "grad_norm": 3.9311363697052, "learning_rate": 1.8200000000000002e-05, "loss": 0.221, "step": 92 }, { "epoch": 0.005352517985611511, "grad_norm": 4.358214378356934, "learning_rate": 1.8400000000000003e-05, "loss": 0.5484, "step": 93 }, { "epoch": 0.005410071942446043, "grad_norm": 4.867093086242676, "learning_rate": 1.86e-05, "loss": 0.3255, "step": 94 }, { "epoch": 0.005467625899280576, "grad_norm": 5.905974864959717, "learning_rate": 1.88e-05, "loss": 0.7437, "step": 95 }, { "epoch": 0.005525179856115108, "grad_norm": 3.0502068996429443, "learning_rate": 1.9e-05, "loss": 0.3985, "step": 96 }, { "epoch": 0.00558273381294964, "grad_norm": 3.0659244060516357, "learning_rate": 1.9200000000000003e-05, "loss": 0.4518, "step": 97 }, { "epoch": 0.005640287769784173, "grad_norm": 5.97324800491333, "learning_rate": 1.94e-05, "loss": 0.5574, "step": 98 }, { "epoch": 0.005697841726618705, "grad_norm": 5.410953998565674, "learning_rate": 1.9600000000000002e-05, "loss": 0.6145, "step": 99 }, { "epoch": 0.0057553956834532375, "grad_norm": 3.2696614265441895, "learning_rate": 1.98e-05, "loss": 0.3616, "step": 100 }, { "epoch": 0.005812949640287769, "grad_norm": 6.245858192443848, "learning_rate": 2e-05, "loss": 0.2664, "step": 101 }, { "epoch": 0.005870503597122302, "grad_norm": 8.132523536682129, "learning_rate": 1.9999939076577906e-05, "loss": 0.4025, "step": 102 }, { "epoch": 0.005928057553956835, "grad_norm": 11.821637153625488, "learning_rate": 1.9999756307053947e-05, "loss": 0.9175, "step": 103 }, { "epoch": 0.005985611510791367, "grad_norm": 5.507436752319336, "learning_rate": 1.9999451693655125e-05, "loss": 0.5407, "step": 104 }, { "epoch": 0.0060431654676258995, "grad_norm": 2.0903306007385254, "learning_rate": 1.9999025240093045e-05, "loss": 0.2037, "step": 105 }, { "epoch": 0.006100719424460431, "grad_norm": 6.703708648681641, "learning_rate": 1.9998476951563914e-05, "loss": 0.3923, "step": 106 }, { "epoch": 0.006158273381294964, "grad_norm": 7.479000091552734, "learning_rate": 1.9997806834748455e-05, "loss": 0.5959, "step": 107 }, { "epoch": 0.006215827338129497, "grad_norm": 3.7436177730560303, "learning_rate": 1.9997014897811834e-05, "loss": 0.3848, "step": 108 }, { "epoch": 0.006273381294964029, "grad_norm": 6.0655388832092285, "learning_rate": 1.9996101150403543e-05, "loss": 0.3867, "step": 109 }, { "epoch": 0.006330935251798561, "grad_norm": 2.6236915588378906, "learning_rate": 1.9995065603657317e-05, "loss": 0.2235, "step": 110 }, { "epoch": 0.006388489208633093, "grad_norm": 3.6318767070770264, "learning_rate": 1.999390827019096e-05, "loss": 0.2775, "step": 111 }, { "epoch": 0.006446043165467626, "grad_norm": 3.6131348609924316, "learning_rate": 1.999262916410621e-05, "loss": 0.473, "step": 112 }, { "epoch": 0.006503597122302159, "grad_norm": 5.235411167144775, "learning_rate": 1.9991228300988586e-05, "loss": 0.5327, "step": 113 }, { "epoch": 0.0065611510791366906, "grad_norm": 4.830928802490234, "learning_rate": 1.998970569790715e-05, "loss": 0.3054, "step": 114 }, { "epoch": 0.006618705035971223, "grad_norm": 3.149054527282715, "learning_rate": 1.9988061373414342e-05, "loss": 0.2049, "step": 115 }, { "epoch": 0.006676258992805755, "grad_norm": 0.9228267669677734, "learning_rate": 1.9986295347545738e-05, "loss": 0.137, "step": 116 }, { "epoch": 0.006733812949640288, "grad_norm": 1.2999602556228638, "learning_rate": 1.9984407641819812e-05, "loss": 0.1165, "step": 117 }, { "epoch": 0.00679136690647482, "grad_norm": 5.368565082550049, "learning_rate": 1.9982398279237657e-05, "loss": 0.4314, "step": 118 }, { "epoch": 0.0068489208633093525, "grad_norm": 11.419666290283203, "learning_rate": 1.9980267284282718e-05, "loss": 0.7708, "step": 119 }, { "epoch": 0.006906474820143885, "grad_norm": 5.084501266479492, "learning_rate": 1.9978014682920503e-05, "loss": 0.2652, "step": 120 }, { "epoch": 0.006964028776978417, "grad_norm": 5.778707027435303, "learning_rate": 1.9975640502598243e-05, "loss": 0.6306, "step": 121 }, { "epoch": 0.00702158273381295, "grad_norm": 4.678982257843018, "learning_rate": 1.997314477224458e-05, "loss": 0.2769, "step": 122 }, { "epoch": 0.007079136690647482, "grad_norm": 3.2127490043640137, "learning_rate": 1.9970527522269204e-05, "loss": 0.3237, "step": 123 }, { "epoch": 0.007136690647482014, "grad_norm": 8.614684104919434, "learning_rate": 1.9967788784562474e-05, "loss": 0.7173, "step": 124 }, { "epoch": 0.007194244604316547, "grad_norm": 7.342788219451904, "learning_rate": 1.9964928592495046e-05, "loss": 0.6905, "step": 125 }, { "epoch": 0.007251798561151079, "grad_norm": 3.408464193344116, "learning_rate": 1.9961946980917457e-05, "loss": 0.2835, "step": 126 }, { "epoch": 0.007309352517985612, "grad_norm": 4.440028667449951, "learning_rate": 1.9958843986159705e-05, "loss": 0.2777, "step": 127 }, { "epoch": 0.007366906474820144, "grad_norm": 2.4154090881347656, "learning_rate": 1.99556196460308e-05, "loss": 0.205, "step": 128 }, { "epoch": 0.007424460431654676, "grad_norm": 2.5792348384857178, "learning_rate": 1.9952273999818312e-05, "loss": 0.2947, "step": 129 }, { "epoch": 0.007482014388489209, "grad_norm": 5.055807113647461, "learning_rate": 1.9948807088287884e-05, "loss": 0.4372, "step": 130 }, { "epoch": 0.007539568345323741, "grad_norm": 5.54684591293335, "learning_rate": 1.9945218953682736e-05, "loss": 0.305, "step": 131 }, { "epoch": 0.007597122302158274, "grad_norm": 5.550899028778076, "learning_rate": 1.9941509639723155e-05, "loss": 0.5005, "step": 132 }, { "epoch": 0.0076546762589928055, "grad_norm": 6.129327297210693, "learning_rate": 1.9937679191605964e-05, "loss": 0.7251, "step": 133 }, { "epoch": 0.007712230215827338, "grad_norm": 14.2527437210083, "learning_rate": 1.9933727656003964e-05, "loss": 0.6724, "step": 134 }, { "epoch": 0.00776978417266187, "grad_norm": 5.460100173950195, "learning_rate": 1.992965508106537e-05, "loss": 0.3455, "step": 135 }, { "epoch": 0.007827338129496403, "grad_norm": 6.3173828125, "learning_rate": 1.9925461516413224e-05, "loss": 0.5253, "step": 136 }, { "epoch": 0.007884892086330935, "grad_norm": 2.2781403064727783, "learning_rate": 1.9921147013144782e-05, "loss": 0.284, "step": 137 }, { "epoch": 0.007942446043165468, "grad_norm": 8.100839614868164, "learning_rate": 1.9916711623830904e-05, "loss": 0.561, "step": 138 }, { "epoch": 0.008, "grad_norm": 2.185936212539673, "learning_rate": 1.991215540251542e-05, "loss": 0.2032, "step": 139 }, { "epoch": 0.008057553956834532, "grad_norm": 3.2472143173217773, "learning_rate": 1.9907478404714438e-05, "loss": 0.3126, "step": 140 }, { "epoch": 0.008115107913669064, "grad_norm": 2.253760576248169, "learning_rate": 1.9902680687415704e-05, "loss": 0.28, "step": 141 }, { "epoch": 0.008172661870503597, "grad_norm": 2.6927177906036377, "learning_rate": 1.989776230907789e-05, "loss": 0.32, "step": 142 }, { "epoch": 0.00823021582733813, "grad_norm": 6.3567681312561035, "learning_rate": 1.9892723329629885e-05, "loss": 0.4475, "step": 143 }, { "epoch": 0.008287769784172661, "grad_norm": 2.409682512283325, "learning_rate": 1.988756381047006e-05, "loss": 0.1429, "step": 144 }, { "epoch": 0.008345323741007195, "grad_norm": 1.8926228284835815, "learning_rate": 1.988228381446553e-05, "loss": 0.1483, "step": 145 }, { "epoch": 0.008402877697841727, "grad_norm": 4.373330116271973, "learning_rate": 1.9876883405951378e-05, "loss": 0.2895, "step": 146 }, { "epoch": 0.008460431654676259, "grad_norm": 5.756573677062988, "learning_rate": 1.987136265072988e-05, "loss": 0.3329, "step": 147 }, { "epoch": 0.008517985611510792, "grad_norm": 4.497378826141357, "learning_rate": 1.9865721616069695e-05, "loss": 0.4046, "step": 148 }, { "epoch": 0.008575539568345324, "grad_norm": 3.908770799636841, "learning_rate": 1.985996037070505e-05, "loss": 0.3289, "step": 149 }, { "epoch": 0.008633093525179856, "grad_norm": 9.844822883605957, "learning_rate": 1.9854078984834904e-05, "loss": 0.4939, "step": 150 }, { "epoch": 0.008690647482014388, "grad_norm": 8.726676940917969, "learning_rate": 1.9848077530122083e-05, "loss": 0.5217, "step": 151 }, { "epoch": 0.008748201438848921, "grad_norm": 2.3976569175720215, "learning_rate": 1.984195607969242e-05, "loss": 0.0699, "step": 152 }, { "epoch": 0.008805755395683453, "grad_norm": 5.398916244506836, "learning_rate": 1.983571470813386e-05, "loss": 0.4521, "step": 153 }, { "epoch": 0.008863309352517985, "grad_norm": 6.7461137771606445, "learning_rate": 1.9829353491495545e-05, "loss": 0.3828, "step": 154 }, { "epoch": 0.008920863309352519, "grad_norm": 4.338736057281494, "learning_rate": 1.982287250728689e-05, "loss": 0.3047, "step": 155 }, { "epoch": 0.00897841726618705, "grad_norm": 6.214763164520264, "learning_rate": 1.9816271834476642e-05, "loss": 0.3477, "step": 156 }, { "epoch": 0.009035971223021582, "grad_norm": 2.8336939811706543, "learning_rate": 1.9809551553491918e-05, "loss": 0.3321, "step": 157 }, { "epoch": 0.009093525179856114, "grad_norm": 4.502871990203857, "learning_rate": 1.9802711746217222e-05, "loss": 0.4364, "step": 158 }, { "epoch": 0.009151079136690648, "grad_norm": 8.72002124786377, "learning_rate": 1.979575249599344e-05, "loss": 0.4955, "step": 159 }, { "epoch": 0.00920863309352518, "grad_norm": 6.201029300689697, "learning_rate": 1.9788673887616852e-05, "loss": 0.2945, "step": 160 }, { "epoch": 0.009266187050359712, "grad_norm": 8.309669494628906, "learning_rate": 1.9781476007338058e-05, "loss": 0.3601, "step": 161 }, { "epoch": 0.009323741007194245, "grad_norm": 2.303840398788452, "learning_rate": 1.9774158942860962e-05, "loss": 0.2439, "step": 162 }, { "epoch": 0.009381294964028777, "grad_norm": 1.9197461605072021, "learning_rate": 1.9766722783341682e-05, "loss": 0.1073, "step": 163 }, { "epoch": 0.009438848920863309, "grad_norm": 2.906818389892578, "learning_rate": 1.9759167619387474e-05, "loss": 0.3538, "step": 164 }, { "epoch": 0.009496402877697842, "grad_norm": 6.95295524597168, "learning_rate": 1.9751493543055634e-05, "loss": 0.7104, "step": 165 }, { "epoch": 0.009553956834532374, "grad_norm": 13.919471740722656, "learning_rate": 1.9743700647852356e-05, "loss": 0.5481, "step": 166 }, { "epoch": 0.009611510791366906, "grad_norm": 5.098160743713379, "learning_rate": 1.9735789028731603e-05, "loss": 0.6219, "step": 167 }, { "epoch": 0.009669064748201438, "grad_norm": 2.4541916847229004, "learning_rate": 1.972775878209397e-05, "loss": 0.1824, "step": 168 }, { "epoch": 0.009726618705035972, "grad_norm": 2.5615649223327637, "learning_rate": 1.9719610005785466e-05, "loss": 0.2294, "step": 169 }, { "epoch": 0.009784172661870504, "grad_norm": 5.56582498550415, "learning_rate": 1.971134279909636e-05, "loss": 0.4881, "step": 170 }, { "epoch": 0.009841726618705035, "grad_norm": 3.3772428035736084, "learning_rate": 1.9702957262759964e-05, "loss": 0.2457, "step": 171 }, { "epoch": 0.009899280575539569, "grad_norm": 3.7561800479888916, "learning_rate": 1.9694453498951392e-05, "loss": 0.2876, "step": 172 }, { "epoch": 0.009956834532374101, "grad_norm": 4.44319486618042, "learning_rate": 1.9685831611286312e-05, "loss": 0.3955, "step": 173 }, { "epoch": 0.010014388489208633, "grad_norm": 7.627588748931885, "learning_rate": 1.9677091704819714e-05, "loss": 0.5145, "step": 174 }, { "epoch": 0.010071942446043165, "grad_norm": 2.9267642498016357, "learning_rate": 1.9668233886044597e-05, "loss": 0.2853, "step": 175 }, { "epoch": 0.010129496402877698, "grad_norm": 2.577763319015503, "learning_rate": 1.9659258262890683e-05, "loss": 0.1479, "step": 176 }, { "epoch": 0.01018705035971223, "grad_norm": 3.698162794113159, "learning_rate": 1.9650164944723116e-05, "loss": 0.3178, "step": 177 }, { "epoch": 0.010244604316546762, "grad_norm": 3.1427218914031982, "learning_rate": 1.96409540423411e-05, "loss": 0.3621, "step": 178 }, { "epoch": 0.010302158273381296, "grad_norm": 6.883255958557129, "learning_rate": 1.9631625667976584e-05, "loss": 0.6752, "step": 179 }, { "epoch": 0.010359712230215827, "grad_norm": 5.5977044105529785, "learning_rate": 1.9622179935292855e-05, "loss": 0.2751, "step": 180 }, { "epoch": 0.01041726618705036, "grad_norm": 6.071504592895508, "learning_rate": 1.961261695938319e-05, "loss": 0.2445, "step": 181 }, { "epoch": 0.010474820143884893, "grad_norm": 3.8929355144500732, "learning_rate": 1.9602936856769432e-05, "loss": 0.3608, "step": 182 }, { "epoch": 0.010532374100719425, "grad_norm": 3.836050033569336, "learning_rate": 1.9593139745400575e-05, "loss": 0.2327, "step": 183 }, { "epoch": 0.010589928057553957, "grad_norm": 11.480710983276367, "learning_rate": 1.9583225744651334e-05, "loss": 0.6428, "step": 184 }, { "epoch": 0.010647482014388488, "grad_norm": 4.123970985412598, "learning_rate": 1.9573194975320672e-05, "loss": 0.2694, "step": 185 }, { "epoch": 0.010705035971223022, "grad_norm": 5.072369575500488, "learning_rate": 1.9563047559630356e-05, "loss": 0.3314, "step": 186 }, { "epoch": 0.010762589928057554, "grad_norm": 6.05760383605957, "learning_rate": 1.9552783621223437e-05, "loss": 0.4093, "step": 187 }, { "epoch": 0.010820143884892086, "grad_norm": 1.4690526723861694, "learning_rate": 1.954240328516277e-05, "loss": 0.1567, "step": 188 }, { "epoch": 0.01087769784172662, "grad_norm": 4.700074672698975, "learning_rate": 1.9531906677929472e-05, "loss": 0.224, "step": 189 }, { "epoch": 0.010935251798561151, "grad_norm": 7.230707168579102, "learning_rate": 1.9521293927421388e-05, "loss": 0.5614, "step": 190 }, { "epoch": 0.010992805755395683, "grad_norm": 3.8735275268554688, "learning_rate": 1.9510565162951538e-05, "loss": 0.2567, "step": 191 }, { "epoch": 0.011050359712230217, "grad_norm": 1.882738471031189, "learning_rate": 1.9499720515246524e-05, "loss": 0.1471, "step": 192 }, { "epoch": 0.011107913669064749, "grad_norm": 11.0597505569458, "learning_rate": 1.9488760116444966e-05, "loss": 0.6557, "step": 193 }, { "epoch": 0.01116546762589928, "grad_norm": 8.127897262573242, "learning_rate": 1.947768410009586e-05, "loss": 0.6195, "step": 194 }, { "epoch": 0.011223021582733812, "grad_norm": 10.546082496643066, "learning_rate": 1.9466492601156964e-05, "loss": 0.6428, "step": 195 }, { "epoch": 0.011280575539568346, "grad_norm": 5.149999141693115, "learning_rate": 1.945518575599317e-05, "loss": 0.5332, "step": 196 }, { "epoch": 0.011338129496402878, "grad_norm": 1.2085461616516113, "learning_rate": 1.944376370237481e-05, "loss": 0.0889, "step": 197 }, { "epoch": 0.01139568345323741, "grad_norm": 4.670867919921875, "learning_rate": 1.943222657947601e-05, "loss": 0.3224, "step": 198 }, { "epoch": 0.011453237410071943, "grad_norm": 10.440308570861816, "learning_rate": 1.942057452787297e-05, "loss": 0.5673, "step": 199 }, { "epoch": 0.011510791366906475, "grad_norm": 2.601593017578125, "learning_rate": 1.9408807689542257e-05, "loss": 0.1987, "step": 200 }, { "epoch": 0.011568345323741007, "grad_norm": 3.630814552307129, "learning_rate": 1.9396926207859085e-05, "loss": 0.1889, "step": 201 }, { "epoch": 0.011625899280575539, "grad_norm": 5.2653632164001465, "learning_rate": 1.938493022759556e-05, "loss": 0.5126, "step": 202 }, { "epoch": 0.011683453237410072, "grad_norm": 5.171159744262695, "learning_rate": 1.937281989491892e-05, "loss": 0.4045, "step": 203 }, { "epoch": 0.011741007194244604, "grad_norm": 4.25119686126709, "learning_rate": 1.9360595357389735e-05, "loss": 0.4102, "step": 204 }, { "epoch": 0.011798561151079136, "grad_norm": 4.690221309661865, "learning_rate": 1.9348256763960146e-05, "loss": 0.1851, "step": 205 }, { "epoch": 0.01185611510791367, "grad_norm": 2.5318408012390137, "learning_rate": 1.9335804264972018e-05, "loss": 0.1637, "step": 206 }, { "epoch": 0.011913669064748202, "grad_norm": 3.1044423580169678, "learning_rate": 1.9323238012155125e-05, "loss": 0.2009, "step": 207 }, { "epoch": 0.011971223021582733, "grad_norm": 3.260160207748413, "learning_rate": 1.9310558158625286e-05, "loss": 0.3784, "step": 208 }, { "epoch": 0.012028776978417267, "grad_norm": 2.753958225250244, "learning_rate": 1.9297764858882516e-05, "loss": 0.2527, "step": 209 }, { "epoch": 0.012086330935251799, "grad_norm": 2.3980395793914795, "learning_rate": 1.9284858268809135e-05, "loss": 0.2148, "step": 210 }, { "epoch": 0.01214388489208633, "grad_norm": 7.618435382843018, "learning_rate": 1.9271838545667876e-05, "loss": 0.4407, "step": 211 }, { "epoch": 0.012201438848920863, "grad_norm": 2.246246099472046, "learning_rate": 1.925870584809995e-05, "loss": 0.2267, "step": 212 }, { "epoch": 0.012258992805755396, "grad_norm": 6.476871490478516, "learning_rate": 1.9245460336123136e-05, "loss": 0.3225, "step": 213 }, { "epoch": 0.012316546762589928, "grad_norm": 3.0804295539855957, "learning_rate": 1.923210217112981e-05, "loss": 0.2715, "step": 214 }, { "epoch": 0.01237410071942446, "grad_norm": 2.8532204627990723, "learning_rate": 1.9218631515885007e-05, "loss": 0.1833, "step": 215 }, { "epoch": 0.012431654676258994, "grad_norm": 5.331664562225342, "learning_rate": 1.9205048534524405e-05, "loss": 0.2848, "step": 216 }, { "epoch": 0.012489208633093525, "grad_norm": 6.048542022705078, "learning_rate": 1.9191353392552346e-05, "loss": 0.5696, "step": 217 }, { "epoch": 0.012546762589928057, "grad_norm": 1.7071490287780762, "learning_rate": 1.9177546256839814e-05, "loss": 0.1342, "step": 218 }, { "epoch": 0.01260431654676259, "grad_norm": 2.181581974029541, "learning_rate": 1.9163627295622397e-05, "loss": 0.123, "step": 219 }, { "epoch": 0.012661870503597123, "grad_norm": 3.1651432514190674, "learning_rate": 1.914959667849825e-05, "loss": 0.1772, "step": 220 }, { "epoch": 0.012719424460431655, "grad_norm": 5.666025638580322, "learning_rate": 1.913545457642601e-05, "loss": 0.3085, "step": 221 }, { "epoch": 0.012776978417266186, "grad_norm": 2.379805088043213, "learning_rate": 1.9121201161722732e-05, "loss": 0.1892, "step": 222 }, { "epoch": 0.01283453237410072, "grad_norm": 2.259615182876587, "learning_rate": 1.910683660806177e-05, "loss": 0.1345, "step": 223 }, { "epoch": 0.012892086330935252, "grad_norm": 5.213752746582031, "learning_rate": 1.9092361090470688e-05, "loss": 0.3298, "step": 224 }, { "epoch": 0.012949640287769784, "grad_norm": 4.78728723526001, "learning_rate": 1.907777478532909e-05, "loss": 0.2825, "step": 225 }, { "epoch": 0.013007194244604317, "grad_norm": 0.8624676465988159, "learning_rate": 1.9063077870366504e-05, "loss": 0.109, "step": 226 }, { "epoch": 0.01306474820143885, "grad_norm": 1.283377766609192, "learning_rate": 1.9048270524660197e-05, "loss": 0.1188, "step": 227 }, { "epoch": 0.013122302158273381, "grad_norm": 9.221684455871582, "learning_rate": 1.903335292863301e-05, "loss": 0.5137, "step": 228 }, { "epoch": 0.013179856115107913, "grad_norm": 11.753008842468262, "learning_rate": 1.901832526405114e-05, "loss": 0.4472, "step": 229 }, { "epoch": 0.013237410071942447, "grad_norm": 1.6660693883895874, "learning_rate": 1.9003187714021936e-05, "loss": 0.1548, "step": 230 }, { "epoch": 0.013294964028776978, "grad_norm": 0.9586846232414246, "learning_rate": 1.8987940462991673e-05, "loss": 0.0959, "step": 231 }, { "epoch": 0.01335251798561151, "grad_norm": 0.9193116426467896, "learning_rate": 1.8972583696743284e-05, "loss": 0.0977, "step": 232 }, { "epoch": 0.013410071942446044, "grad_norm": 0.9497302174568176, "learning_rate": 1.895711760239413e-05, "loss": 0.0993, "step": 233 }, { "epoch": 0.013467625899280576, "grad_norm": 2.8770296573638916, "learning_rate": 1.8941542368393683e-05, "loss": 0.1175, "step": 234 }, { "epoch": 0.013525179856115108, "grad_norm": 5.624111175537109, "learning_rate": 1.892585818452126e-05, "loss": 0.2421, "step": 235 }, { "epoch": 0.01358273381294964, "grad_norm": 1.8553048372268677, "learning_rate": 1.891006524188368e-05, "loss": 0.1538, "step": 236 }, { "epoch": 0.013640287769784173, "grad_norm": 6.076197624206543, "learning_rate": 1.889416373291298e-05, "loss": 0.2427, "step": 237 }, { "epoch": 0.013697841726618705, "grad_norm": 3.465892791748047, "learning_rate": 1.8878153851364013e-05, "loss": 0.1723, "step": 238 }, { "epoch": 0.013755395683453237, "grad_norm": 15.277708053588867, "learning_rate": 1.8862035792312148e-05, "loss": 0.5853, "step": 239 }, { "epoch": 0.01381294964028777, "grad_norm": 4.846475124359131, "learning_rate": 1.884580975215084e-05, "loss": 0.2031, "step": 240 }, { "epoch": 0.013870503597122302, "grad_norm": 3.0435233116149902, "learning_rate": 1.8829475928589272e-05, "loss": 0.2053, "step": 241 }, { "epoch": 0.013928057553956834, "grad_norm": 6.824376583099365, "learning_rate": 1.8813034520649923e-05, "loss": 0.1688, "step": 242 }, { "epoch": 0.013985611510791368, "grad_norm": 1.9684380292892456, "learning_rate": 1.879648572866617e-05, "loss": 0.1105, "step": 243 }, { "epoch": 0.0140431654676259, "grad_norm": 7.668671131134033, "learning_rate": 1.8779829754279806e-05, "loss": 0.4672, "step": 244 }, { "epoch": 0.014100719424460431, "grad_norm": 7.829854488372803, "learning_rate": 1.8763066800438638e-05, "loss": 0.229, "step": 245 }, { "epoch": 0.014158273381294963, "grad_norm": 2.082008123397827, "learning_rate": 1.874619707139396e-05, "loss": 0.1406, "step": 246 }, { "epoch": 0.014215827338129497, "grad_norm": 6.171285629272461, "learning_rate": 1.8729220772698096e-05, "loss": 0.1633, "step": 247 }, { "epoch": 0.014273381294964029, "grad_norm": 2.4114866256713867, "learning_rate": 1.8712138111201898e-05, "loss": 0.1011, "step": 248 }, { "epoch": 0.01433093525179856, "grad_norm": 6.919300079345703, "learning_rate": 1.869494929505219e-05, "loss": 0.2706, "step": 249 }, { "epoch": 0.014388489208633094, "grad_norm": 2.967616558074951, "learning_rate": 1.8677654533689287e-05, "loss": 0.1721, "step": 250 }, { "epoch": 0.014446043165467626, "grad_norm": 19.468122482299805, "learning_rate": 1.866025403784439e-05, "loss": 0.7281, "step": 251 }, { "epoch": 0.014503597122302158, "grad_norm": 5.602455139160156, "learning_rate": 1.864274801953705e-05, "loss": 0.108, "step": 252 }, { "epoch": 0.01456115107913669, "grad_norm": 16.94603729248047, "learning_rate": 1.8625136692072577e-05, "loss": 0.4322, "step": 253 }, { "epoch": 0.014618705035971223, "grad_norm": 5.4511823654174805, "learning_rate": 1.860742027003944e-05, "loss": 0.2628, "step": 254 }, { "epoch": 0.014676258992805755, "grad_norm": 3.1577935218811035, "learning_rate": 1.8589598969306646e-05, "loss": 0.1408, "step": 255 }, { "epoch": 0.014733812949640287, "grad_norm": 0.9684641361236572, "learning_rate": 1.8571673007021124e-05, "loss": 0.0771, "step": 256 }, { "epoch": 0.01479136690647482, "grad_norm": 10.628517150878906, "learning_rate": 1.855364260160507e-05, "loss": 0.4681, "step": 257 }, { "epoch": 0.014848920863309353, "grad_norm": 1.4679758548736572, "learning_rate": 1.8535507972753275e-05, "loss": 0.1161, "step": 258 }, { "epoch": 0.014906474820143885, "grad_norm": 2.098684549331665, "learning_rate": 1.851726934143048e-05, "loss": 0.1365, "step": 259 }, { "epoch": 0.014964028776978418, "grad_norm": 1.1517246961593628, "learning_rate": 1.849892692986864e-05, "loss": 0.1261, "step": 260 }, { "epoch": 0.01502158273381295, "grad_norm": 11.601617813110352, "learning_rate": 1.848048096156426e-05, "loss": 0.5597, "step": 261 }, { "epoch": 0.015079136690647482, "grad_norm": 9.081189155578613, "learning_rate": 1.8461931661275642e-05, "loss": 0.3012, "step": 262 }, { "epoch": 0.015136690647482014, "grad_norm": 1.1501121520996094, "learning_rate": 1.8443279255020153e-05, "loss": 0.1101, "step": 263 }, { "epoch": 0.015194244604316547, "grad_norm": 1.6358212232589722, "learning_rate": 1.842452397007148e-05, "loss": 0.0907, "step": 264 }, { "epoch": 0.01525179856115108, "grad_norm": 1.1782243251800537, "learning_rate": 1.8405666034956842e-05, "loss": 0.0821, "step": 265 }, { "epoch": 0.015309352517985611, "grad_norm": 3.5769565105438232, "learning_rate": 1.8386705679454243e-05, "loss": 0.1347, "step": 266 }, { "epoch": 0.015366906474820145, "grad_norm": 9.618369102478027, "learning_rate": 1.836764313458962e-05, "loss": 0.2999, "step": 267 }, { "epoch": 0.015424460431654676, "grad_norm": 3.8770649433135986, "learning_rate": 1.8348478632634067e-05, "loss": 0.1558, "step": 268 }, { "epoch": 0.015482014388489208, "grad_norm": 5.656462669372559, "learning_rate": 1.8329212407100996e-05, "loss": 0.2134, "step": 269 }, { "epoch": 0.01553956834532374, "grad_norm": 7.180418491363525, "learning_rate": 1.8309844692743283e-05, "loss": 0.2609, "step": 270 }, { "epoch": 0.015597122302158274, "grad_norm": 1.3020349740982056, "learning_rate": 1.8290375725550417e-05, "loss": 0.0988, "step": 271 }, { "epoch": 0.015654676258992806, "grad_norm": 5.860103607177734, "learning_rate": 1.827080574274562e-05, "loss": 0.2128, "step": 272 }, { "epoch": 0.01571223021582734, "grad_norm": 10.250383377075195, "learning_rate": 1.8251134982782952e-05, "loss": 0.2957, "step": 273 }, { "epoch": 0.01576978417266187, "grad_norm": 0.8903178572654724, "learning_rate": 1.8231363685344422e-05, "loss": 0.1002, "step": 274 }, { "epoch": 0.015827338129496403, "grad_norm": 9.552652359008789, "learning_rate": 1.821149209133704e-05, "loss": 0.2226, "step": 275 }, { "epoch": 0.015884892086330937, "grad_norm": 8.22533893585205, "learning_rate": 1.819152044288992e-05, "loss": 0.4168, "step": 276 }, { "epoch": 0.015942446043165467, "grad_norm": 3.7411088943481445, "learning_rate": 1.8171448983351284e-05, "loss": 0.133, "step": 277 }, { "epoch": 0.016, "grad_norm": 6.830077171325684, "learning_rate": 1.815127795728554e-05, "loss": 0.1879, "step": 278 }, { "epoch": 0.016057553956834534, "grad_norm": 6.609959125518799, "learning_rate": 1.8131007610470278e-05, "loss": 0.3989, "step": 279 }, { "epoch": 0.016115107913669064, "grad_norm": 8.54151725769043, "learning_rate": 1.8110638189893267e-05, "loss": 0.2935, "step": 280 }, { "epoch": 0.016172661870503598, "grad_norm": 2.277355909347534, "learning_rate": 1.8090169943749477e-05, "loss": 0.0824, "step": 281 }, { "epoch": 0.016230215827338128, "grad_norm": 0.6691151857376099, "learning_rate": 1.806960312143802e-05, "loss": 0.0981, "step": 282 }, { "epoch": 0.01628776978417266, "grad_norm": 1.0405458211898804, "learning_rate": 1.804893797355914e-05, "loss": 0.1054, "step": 283 }, { "epoch": 0.016345323741007195, "grad_norm": 3.0072195529937744, "learning_rate": 1.8028174751911147e-05, "loss": 0.1007, "step": 284 }, { "epoch": 0.016402877697841725, "grad_norm": 5.276819229125977, "learning_rate": 1.8007313709487334e-05, "loss": 0.2197, "step": 285 }, { "epoch": 0.01646043165467626, "grad_norm": 0.3860107362270355, "learning_rate": 1.798635510047293e-05, "loss": 0.0449, "step": 286 }, { "epoch": 0.016517985611510792, "grad_norm": 7.217343330383301, "learning_rate": 1.7965299180241963e-05, "loss": 0.2553, "step": 287 }, { "epoch": 0.016575539568345322, "grad_norm": 6.964923858642578, "learning_rate": 1.7944146205354182e-05, "loss": 0.3295, "step": 288 }, { "epoch": 0.016633093525179856, "grad_norm": 6.25247859954834, "learning_rate": 1.792289643355191e-05, "loss": 0.2184, "step": 289 }, { "epoch": 0.01669064748201439, "grad_norm": 2.1732687950134277, "learning_rate": 1.7901550123756906e-05, "loss": 0.0992, "step": 290 }, { "epoch": 0.01674820143884892, "grad_norm": 4.60042142868042, "learning_rate": 1.788010753606722e-05, "loss": 0.241, "step": 291 }, { "epoch": 0.016805755395683453, "grad_norm": 1.7903494834899902, "learning_rate": 1.785856893175402e-05, "loss": 0.1072, "step": 292 }, { "epoch": 0.016863309352517987, "grad_norm": 9.696785926818848, "learning_rate": 1.78369345732584e-05, "loss": 0.2978, "step": 293 }, { "epoch": 0.016920863309352517, "grad_norm": 9.798710823059082, "learning_rate": 1.781520472418819e-05, "loss": 0.3003, "step": 294 }, { "epoch": 0.01697841726618705, "grad_norm": 1.50823175907135, "learning_rate": 1.7793379649314743e-05, "loss": 0.1167, "step": 295 }, { "epoch": 0.017035971223021584, "grad_norm": 15.534778594970703, "learning_rate": 1.777145961456971e-05, "loss": 0.3782, "step": 296 }, { "epoch": 0.017093525179856114, "grad_norm": 1.2554796934127808, "learning_rate": 1.7749444887041797e-05, "loss": 0.0898, "step": 297 }, { "epoch": 0.017151079136690648, "grad_norm": 4.486696720123291, "learning_rate": 1.7727335734973512e-05, "loss": 0.1792, "step": 298 }, { "epoch": 0.017208633093525178, "grad_norm": 4.518349647521973, "learning_rate": 1.7705132427757895e-05, "loss": 0.2248, "step": 299 }, { "epoch": 0.017266187050359712, "grad_norm": 11.271574020385742, "learning_rate": 1.7682835235935236e-05, "loss": 0.7558, "step": 300 }, { "epoch": 0.017323741007194245, "grad_norm": 7.780160427093506, "learning_rate": 1.766044443118978e-05, "loss": 0.2102, "step": 301 }, { "epoch": 0.017381294964028775, "grad_norm": 4.762843132019043, "learning_rate": 1.7637960286346423e-05, "loss": 0.2039, "step": 302 }, { "epoch": 0.01743884892086331, "grad_norm": 2.5707688331604004, "learning_rate": 1.761538307536737e-05, "loss": 0.1405, "step": 303 }, { "epoch": 0.017496402877697843, "grad_norm": 1.7889554500579834, "learning_rate": 1.759271307334881e-05, "loss": 0.1155, "step": 304 }, { "epoch": 0.017553956834532373, "grad_norm": 4.67526912689209, "learning_rate": 1.7569950556517566e-05, "loss": 0.2787, "step": 305 }, { "epoch": 0.017611510791366906, "grad_norm": 3.5515453815460205, "learning_rate": 1.7547095802227723e-05, "loss": 0.1925, "step": 306 }, { "epoch": 0.01766906474820144, "grad_norm": 5.906973838806152, "learning_rate": 1.7524149088957244e-05, "loss": 0.251, "step": 307 }, { "epoch": 0.01772661870503597, "grad_norm": 11.664872169494629, "learning_rate": 1.7501110696304598e-05, "loss": 0.9332, "step": 308 }, { "epoch": 0.017784172661870504, "grad_norm": 6.628945827484131, "learning_rate": 1.747798090498532e-05, "loss": 0.294, "step": 309 }, { "epoch": 0.017841726618705037, "grad_norm": 1.0758421421051025, "learning_rate": 1.7454759996828622e-05, "loss": 0.1241, "step": 310 }, { "epoch": 0.017899280575539567, "grad_norm": 5.263257026672363, "learning_rate": 1.7431448254773943e-05, "loss": 0.2367, "step": 311 }, { "epoch": 0.0179568345323741, "grad_norm": 1.0564324855804443, "learning_rate": 1.74080459628675e-05, "loss": 0.1437, "step": 312 }, { "epoch": 0.018014388489208635, "grad_norm": 7.245335578918457, "learning_rate": 1.7384553406258842e-05, "loss": 0.2376, "step": 313 }, { "epoch": 0.018071942446043165, "grad_norm": 1.8886218070983887, "learning_rate": 1.7360970871197347e-05, "loss": 0.1031, "step": 314 }, { "epoch": 0.0181294964028777, "grad_norm": 16.046236038208008, "learning_rate": 1.7337298645028764e-05, "loss": 0.6099, "step": 315 }, { "epoch": 0.01818705035971223, "grad_norm": 3.0381016731262207, "learning_rate": 1.7313537016191706e-05, "loss": 0.13, "step": 316 }, { "epoch": 0.018244604316546762, "grad_norm": 4.647341251373291, "learning_rate": 1.7289686274214116e-05, "loss": 0.1707, "step": 317 }, { "epoch": 0.018302158273381296, "grad_norm": 7.225495338439941, "learning_rate": 1.7265746709709762e-05, "loss": 0.3956, "step": 318 }, { "epoch": 0.018359712230215826, "grad_norm": 4.374240875244141, "learning_rate": 1.7241718614374678e-05, "loss": 0.2331, "step": 319 }, { "epoch": 0.01841726618705036, "grad_norm": 13.612607955932617, "learning_rate": 1.7217602280983622e-05, "loss": 0.645, "step": 320 }, { "epoch": 0.018474820143884893, "grad_norm": 4.012824535369873, "learning_rate": 1.7193398003386514e-05, "loss": 0.2226, "step": 321 }, { "epoch": 0.018532374100719423, "grad_norm": 1.8649362325668335, "learning_rate": 1.716910607650483e-05, "loss": 0.1594, "step": 322 }, { "epoch": 0.018589928057553957, "grad_norm": 5.170797824859619, "learning_rate": 1.7144726796328034e-05, "loss": 0.4557, "step": 323 }, { "epoch": 0.01864748201438849, "grad_norm": 7.867353916168213, "learning_rate": 1.712026045990997e-05, "loss": 0.325, "step": 324 }, { "epoch": 0.01870503597122302, "grad_norm": 4.790438175201416, "learning_rate": 1.709570736536521e-05, "loss": 0.2865, "step": 325 }, { "epoch": 0.018762589928057554, "grad_norm": 2.507946252822876, "learning_rate": 1.7071067811865477e-05, "loss": 0.104, "step": 326 }, { "epoch": 0.018820143884892088, "grad_norm": 3.4884064197540283, "learning_rate": 1.7046342099635948e-05, "loss": 0.1544, "step": 327 }, { "epoch": 0.018877697841726618, "grad_norm": 4.94096565246582, "learning_rate": 1.7021530529951627e-05, "loss": 0.2202, "step": 328 }, { "epoch": 0.01893525179856115, "grad_norm": 2.591435432434082, "learning_rate": 1.6996633405133656e-05, "loss": 0.1226, "step": 329 }, { "epoch": 0.018992805755395685, "grad_norm": 1.8974847793579102, "learning_rate": 1.697165102854565e-05, "loss": 0.1559, "step": 330 }, { "epoch": 0.019050359712230215, "grad_norm": 1.4150128364562988, "learning_rate": 1.6946583704589973e-05, "loss": 0.1162, "step": 331 }, { "epoch": 0.01910791366906475, "grad_norm": 3.744464874267578, "learning_rate": 1.692143173870407e-05, "loss": 0.2291, "step": 332 }, { "epoch": 0.01916546762589928, "grad_norm": 0.9393559694290161, "learning_rate": 1.68961954373567e-05, "loss": 0.1093, "step": 333 }, { "epoch": 0.019223021582733812, "grad_norm": 4.989112377166748, "learning_rate": 1.6870875108044233e-05, "loss": 0.2268, "step": 334 }, { "epoch": 0.019280575539568346, "grad_norm": 3.0673670768737793, "learning_rate": 1.684547105928689e-05, "loss": 0.1281, "step": 335 }, { "epoch": 0.019338129496402876, "grad_norm": 7.947831153869629, "learning_rate": 1.6819983600624986e-05, "loss": 0.665, "step": 336 }, { "epoch": 0.01939568345323741, "grad_norm": 6.416712284088135, "learning_rate": 1.6794413042615168e-05, "loss": 0.2137, "step": 337 }, { "epoch": 0.019453237410071943, "grad_norm": 16.095256805419922, "learning_rate": 1.6768759696826608e-05, "loss": 0.3756, "step": 338 }, { "epoch": 0.019510791366906474, "grad_norm": 2.4824581146240234, "learning_rate": 1.6743023875837233e-05, "loss": 0.117, "step": 339 }, { "epoch": 0.019568345323741007, "grad_norm": 5.838296890258789, "learning_rate": 1.6717205893229904e-05, "loss": 0.3567, "step": 340 }, { "epoch": 0.01962589928057554, "grad_norm": 5.100681781768799, "learning_rate": 1.6691306063588583e-05, "loss": 0.2389, "step": 341 }, { "epoch": 0.01968345323741007, "grad_norm": 7.527962684631348, "learning_rate": 1.6665324702494524e-05, "loss": 0.4136, "step": 342 }, { "epoch": 0.019741007194244604, "grad_norm": 2.3168222904205322, "learning_rate": 1.6639262126522417e-05, "loss": 0.158, "step": 343 }, { "epoch": 0.019798561151079138, "grad_norm": 0.8575424551963806, "learning_rate": 1.661311865323652e-05, "loss": 0.0539, "step": 344 }, { "epoch": 0.019856115107913668, "grad_norm": 3.8268239498138428, "learning_rate": 1.6586894601186804e-05, "loss": 0.2135, "step": 345 }, { "epoch": 0.019913669064748202, "grad_norm": 2.2744338512420654, "learning_rate": 1.6560590289905074e-05, "loss": 0.1278, "step": 346 }, { "epoch": 0.019971223021582735, "grad_norm": 5.595208168029785, "learning_rate": 1.6534206039901057e-05, "loss": 0.2693, "step": 347 }, { "epoch": 0.020028776978417265, "grad_norm": 2.456190347671509, "learning_rate": 1.650774217265851e-05, "loss": 0.1437, "step": 348 }, { "epoch": 0.0200863309352518, "grad_norm": 10.795378684997559, "learning_rate": 1.6481199010631312e-05, "loss": 0.7296, "step": 349 }, { "epoch": 0.02014388489208633, "grad_norm": 0.6766440868377686, "learning_rate": 1.645457687723951e-05, "loss": 0.104, "step": 350 }, { "epoch": 0.020201438848920863, "grad_norm": 2.074960708618164, "learning_rate": 1.6427876096865394e-05, "loss": 0.1433, "step": 351 }, { "epoch": 0.020258992805755396, "grad_norm": 3.726234197616577, "learning_rate": 1.6401096994849558e-05, "loss": 0.1568, "step": 352 }, { "epoch": 0.020316546762589927, "grad_norm": 1.0608885288238525, "learning_rate": 1.63742398974869e-05, "loss": 0.095, "step": 353 }, { "epoch": 0.02037410071942446, "grad_norm": 3.429298162460327, "learning_rate": 1.6347305132022677e-05, "loss": 0.1117, "step": 354 }, { "epoch": 0.020431654676258994, "grad_norm": 5.276256561279297, "learning_rate": 1.632029302664851e-05, "loss": 0.4022, "step": 355 }, { "epoch": 0.020489208633093524, "grad_norm": 6.028234481811523, "learning_rate": 1.6293203910498375e-05, "loss": 0.1726, "step": 356 }, { "epoch": 0.020546762589928057, "grad_norm": 7.822464942932129, "learning_rate": 1.6266038113644605e-05, "loss": 0.3716, "step": 357 }, { "epoch": 0.02060431654676259, "grad_norm": 9.866429328918457, "learning_rate": 1.6238795967093865e-05, "loss": 0.383, "step": 358 }, { "epoch": 0.02066187050359712, "grad_norm": 2.492753505706787, "learning_rate": 1.6211477802783105e-05, "loss": 0.1362, "step": 359 }, { "epoch": 0.020719424460431655, "grad_norm": 5.538491249084473, "learning_rate": 1.6184083953575543e-05, "loss": 0.2055, "step": 360 }, { "epoch": 0.02077697841726619, "grad_norm": 1.3676029443740845, "learning_rate": 1.6156614753256583e-05, "loss": 0.0914, "step": 361 }, { "epoch": 0.02083453237410072, "grad_norm": 5.788948059082031, "learning_rate": 1.6129070536529767e-05, "loss": 0.195, "step": 362 }, { "epoch": 0.020892086330935252, "grad_norm": 1.366302251815796, "learning_rate": 1.610145163901268e-05, "loss": 0.112, "step": 363 }, { "epoch": 0.020949640287769786, "grad_norm": 2.1532888412475586, "learning_rate": 1.607375839723287e-05, "loss": 0.1321, "step": 364 }, { "epoch": 0.021007194244604316, "grad_norm": 2.4224016666412354, "learning_rate": 1.6045991148623752e-05, "loss": 0.1307, "step": 365 }, { "epoch": 0.02106474820143885, "grad_norm": 2.301203489303589, "learning_rate": 1.6018150231520486e-05, "loss": 0.1129, "step": 366 }, { "epoch": 0.02112230215827338, "grad_norm": 2.732968807220459, "learning_rate": 1.599023598515586e-05, "loss": 0.1113, "step": 367 }, { "epoch": 0.021179856115107913, "grad_norm": 4.585814952850342, "learning_rate": 1.5962248749656158e-05, "loss": 0.2737, "step": 368 }, { "epoch": 0.021237410071942447, "grad_norm": 3.1472392082214355, "learning_rate": 1.5934188866037017e-05, "loss": 0.1172, "step": 369 }, { "epoch": 0.021294964028776977, "grad_norm": 0.760041356086731, "learning_rate": 1.5906056676199256e-05, "loss": 0.0888, "step": 370 }, { "epoch": 0.02135251798561151, "grad_norm": 1.1722584962844849, "learning_rate": 1.5877852522924733e-05, "loss": 0.0718, "step": 371 }, { "epoch": 0.021410071942446044, "grad_norm": 3.714165449142456, "learning_rate": 1.584957674987216e-05, "loss": 0.2098, "step": 372 }, { "epoch": 0.021467625899280574, "grad_norm": 13.37151050567627, "learning_rate": 1.5821229701572897e-05, "loss": 0.6504, "step": 373 }, { "epoch": 0.021525179856115108, "grad_norm": 7.64274787902832, "learning_rate": 1.5792811723426787e-05, "loss": 0.1673, "step": 374 }, { "epoch": 0.02158273381294964, "grad_norm": 5.333349227905273, "learning_rate": 1.5764323161697933e-05, "loss": 0.1838, "step": 375 }, { "epoch": 0.02164028776978417, "grad_norm": 5.268608570098877, "learning_rate": 1.573576436351046e-05, "loss": 0.201, "step": 376 }, { "epoch": 0.021697841726618705, "grad_norm": 1.2459615468978882, "learning_rate": 1.570713567684432e-05, "loss": 0.0831, "step": 377 }, { "epoch": 0.02175539568345324, "grad_norm": 0.3400329053401947, "learning_rate": 1.5678437450531014e-05, "loss": 0.0705, "step": 378 }, { "epoch": 0.02181294964028777, "grad_norm": 3.784626007080078, "learning_rate": 1.564967003424938e-05, "loss": 0.1281, "step": 379 }, { "epoch": 0.021870503597122302, "grad_norm": 2.0622975826263428, "learning_rate": 1.5620833778521306e-05, "loss": 0.1074, "step": 380 }, { "epoch": 0.021928057553956836, "grad_norm": 2.558227777481079, "learning_rate": 1.5591929034707468e-05, "loss": 0.1039, "step": 381 }, { "epoch": 0.021985611510791366, "grad_norm": 1.8183974027633667, "learning_rate": 1.556295615500305e-05, "loss": 0.1491, "step": 382 }, { "epoch": 0.0220431654676259, "grad_norm": 3.397756814956665, "learning_rate": 1.553391549243344e-05, "loss": 0.0963, "step": 383 }, { "epoch": 0.022100719424460433, "grad_norm": 1.9542852640151978, "learning_rate": 1.5504807400849957e-05, "loss": 0.1328, "step": 384 }, { "epoch": 0.022158273381294964, "grad_norm": 1.1182966232299805, "learning_rate": 1.5475632234925505e-05, "loss": 0.0329, "step": 385 }, { "epoch": 0.022215827338129497, "grad_norm": 6.058446407318115, "learning_rate": 1.5446390350150272e-05, "loss": 0.2127, "step": 386 }, { "epoch": 0.022273381294964027, "grad_norm": 4.640816688537598, "learning_rate": 1.54170821028274e-05, "loss": 0.1501, "step": 387 }, { "epoch": 0.02233093525179856, "grad_norm": 12.117772102355957, "learning_rate": 1.5387707850068633e-05, "loss": 0.4365, "step": 388 }, { "epoch": 0.022388489208633094, "grad_norm": 3.080798625946045, "learning_rate": 1.5358267949789968e-05, "loss": 0.1568, "step": 389 }, { "epoch": 0.022446043165467625, "grad_norm": 9.55615520477295, "learning_rate": 1.53287627607073e-05, "loss": 0.2771, "step": 390 }, { "epoch": 0.022503597122302158, "grad_norm": 0.7347004413604736, "learning_rate": 1.529919264233205e-05, "loss": 0.078, "step": 391 }, { "epoch": 0.022561151079136692, "grad_norm": 2.543454647064209, "learning_rate": 1.5269557954966777e-05, "loss": 0.0838, "step": 392 }, { "epoch": 0.022618705035971222, "grad_norm": 0.8127100467681885, "learning_rate": 1.5239859059700794e-05, "loss": 0.05, "step": 393 }, { "epoch": 0.022676258992805755, "grad_norm": 15.900167465209961, "learning_rate": 1.5210096318405768e-05, "loss": 0.313, "step": 394 }, { "epoch": 0.02273381294964029, "grad_norm": 3.528336524963379, "learning_rate": 1.5180270093731305e-05, "loss": 0.0843, "step": 395 }, { "epoch": 0.02279136690647482, "grad_norm": 0.9315251708030701, "learning_rate": 1.5150380749100545e-05, "loss": 0.0912, "step": 396 }, { "epoch": 0.022848920863309353, "grad_norm": 13.889352798461914, "learning_rate": 1.5120428648705716e-05, "loss": 0.3968, "step": 397 }, { "epoch": 0.022906474820143886, "grad_norm": 4.333874225616455, "learning_rate": 1.5090414157503715e-05, "loss": 0.1176, "step": 398 }, { "epoch": 0.022964028776978417, "grad_norm": 1.4791545867919922, "learning_rate": 1.5060337641211637e-05, "loss": 0.1132, "step": 399 }, { "epoch": 0.02302158273381295, "grad_norm": 17.43948745727539, "learning_rate": 1.5030199466302354e-05, "loss": 0.3854, "step": 400 }, { "epoch": 0.023079136690647484, "grad_norm": 1.4201478958129883, "learning_rate": 1.5000000000000002e-05, "loss": 0.0633, "step": 401 }, { "epoch": 0.023136690647482014, "grad_norm": 6.711745738983154, "learning_rate": 1.4969739610275556e-05, "loss": 0.1732, "step": 402 }, { "epoch": 0.023194244604316547, "grad_norm": 1.3818823099136353, "learning_rate": 1.493941866584231e-05, "loss": 0.0769, "step": 403 }, { "epoch": 0.023251798561151078, "grad_norm": 11.818349838256836, "learning_rate": 1.490903753615141e-05, "loss": 0.6309, "step": 404 }, { "epoch": 0.02330935251798561, "grad_norm": 6.571465969085693, "learning_rate": 1.4878596591387329e-05, "loss": 0.1149, "step": 405 }, { "epoch": 0.023366906474820145, "grad_norm": 8.349808692932129, "learning_rate": 1.4848096202463373e-05, "loss": 0.1896, "step": 406 }, { "epoch": 0.023424460431654675, "grad_norm": 2.857787609100342, "learning_rate": 1.4817536741017153e-05, "loss": 0.0704, "step": 407 }, { "epoch": 0.02348201438848921, "grad_norm": 8.498452186584473, "learning_rate": 1.478691857940607e-05, "loss": 0.2247, "step": 408 }, { "epoch": 0.023539568345323742, "grad_norm": 0.468039333820343, "learning_rate": 1.4756242090702756e-05, "loss": 0.0615, "step": 409 }, { "epoch": 0.023597122302158272, "grad_norm": 0.5978952646255493, "learning_rate": 1.4725507648690542e-05, "loss": 0.0501, "step": 410 }, { "epoch": 0.023654676258992806, "grad_norm": 11.912870407104492, "learning_rate": 1.469471562785891e-05, "loss": 0.3164, "step": 411 }, { "epoch": 0.02371223021582734, "grad_norm": 8.567428588867188, "learning_rate": 1.4663866403398915e-05, "loss": 0.2484, "step": 412 }, { "epoch": 0.02376978417266187, "grad_norm": 5.721550941467285, "learning_rate": 1.463296035119862e-05, "loss": 0.1515, "step": 413 }, { "epoch": 0.023827338129496403, "grad_norm": 11.112815856933594, "learning_rate": 1.4601997847838518e-05, "loss": 0.218, "step": 414 }, { "epoch": 0.023884892086330937, "grad_norm": 0.5129345655441284, "learning_rate": 1.4570979270586944e-05, "loss": 0.0569, "step": 415 }, { "epoch": 0.023942446043165467, "grad_norm": 7.7596259117126465, "learning_rate": 1.4539904997395468e-05, "loss": 0.1584, "step": 416 }, { "epoch": 0.024, "grad_norm": 6.294882774353027, "learning_rate": 1.4508775406894308e-05, "loss": 0.0663, "step": 417 }, { "epoch": 0.024057553956834534, "grad_norm": 10.160890579223633, "learning_rate": 1.4477590878387697e-05, "loss": 0.3199, "step": 418 }, { "epoch": 0.024115107913669064, "grad_norm": 0.9614742398262024, "learning_rate": 1.4446351791849276e-05, "loss": 0.077, "step": 419 }, { "epoch": 0.024172661870503598, "grad_norm": 3.362921953201294, "learning_rate": 1.4415058527917454e-05, "loss": 0.1266, "step": 420 }, { "epoch": 0.024230215827338128, "grad_norm": 8.087349891662598, "learning_rate": 1.4383711467890776e-05, "loss": 0.3689, "step": 421 }, { "epoch": 0.02428776978417266, "grad_norm": 8.182154655456543, "learning_rate": 1.4352310993723277e-05, "loss": 0.1767, "step": 422 }, { "epoch": 0.024345323741007195, "grad_norm": 1.1635339260101318, "learning_rate": 1.4320857488019826e-05, "loss": 0.1257, "step": 423 }, { "epoch": 0.024402877697841725, "grad_norm": 3.1351680755615234, "learning_rate": 1.4289351334031461e-05, "loss": 0.1368, "step": 424 }, { "epoch": 0.02446043165467626, "grad_norm": 8.145259857177734, "learning_rate": 1.4257792915650728e-05, "loss": 0.2471, "step": 425 }, { "epoch": 0.024517985611510792, "grad_norm": 0.6130021810531616, "learning_rate": 1.4226182617406996e-05, "loss": 0.0769, "step": 426 }, { "epoch": 0.024575539568345323, "grad_norm": 30.53700065612793, "learning_rate": 1.4194520824461773e-05, "loss": 0.2786, "step": 427 }, { "epoch": 0.024633093525179856, "grad_norm": 10.179473876953125, "learning_rate": 1.4162807922604014e-05, "loss": 0.22, "step": 428 }, { "epoch": 0.02469064748201439, "grad_norm": 20.345251083374023, "learning_rate": 1.413104429824542e-05, "loss": 1.2058, "step": 429 }, { "epoch": 0.02474820143884892, "grad_norm": 17.933361053466797, "learning_rate": 1.4099230338415728e-05, "loss": 0.5017, "step": 430 }, { "epoch": 0.024805755395683454, "grad_norm": 16.122068405151367, "learning_rate": 1.4067366430758004e-05, "loss": 0.4488, "step": 431 }, { "epoch": 0.024863309352517987, "grad_norm": 3.910707712173462, "learning_rate": 1.4035452963523903e-05, "loss": 0.1475, "step": 432 }, { "epoch": 0.024920863309352517, "grad_norm": 3.197533130645752, "learning_rate": 1.4003490325568953e-05, "loss": 0.1384, "step": 433 }, { "epoch": 0.02497841726618705, "grad_norm": 1.7127132415771484, "learning_rate": 1.3971478906347806e-05, "loss": 0.0953, "step": 434 }, { "epoch": 0.025035971223021584, "grad_norm": 0.6795534491539001, "learning_rate": 1.3939419095909513e-05, "loss": 0.084, "step": 435 }, { "epoch": 0.025093525179856115, "grad_norm": 0.7117164731025696, "learning_rate": 1.3907311284892737e-05, "loss": 0.1028, "step": 436 }, { "epoch": 0.025151079136690648, "grad_norm": 5.165547847747803, "learning_rate": 1.3875155864521031e-05, "loss": 0.1473, "step": 437 }, { "epoch": 0.02520863309352518, "grad_norm": 1.277207374572754, "learning_rate": 1.3842953226598036e-05, "loss": 0.1054, "step": 438 }, { "epoch": 0.025266187050359712, "grad_norm": 6.697636604309082, "learning_rate": 1.3810703763502744e-05, "loss": 0.2188, "step": 439 }, { "epoch": 0.025323741007194246, "grad_norm": 0.8036256432533264, "learning_rate": 1.3778407868184674e-05, "loss": 0.0717, "step": 440 }, { "epoch": 0.025381294964028776, "grad_norm": 14.702827453613281, "learning_rate": 1.3746065934159123e-05, "loss": 0.5481, "step": 441 }, { "epoch": 0.02543884892086331, "grad_norm": 0.643853485584259, "learning_rate": 1.371367835550235e-05, "loss": 0.0649, "step": 442 }, { "epoch": 0.025496402877697843, "grad_norm": 6.889122009277344, "learning_rate": 1.3681245526846782e-05, "loss": 0.1057, "step": 443 }, { "epoch": 0.025553956834532373, "grad_norm": 3.3967247009277344, "learning_rate": 1.3648767843376196e-05, "loss": 0.1841, "step": 444 }, { "epoch": 0.025611510791366907, "grad_norm": 4.166593074798584, "learning_rate": 1.3616245700820922e-05, "loss": 0.1322, "step": 445 }, { "epoch": 0.02566906474820144, "grad_norm": 0.9764627814292908, "learning_rate": 1.3583679495453e-05, "loss": 0.0708, "step": 446 }, { "epoch": 0.02572661870503597, "grad_norm": 8.78773307800293, "learning_rate": 1.3551069624081372e-05, "loss": 0.2444, "step": 447 }, { "epoch": 0.025784172661870504, "grad_norm": 0.8957186341285706, "learning_rate": 1.3518416484047018e-05, "loss": 0.0952, "step": 448 }, { "epoch": 0.025841726618705037, "grad_norm": 2.754924774169922, "learning_rate": 1.3485720473218153e-05, "loss": 0.1718, "step": 449 }, { "epoch": 0.025899280575539568, "grad_norm": 9.26056957244873, "learning_rate": 1.3452981989985347e-05, "loss": 0.1871, "step": 450 }, { "epoch": 0.0259568345323741, "grad_norm": 2.4848737716674805, "learning_rate": 1.342020143325669e-05, "loss": 0.0396, "step": 451 }, { "epoch": 0.026014388489208635, "grad_norm": 3.2999818325042725, "learning_rate": 1.3387379202452917e-05, "loss": 0.1324, "step": 452 }, { "epoch": 0.026071942446043165, "grad_norm": 13.430712699890137, "learning_rate": 1.3354515697502552e-05, "loss": 0.4404, "step": 453 }, { "epoch": 0.0261294964028777, "grad_norm": 0.6919059753417969, "learning_rate": 1.3321611318837033e-05, "loss": 0.0747, "step": 454 }, { "epoch": 0.02618705035971223, "grad_norm": 1.3426858186721802, "learning_rate": 1.3288666467385834e-05, "loss": 0.1142, "step": 455 }, { "epoch": 0.026244604316546762, "grad_norm": 0.5844228863716125, "learning_rate": 1.3255681544571568e-05, "loss": 0.0768, "step": 456 }, { "epoch": 0.026302158273381296, "grad_norm": 1.4332048892974854, "learning_rate": 1.3222656952305113e-05, "loss": 0.0896, "step": 457 }, { "epoch": 0.026359712230215826, "grad_norm": 5.514071464538574, "learning_rate": 1.3189593092980701e-05, "loss": 0.2002, "step": 458 }, { "epoch": 0.02641726618705036, "grad_norm": 10.462922096252441, "learning_rate": 1.3156490369471026e-05, "loss": 0.3964, "step": 459 }, { "epoch": 0.026474820143884893, "grad_norm": 7.807409763336182, "learning_rate": 1.3123349185122328e-05, "loss": 0.2995, "step": 460 }, { "epoch": 0.026532374100719423, "grad_norm": 2.9369232654571533, "learning_rate": 1.3090169943749475e-05, "loss": 0.1574, "step": 461 }, { "epoch": 0.026589928057553957, "grad_norm": 2.5351967811584473, "learning_rate": 1.3056953049631059e-05, "loss": 0.0965, "step": 462 }, { "epoch": 0.02664748201438849, "grad_norm": 1.906512975692749, "learning_rate": 1.3023698907504447e-05, "loss": 0.1145, "step": 463 }, { "epoch": 0.02670503597122302, "grad_norm": 9.540634155273438, "learning_rate": 1.2990407922560869e-05, "loss": 0.3393, "step": 464 }, { "epoch": 0.026762589928057554, "grad_norm": 13.937700271606445, "learning_rate": 1.2957080500440469e-05, "loss": 0.4293, "step": 465 }, { "epoch": 0.026820143884892088, "grad_norm": 2.0209853649139404, "learning_rate": 1.2923717047227368e-05, "loss": 0.049, "step": 466 }, { "epoch": 0.026877697841726618, "grad_norm": 8.90571117401123, "learning_rate": 1.2890317969444716e-05, "loss": 0.3679, "step": 467 }, { "epoch": 0.02693525179856115, "grad_norm": 1.0252469778060913, "learning_rate": 1.2856883674049736e-05, "loss": 0.0684, "step": 468 }, { "epoch": 0.026992805755395685, "grad_norm": 4.494703769683838, "learning_rate": 1.2823414568428767e-05, "loss": 0.1366, "step": 469 }, { "epoch": 0.027050359712230215, "grad_norm": 4.636646747589111, "learning_rate": 1.2789911060392295e-05, "loss": 0.1983, "step": 470 }, { "epoch": 0.02710791366906475, "grad_norm": 1.6157455444335938, "learning_rate": 1.2756373558169992e-05, "loss": 0.1423, "step": 471 }, { "epoch": 0.02716546762589928, "grad_norm": 1.4866031408309937, "learning_rate": 1.2722802470405744e-05, "loss": 0.1039, "step": 472 }, { "epoch": 0.027223021582733813, "grad_norm": 0.6129380464553833, "learning_rate": 1.2689198206152657e-05, "loss": 0.0738, "step": 473 }, { "epoch": 0.027280575539568346, "grad_norm": 8.018013954162598, "learning_rate": 1.265556117486809e-05, "loss": 0.1237, "step": 474 }, { "epoch": 0.027338129496402876, "grad_norm": 8.371533393859863, "learning_rate": 1.2621891786408648e-05, "loss": 0.2127, "step": 475 }, { "epoch": 0.02739568345323741, "grad_norm": 9.338693618774414, "learning_rate": 1.2588190451025209e-05, "loss": 0.313, "step": 476 }, { "epoch": 0.027453237410071944, "grad_norm": 2.514599323272705, "learning_rate": 1.2554457579357906e-05, "loss": 0.1236, "step": 477 }, { "epoch": 0.027510791366906474, "grad_norm": 36.57285690307617, "learning_rate": 1.252069358243114e-05, "loss": 0.8909, "step": 478 }, { "epoch": 0.027568345323741007, "grad_norm": 8.022867202758789, "learning_rate": 1.2486898871648552e-05, "loss": 0.243, "step": 479 }, { "epoch": 0.02762589928057554, "grad_norm": 3.656367063522339, "learning_rate": 1.2453073858788027e-05, "loss": 0.1721, "step": 480 }, { "epoch": 0.02768345323741007, "grad_norm": 7.827700614929199, "learning_rate": 1.2419218955996677e-05, "loss": 0.1393, "step": 481 }, { "epoch": 0.027741007194244605, "grad_norm": 0.9252501130104065, "learning_rate": 1.238533457578581e-05, "loss": 0.091, "step": 482 }, { "epoch": 0.027798561151079138, "grad_norm": 1.7856941223144531, "learning_rate": 1.23514211310259e-05, "loss": 0.1275, "step": 483 }, { "epoch": 0.02785611510791367, "grad_norm": 23.890789031982422, "learning_rate": 1.2317479034941572e-05, "loss": 0.3568, "step": 484 }, { "epoch": 0.027913669064748202, "grad_norm": 2.6963653564453125, "learning_rate": 1.2283508701106559e-05, "loss": 0.1143, "step": 485 }, { "epoch": 0.027971223021582736, "grad_norm": 5.141092777252197, "learning_rate": 1.2249510543438652e-05, "loss": 0.2145, "step": 486 }, { "epoch": 0.028028776978417266, "grad_norm": 4.484059810638428, "learning_rate": 1.2215484976194675e-05, "loss": 0.134, "step": 487 }, { "epoch": 0.0280863309352518, "grad_norm": 13.956079483032227, "learning_rate": 1.2181432413965428e-05, "loss": 0.2296, "step": 488 }, { "epoch": 0.02814388489208633, "grad_norm": 57.16883087158203, "learning_rate": 1.2147353271670634e-05, "loss": 0.3778, "step": 489 }, { "epoch": 0.028201438848920863, "grad_norm": 0.9326462745666504, "learning_rate": 1.211324796455389e-05, "loss": 0.0922, "step": 490 }, { "epoch": 0.028258992805755397, "grad_norm": 0.6182975172996521, "learning_rate": 1.2079116908177592e-05, "loss": 0.0514, "step": 491 }, { "epoch": 0.028316546762589927, "grad_norm": 0.42364293336868286, "learning_rate": 1.2044960518417902e-05, "loss": 0.0181, "step": 492 }, { "epoch": 0.02837410071942446, "grad_norm": 3.4781551361083984, "learning_rate": 1.2010779211459649e-05, "loss": 0.161, "step": 493 }, { "epoch": 0.028431654676258994, "grad_norm": 2.329385280609131, "learning_rate": 1.1976573403791263e-05, "loss": 0.0878, "step": 494 }, { "epoch": 0.028489208633093524, "grad_norm": 5.409578323364258, "learning_rate": 1.194234351219972e-05, "loss": 0.1646, "step": 495 }, { "epoch": 0.028546762589928058, "grad_norm": 0.2559332847595215, "learning_rate": 1.190808995376545e-05, "loss": 0.0438, "step": 496 }, { "epoch": 0.02860431654676259, "grad_norm": 7.3071088790893555, "learning_rate": 1.187381314585725e-05, "loss": 0.2373, "step": 497 }, { "epoch": 0.02866187050359712, "grad_norm": 13.68780517578125, "learning_rate": 1.1839513506127202e-05, "loss": 0.5267, "step": 498 }, { "epoch": 0.028719424460431655, "grad_norm": 14.798240661621094, "learning_rate": 1.1805191452505602e-05, "loss": 0.4246, "step": 499 }, { "epoch": 0.02877697841726619, "grad_norm": 0.8802618980407715, "learning_rate": 1.1770847403195836e-05, "loss": 0.0126, "step": 500 }, { "epoch": 0.02883453237410072, "grad_norm": 2.4958908557891846, "learning_rate": 1.1736481776669307e-05, "loss": 0.0668, "step": 501 }, { "epoch": 0.028892086330935252, "grad_norm": 8.02234172821045, "learning_rate": 1.1702094991660326e-05, "loss": 0.2478, "step": 502 }, { "epoch": 0.028949640287769786, "grad_norm": 6.714267730712891, "learning_rate": 1.1667687467161025e-05, "loss": 0.2143, "step": 503 }, { "epoch": 0.029007194244604316, "grad_norm": 7.356170177459717, "learning_rate": 1.1633259622416224e-05, "loss": 0.2208, "step": 504 }, { "epoch": 0.02906474820143885, "grad_norm": 4.378060340881348, "learning_rate": 1.159881187691835e-05, "loss": 0.1349, "step": 505 }, { "epoch": 0.02912230215827338, "grad_norm": 9.72322940826416, "learning_rate": 1.156434465040231e-05, "loss": 0.277, "step": 506 }, { "epoch": 0.029179856115107913, "grad_norm": 1.8536237478256226, "learning_rate": 1.1529858362840383e-05, "loss": 0.1448, "step": 507 }, { "epoch": 0.029237410071942447, "grad_norm": 3.8134891986846924, "learning_rate": 1.1495353434437098e-05, "loss": 0.1221, "step": 508 }, { "epoch": 0.029294964028776977, "grad_norm": 8.059671401977539, "learning_rate": 1.1460830285624119e-05, "loss": 0.3355, "step": 509 }, { "epoch": 0.02935251798561151, "grad_norm": 10.443696022033691, "learning_rate": 1.1426289337055119e-05, "loss": 0.329, "step": 510 }, { "epoch": 0.029410071942446044, "grad_norm": 1.0404391288757324, "learning_rate": 1.1391731009600655e-05, "loss": 0.0193, "step": 511 }, { "epoch": 0.029467625899280574, "grad_norm": 9.676289558410645, "learning_rate": 1.1357155724343046e-05, "loss": 0.2689, "step": 512 }, { "epoch": 0.029525179856115108, "grad_norm": 25.724231719970703, "learning_rate": 1.1322563902571227e-05, "loss": 0.7274, "step": 513 }, { "epoch": 0.02958273381294964, "grad_norm": 3.297560214996338, "learning_rate": 1.128795596577563e-05, "loss": 0.1354, "step": 514 }, { "epoch": 0.02964028776978417, "grad_norm": 7.687169551849365, "learning_rate": 1.1253332335643043e-05, "loss": 0.8631, "step": 515 }, { "epoch": 0.029697841726618705, "grad_norm": 1.4026639461517334, "learning_rate": 1.1218693434051475e-05, "loss": 0.0832, "step": 516 }, { "epoch": 0.02975539568345324, "grad_norm": 1.158055067062378, "learning_rate": 1.1184039683065014e-05, "loss": 0.1053, "step": 517 }, { "epoch": 0.02981294964028777, "grad_norm": 1.325383186340332, "learning_rate": 1.1149371504928667e-05, "loss": 0.0601, "step": 518 }, { "epoch": 0.029870503597122303, "grad_norm": 21.55183219909668, "learning_rate": 1.1114689322063255e-05, "loss": 0.4671, "step": 519 }, { "epoch": 0.029928057553956836, "grad_norm": 5.480937480926514, "learning_rate": 1.1079993557060228e-05, "loss": 0.19, "step": 520 }, { "epoch": 0.029985611510791366, "grad_norm": 7.440226078033447, "learning_rate": 1.1045284632676535e-05, "loss": 0.2281, "step": 521 }, { "epoch": 0.0300431654676259, "grad_norm": 2.201996326446533, "learning_rate": 1.1010562971829464e-05, "loss": 0.1137, "step": 522 }, { "epoch": 0.03010071942446043, "grad_norm": 2.157567024230957, "learning_rate": 1.0975828997591496e-05, "loss": 0.1234, "step": 523 }, { "epoch": 0.030158273381294964, "grad_norm": 1.4759501218795776, "learning_rate": 1.0941083133185146e-05, "loss": 0.0717, "step": 524 }, { "epoch": 0.030215827338129497, "grad_norm": 4.390008449554443, "learning_rate": 1.0906325801977804e-05, "loss": 0.1566, "step": 525 }, { "epoch": 0.030273381294964027, "grad_norm": 2.1558635234832764, "learning_rate": 1.0871557427476585e-05, "loss": 0.1084, "step": 526 }, { "epoch": 0.03033093525179856, "grad_norm": 5.547511577606201, "learning_rate": 1.083677843332316e-05, "loss": 0.2485, "step": 527 }, { "epoch": 0.030388489208633095, "grad_norm": 1.3841583728790283, "learning_rate": 1.0801989243288588e-05, "loss": 0.0932, "step": 528 }, { "epoch": 0.030446043165467625, "grad_norm": 3.9912872314453125, "learning_rate": 1.0767190281268187e-05, "loss": 0.1574, "step": 529 }, { "epoch": 0.03050359712230216, "grad_norm": 0.8950588703155518, "learning_rate": 1.0732381971276318e-05, "loss": 0.0841, "step": 530 }, { "epoch": 0.030561151079136692, "grad_norm": 1.7501403093338013, "learning_rate": 1.0697564737441254e-05, "loss": 0.1257, "step": 531 }, { "epoch": 0.030618705035971222, "grad_norm": 3.171156406402588, "learning_rate": 1.0662739004000005e-05, "loss": 0.1247, "step": 532 }, { "epoch": 0.030676258992805756, "grad_norm": 8.79008960723877, "learning_rate": 1.0627905195293135e-05, "loss": 0.4406, "step": 533 }, { "epoch": 0.03073381294964029, "grad_norm": 0.8652887344360352, "learning_rate": 1.0593063735759619e-05, "loss": 0.0557, "step": 534 }, { "epoch": 0.03079136690647482, "grad_norm": 2.49337100982666, "learning_rate": 1.055821504993164e-05, "loss": 0.0957, "step": 535 }, { "epoch": 0.030848920863309353, "grad_norm": 0.9073213934898376, "learning_rate": 1.0523359562429441e-05, "loss": 0.081, "step": 536 }, { "epoch": 0.030906474820143887, "grad_norm": 2.930410385131836, "learning_rate": 1.0488497697956134e-05, "loss": 0.1306, "step": 537 }, { "epoch": 0.030964028776978417, "grad_norm": 15.254547119140625, "learning_rate": 1.0453629881292537e-05, "loss": 0.5873, "step": 538 }, { "epoch": 0.03102158273381295, "grad_norm": 9.663077354431152, "learning_rate": 1.0418756537291996e-05, "loss": 0.255, "step": 539 }, { "epoch": 0.03107913669064748, "grad_norm": 0.9969314932823181, "learning_rate": 1.03838780908752e-05, "loss": 0.0821, "step": 540 }, { "epoch": 0.031136690647482014, "grad_norm": 8.922150611877441, "learning_rate": 1.0348994967025012e-05, "loss": 0.2661, "step": 541 }, { "epoch": 0.031194244604316548, "grad_norm": 1.0743968486785889, "learning_rate": 1.0314107590781284e-05, "loss": 0.1089, "step": 542 }, { "epoch": 0.03125179856115108, "grad_norm": 4.709200382232666, "learning_rate": 1.0279216387235691e-05, "loss": 0.158, "step": 543 }, { "epoch": 0.03130935251798561, "grad_norm": 6.3187947273254395, "learning_rate": 1.0244321781526533e-05, "loss": 0.2659, "step": 544 }, { "epoch": 0.03136690647482014, "grad_norm": 0.4948660433292389, "learning_rate": 1.0209424198833571e-05, "loss": 0.0685, "step": 545 }, { "epoch": 0.03142446043165468, "grad_norm": 1.0766798257827759, "learning_rate": 1.0174524064372837e-05, "loss": 0.0989, "step": 546 }, { "epoch": 0.03148201438848921, "grad_norm": 4.577028751373291, "learning_rate": 1.0139621803391454e-05, "loss": 0.177, "step": 547 }, { "epoch": 0.03153956834532374, "grad_norm": 0.9121817946434021, "learning_rate": 1.010471784116246e-05, "loss": 0.086, "step": 548 }, { "epoch": 0.031597122302158276, "grad_norm": 8.214136123657227, "learning_rate": 1.0069812602979617e-05, "loss": 0.2782, "step": 549 }, { "epoch": 0.031654676258992806, "grad_norm": 0.6466065049171448, "learning_rate": 1.0034906514152239e-05, "loss": 0.0647, "step": 550 }, { "epoch": 0.031712230215827336, "grad_norm": 2.8535683155059814, "learning_rate": 1e-05, "loss": 0.1092, "step": 551 }, { "epoch": 0.03176978417266187, "grad_norm": 0.7769191861152649, "learning_rate": 9.965093485847766e-06, "loss": 0.0612, "step": 552 }, { "epoch": 0.0318273381294964, "grad_norm": 1.8424514532089233, "learning_rate": 9.930187397020385e-06, "loss": 0.1221, "step": 553 }, { "epoch": 0.031884892086330933, "grad_norm": 1.2432526350021362, "learning_rate": 9.895282158837545e-06, "loss": 0.0838, "step": 554 }, { "epoch": 0.03194244604316547, "grad_norm": 4.903514862060547, "learning_rate": 9.860378196608549e-06, "loss": 0.1696, "step": 555 }, { "epoch": 0.032, "grad_norm": 0.8313055038452148, "learning_rate": 9.825475935627165e-06, "loss": 0.0785, "step": 556 }, { "epoch": 0.03205755395683453, "grad_norm": 0.8260055780410767, "learning_rate": 9.790575801166432e-06, "loss": 0.0783, "step": 557 }, { "epoch": 0.03211510791366907, "grad_norm": 15.278909683227539, "learning_rate": 9.75567821847347e-06, "loss": 0.4247, "step": 558 }, { "epoch": 0.0321726618705036, "grad_norm": 1.1578339338302612, "learning_rate": 9.720783612764314e-06, "loss": 0.0904, "step": 559 }, { "epoch": 0.03223021582733813, "grad_norm": 2.438309669494629, "learning_rate": 9.685892409218718e-06, "loss": 0.1525, "step": 560 }, { "epoch": 0.032287769784172665, "grad_norm": 7.4440693855285645, "learning_rate": 9.651005032974994e-06, "loss": 0.2666, "step": 561 }, { "epoch": 0.032345323741007195, "grad_norm": 1.554640293121338, "learning_rate": 9.616121909124801e-06, "loss": 0.0693, "step": 562 }, { "epoch": 0.032402877697841725, "grad_norm": 3.4629485607147217, "learning_rate": 9.581243462708007e-06, "loss": 0.1241, "step": 563 }, { "epoch": 0.032460431654676256, "grad_norm": 1.3280606269836426, "learning_rate": 9.546370118707463e-06, "loss": 0.0894, "step": 564 }, { "epoch": 0.03251798561151079, "grad_norm": 3.824627637863159, "learning_rate": 9.511502302043867e-06, "loss": 0.1586, "step": 565 }, { "epoch": 0.03257553956834532, "grad_norm": 9.243664741516113, "learning_rate": 9.476640437570562e-06, "loss": 0.258, "step": 566 }, { "epoch": 0.03263309352517985, "grad_norm": 3.4505608081817627, "learning_rate": 9.441784950068362e-06, "loss": 0.134, "step": 567 }, { "epoch": 0.03269064748201439, "grad_norm": 13.31429672241211, "learning_rate": 9.406936264240386e-06, "loss": 0.4339, "step": 568 }, { "epoch": 0.03274820143884892, "grad_norm": 0.4915928542613983, "learning_rate": 9.372094804706867e-06, "loss": 0.0526, "step": 569 }, { "epoch": 0.03280575539568345, "grad_norm": 9.329924583435059, "learning_rate": 9.337260996000002e-06, "loss": 0.1747, "step": 570 }, { "epoch": 0.03286330935251799, "grad_norm": 1.121419906616211, "learning_rate": 9.302435262558748e-06, "loss": 0.0665, "step": 571 }, { "epoch": 0.03292086330935252, "grad_norm": 1.8707423210144043, "learning_rate": 9.267618028723687e-06, "loss": 0.1144, "step": 572 }, { "epoch": 0.03297841726618705, "grad_norm": 0.3859129548072815, "learning_rate": 9.232809718731815e-06, "loss": 0.0301, "step": 573 }, { "epoch": 0.033035971223021585, "grad_norm": 0.4926592707633972, "learning_rate": 9.198010756711413e-06, "loss": 0.0795, "step": 574 }, { "epoch": 0.033093525179856115, "grad_norm": 7.28505802154541, "learning_rate": 9.163221566676847e-06, "loss": 0.1051, "step": 575 }, { "epoch": 0.033151079136690645, "grad_norm": 10.257981300354004, "learning_rate": 9.128442572523418e-06, "loss": 0.4004, "step": 576 }, { "epoch": 0.03320863309352518, "grad_norm": 5.706249713897705, "learning_rate": 9.093674198022201e-06, "loss": 0.2009, "step": 577 }, { "epoch": 0.03326618705035971, "grad_norm": 3.457526922225952, "learning_rate": 9.058916866814857e-06, "loss": 0.1433, "step": 578 }, { "epoch": 0.03332374100719424, "grad_norm": 6.841573715209961, "learning_rate": 9.024171002408507e-06, "loss": 0.5298, "step": 579 }, { "epoch": 0.03338129496402878, "grad_norm": 1.4832878112792969, "learning_rate": 8.989437028170537e-06, "loss": 0.0853, "step": 580 }, { "epoch": 0.03343884892086331, "grad_norm": 11.173535346984863, "learning_rate": 8.954715367323468e-06, "loss": 0.2538, "step": 581 }, { "epoch": 0.03349640287769784, "grad_norm": 0.7396141290664673, "learning_rate": 8.920006442939772e-06, "loss": 0.025, "step": 582 }, { "epoch": 0.03355395683453238, "grad_norm": 4.9781293869018555, "learning_rate": 8.885310677936746e-06, "loss": 0.1923, "step": 583 }, { "epoch": 0.03361151079136691, "grad_norm": 0.557285726070404, "learning_rate": 8.850628495071336e-06, "loss": 0.0087, "step": 584 }, { "epoch": 0.03366906474820144, "grad_norm": 11.92111873626709, "learning_rate": 8.815960316934991e-06, "loss": 0.2586, "step": 585 }, { "epoch": 0.033726618705035974, "grad_norm": 1.8631489276885986, "learning_rate": 8.781306565948528e-06, "loss": 0.1197, "step": 586 }, { "epoch": 0.033784172661870504, "grad_norm": 1.890995740890503, "learning_rate": 8.746667664356957e-06, "loss": 0.1011, "step": 587 }, { "epoch": 0.033841726618705034, "grad_norm": 0.8469424247741699, "learning_rate": 8.712044034224374e-06, "loss": 0.0862, "step": 588 }, { "epoch": 0.03389928057553957, "grad_norm": 0.5587536096572876, "learning_rate": 8.677436097428775e-06, "loss": 0.0414, "step": 589 }, { "epoch": 0.0339568345323741, "grad_norm": 14.286009788513184, "learning_rate": 8.642844275656957e-06, "loss": 0.2033, "step": 590 }, { "epoch": 0.03401438848920863, "grad_norm": 0.828925371170044, "learning_rate": 8.60826899039935e-06, "loss": 0.0879, "step": 591 }, { "epoch": 0.03407194244604317, "grad_norm": 2.5899553298950195, "learning_rate": 8.573710662944884e-06, "loss": 0.1128, "step": 592 }, { "epoch": 0.0341294964028777, "grad_norm": 0.7016512751579285, "learning_rate": 8.539169714375885e-06, "loss": 0.0559, "step": 593 }, { "epoch": 0.03418705035971223, "grad_norm": 0.7938787937164307, "learning_rate": 8.504646565562907e-06, "loss": 0.0751, "step": 594 }, { "epoch": 0.034244604316546766, "grad_norm": 4.789397716522217, "learning_rate": 8.47014163715962e-06, "loss": 0.1729, "step": 595 }, { "epoch": 0.034302158273381296, "grad_norm": 8.780428886413574, "learning_rate": 8.43565534959769e-06, "loss": 0.2045, "step": 596 }, { "epoch": 0.034359712230215826, "grad_norm": 1.7636146545410156, "learning_rate": 8.401188123081653e-06, "loss": 0.0915, "step": 597 }, { "epoch": 0.034417266187050356, "grad_norm": 4.673388957977295, "learning_rate": 8.366740377583781e-06, "loss": 0.1396, "step": 598 }, { "epoch": 0.03447482014388489, "grad_norm": 10.594704627990723, "learning_rate": 8.332312532838978e-06, "loss": 0.2385, "step": 599 }, { "epoch": 0.034532374100719423, "grad_norm": 0.38678157329559326, "learning_rate": 8.297905008339677e-06, "loss": 0.0511, "step": 600 }, { "epoch": 0.034589928057553954, "grad_norm": 2.4528400897979736, "learning_rate": 8.263518223330698e-06, "loss": 0.1191, "step": 601 }, { "epoch": 0.03464748201438849, "grad_norm": 9.87134075164795, "learning_rate": 8.22915259680417e-06, "loss": 0.3747, "step": 602 }, { "epoch": 0.03470503597122302, "grad_norm": 2.0023324489593506, "learning_rate": 8.194808547494401e-06, "loss": 0.0745, "step": 603 }, { "epoch": 0.03476258992805755, "grad_norm": 14.806257247924805, "learning_rate": 8.1604864938728e-06, "loss": 0.5362, "step": 604 }, { "epoch": 0.03482014388489209, "grad_norm": 4.73097562789917, "learning_rate": 8.126186854142752e-06, "loss": 0.1718, "step": 605 }, { "epoch": 0.03487769784172662, "grad_norm": 1.0992658138275146, "learning_rate": 8.091910046234552e-06, "loss": 0.0868, "step": 606 }, { "epoch": 0.03493525179856115, "grad_norm": 0.5213341116905212, "learning_rate": 8.057656487800283e-06, "loss": 0.0579, "step": 607 }, { "epoch": 0.034992805755395685, "grad_norm": 0.8178665637969971, "learning_rate": 8.023426596208739e-06, "loss": 0.0724, "step": 608 }, { "epoch": 0.035050359712230215, "grad_norm": 2.403118133544922, "learning_rate": 7.989220788540356e-06, "loss": 0.1376, "step": 609 }, { "epoch": 0.035107913669064746, "grad_norm": 6.1761579513549805, "learning_rate": 7.955039481582098e-06, "loss": 0.2501, "step": 610 }, { "epoch": 0.03516546762589928, "grad_norm": 1.1938904523849487, "learning_rate": 7.92088309182241e-06, "loss": 0.1253, "step": 611 }, { "epoch": 0.03522302158273381, "grad_norm": 1.653713345527649, "learning_rate": 7.886752035446116e-06, "loss": 0.0934, "step": 612 }, { "epoch": 0.03528057553956834, "grad_norm": 0.9503543972969055, "learning_rate": 7.852646728329368e-06, "loss": 0.0567, "step": 613 }, { "epoch": 0.03533812949640288, "grad_norm": 4.422244548797607, "learning_rate": 7.818567586034578e-06, "loss": 0.1249, "step": 614 }, { "epoch": 0.03539568345323741, "grad_norm": 3.623196840286255, "learning_rate": 7.784515023805328e-06, "loss": 0.1158, "step": 615 }, { "epoch": 0.03545323741007194, "grad_norm": 9.297059059143066, "learning_rate": 7.750489456561351e-06, "loss": 0.2269, "step": 616 }, { "epoch": 0.03551079136690648, "grad_norm": 4.102265357971191, "learning_rate": 7.716491298893443e-06, "loss": 0.1605, "step": 617 }, { "epoch": 0.03556834532374101, "grad_norm": 1.0876208543777466, "learning_rate": 7.68252096505843e-06, "loss": 0.0712, "step": 618 }, { "epoch": 0.03562589928057554, "grad_norm": 3.1616358757019043, "learning_rate": 7.6485788689741e-06, "loss": 0.1216, "step": 619 }, { "epoch": 0.035683453237410075, "grad_norm": 1.6394915580749512, "learning_rate": 7.6146654242141935e-06, "loss": 0.1088, "step": 620 }, { "epoch": 0.035741007194244605, "grad_norm": 0.6849850416183472, "learning_rate": 7.580781044003324e-06, "loss": 0.0639, "step": 621 }, { "epoch": 0.035798561151079135, "grad_norm": 0.7542056441307068, "learning_rate": 7.546926141211975e-06, "loss": 0.0541, "step": 622 }, { "epoch": 0.03585611510791367, "grad_norm": 1.4969114065170288, "learning_rate": 7.513101128351454e-06, "loss": 0.0822, "step": 623 }, { "epoch": 0.0359136690647482, "grad_norm": 0.6035341620445251, "learning_rate": 7.4793064175688635e-06, "loss": 0.0831, "step": 624 }, { "epoch": 0.03597122302158273, "grad_norm": 9.536357879638672, "learning_rate": 7.445542420642097e-06, "loss": 0.297, "step": 625 }, { "epoch": 0.03602877697841727, "grad_norm": 10.490472793579102, "learning_rate": 7.411809548974792e-06, "loss": 0.2122, "step": 626 }, { "epoch": 0.0360863309352518, "grad_norm": 2.7282614707946777, "learning_rate": 7.378108213591355e-06, "loss": 0.0946, "step": 627 }, { "epoch": 0.03614388489208633, "grad_norm": 4.524618625640869, "learning_rate": 7.344438825131912e-06, "loss": 0.1423, "step": 628 }, { "epoch": 0.03620143884892087, "grad_norm": 7.327822685241699, "learning_rate": 7.310801793847344e-06, "loss": 0.2327, "step": 629 }, { "epoch": 0.0362589928057554, "grad_norm": 2.4597315788269043, "learning_rate": 7.277197529594257e-06, "loss": 0.1116, "step": 630 }, { "epoch": 0.03631654676258993, "grad_norm": 8.071950912475586, "learning_rate": 7.243626441830009e-06, "loss": 0.1441, "step": 631 }, { "epoch": 0.03637410071942446, "grad_norm": 0.2197929471731186, "learning_rate": 7.210088939607709e-06, "loss": 0.0039, "step": 632 }, { "epoch": 0.036431654676258994, "grad_norm": 1.8408621549606323, "learning_rate": 7.176585431571235e-06, "loss": 0.0781, "step": 633 }, { "epoch": 0.036489208633093524, "grad_norm": 9.146245002746582, "learning_rate": 7.143116325950266e-06, "loss": 0.2037, "step": 634 }, { "epoch": 0.036546762589928054, "grad_norm": 4.6287922859191895, "learning_rate": 7.109682030555283e-06, "loss": 0.1195, "step": 635 }, { "epoch": 0.03660431654676259, "grad_norm": 1.386813998222351, "learning_rate": 7.076282952772634e-06, "loss": 0.124, "step": 636 }, { "epoch": 0.03666187050359712, "grad_norm": 0.7986965775489807, "learning_rate": 7.042919499559538e-06, "loss": 0.0724, "step": 637 }, { "epoch": 0.03671942446043165, "grad_norm": 2.6151773929595947, "learning_rate": 7.009592077439135e-06, "loss": 0.0991, "step": 638 }, { "epoch": 0.03677697841726619, "grad_norm": 9.886137962341309, "learning_rate": 6.976301092495556e-06, "loss": 0.2132, "step": 639 }, { "epoch": 0.03683453237410072, "grad_norm": 1.0213325023651123, "learning_rate": 6.943046950368944e-06, "loss": 0.0824, "step": 640 }, { "epoch": 0.03689208633093525, "grad_norm": 5.319864273071289, "learning_rate": 6.909830056250527e-06, "loss": 0.1426, "step": 641 }, { "epoch": 0.036949640287769786, "grad_norm": 11.022806167602539, "learning_rate": 6.876650814877675e-06, "loss": 0.3286, "step": 642 }, { "epoch": 0.037007194244604316, "grad_norm": 7.383906841278076, "learning_rate": 6.843509630528977e-06, "loss": 0.1446, "step": 643 }, { "epoch": 0.037064748201438846, "grad_norm": 1.5146830081939697, "learning_rate": 6.8104069070193e-06, "loss": 0.0942, "step": 644 }, { "epoch": 0.03712230215827338, "grad_norm": 18.990802764892578, "learning_rate": 6.777343047694891e-06, "loss": 0.2854, "step": 645 }, { "epoch": 0.037179856115107913, "grad_norm": 2.318795680999756, "learning_rate": 6.744318455428436e-06, "loss": 0.0937, "step": 646 }, { "epoch": 0.037237410071942444, "grad_norm": 0.372890830039978, "learning_rate": 6.711333532614168e-06, "loss": 0.0548, "step": 647 }, { "epoch": 0.03729496402877698, "grad_norm": 2.1778178215026855, "learning_rate": 6.67838868116297e-06, "loss": 0.0613, "step": 648 }, { "epoch": 0.03735251798561151, "grad_norm": 7.633350372314453, "learning_rate": 6.645484302497452e-06, "loss": 0.2113, "step": 649 }, { "epoch": 0.03741007194244604, "grad_norm": 2.2798593044281006, "learning_rate": 6.612620797547087e-06, "loss": 0.1007, "step": 650 }, { "epoch": 0.03746762589928058, "grad_norm": 1.3645232915878296, "learning_rate": 6.579798566743314e-06, "loss": 0.1054, "step": 651 }, { "epoch": 0.03752517985611511, "grad_norm": 3.271883249282837, "learning_rate": 6.547018010014654e-06, "loss": 0.1515, "step": 652 }, { "epoch": 0.03758273381294964, "grad_norm": 10.690248489379883, "learning_rate": 6.5142795267818505e-06, "loss": 0.2535, "step": 653 }, { "epoch": 0.037640287769784175, "grad_norm": 7.398156642913818, "learning_rate": 6.481583515952983e-06, "loss": 0.1515, "step": 654 }, { "epoch": 0.037697841726618705, "grad_norm": 5.868314266204834, "learning_rate": 6.448930375918632e-06, "loss": 0.114, "step": 655 }, { "epoch": 0.037755395683453236, "grad_norm": 7.998626708984375, "learning_rate": 6.4163205045469975e-06, "loss": 0.1563, "step": 656 }, { "epoch": 0.03781294964028777, "grad_norm": 13.154156684875488, "learning_rate": 6.383754299179079e-06, "loss": 0.5158, "step": 657 }, { "epoch": 0.0378705035971223, "grad_norm": 1.3445124626159668, "learning_rate": 6.351232156623803e-06, "loss": 0.1183, "step": 658 }, { "epoch": 0.03792805755395683, "grad_norm": 1.747786521911621, "learning_rate": 6.318754473153221e-06, "loss": 0.0859, "step": 659 }, { "epoch": 0.03798561151079137, "grad_norm": 0.8909703493118286, "learning_rate": 6.286321644497655e-06, "loss": 0.0959, "step": 660 }, { "epoch": 0.0380431654676259, "grad_norm": 2.6019179821014404, "learning_rate": 6.25393406584088e-06, "loss": 0.1215, "step": 661 }, { "epoch": 0.03810071942446043, "grad_norm": 0.6137844324111938, "learning_rate": 6.22159213181533e-06, "loss": 0.0674, "step": 662 }, { "epoch": 0.03815827338129497, "grad_norm": 5.27604341506958, "learning_rate": 6.18929623649726e-06, "loss": 0.1627, "step": 663 }, { "epoch": 0.0382158273381295, "grad_norm": 1.312740445137024, "learning_rate": 6.157046773401964e-06, "loss": 0.0954, "step": 664 }, { "epoch": 0.03827338129496403, "grad_norm": 10.17847728729248, "learning_rate": 6.124844135478971e-06, "loss": 0.2274, "step": 665 }, { "epoch": 0.03833093525179856, "grad_norm": 2.691664457321167, "learning_rate": 6.092688715107265e-06, "loss": 0.0243, "step": 666 }, { "epoch": 0.038388489208633095, "grad_norm": 5.8143229484558105, "learning_rate": 6.06058090409049e-06, "loss": 0.1523, "step": 667 }, { "epoch": 0.038446043165467625, "grad_norm": 23.19382095336914, "learning_rate": 6.028521093652195e-06, "loss": 0.4124, "step": 668 }, { "epoch": 0.038503597122302155, "grad_norm": 9.377325057983398, "learning_rate": 5.996509674431053e-06, "loss": 0.2113, "step": 669 }, { "epoch": 0.03856115107913669, "grad_norm": 12.410694122314453, "learning_rate": 5.9645470364761e-06, "loss": 0.2557, "step": 670 }, { "epoch": 0.03861870503597122, "grad_norm": 7.3218865394592285, "learning_rate": 5.932633569242e-06, "loss": 0.1555, "step": 671 }, { "epoch": 0.03867625899280575, "grad_norm": 5.622396945953369, "learning_rate": 5.900769661584273e-06, "loss": 0.0826, "step": 672 }, { "epoch": 0.03873381294964029, "grad_norm": 5.398272514343262, "learning_rate": 5.868955701754584e-06, "loss": 0.1456, "step": 673 }, { "epoch": 0.03879136690647482, "grad_norm": 0.751691460609436, "learning_rate": 5.83719207739599e-06, "loss": 0.0772, "step": 674 }, { "epoch": 0.03884892086330935, "grad_norm": 0.6720765233039856, "learning_rate": 5.8054791755382286e-06, "loss": 0.0869, "step": 675 }, { "epoch": 0.03890647482014389, "grad_norm": 1.0747441053390503, "learning_rate": 5.773817382593008e-06, "loss": 0.072, "step": 676 }, { "epoch": 0.03896402877697842, "grad_norm": 10.877379417419434, "learning_rate": 5.742207084349274e-06, "loss": 0.2319, "step": 677 }, { "epoch": 0.03902158273381295, "grad_norm": 1.0769011974334717, "learning_rate": 5.710648665968543e-06, "loss": 0.072, "step": 678 }, { "epoch": 0.039079136690647484, "grad_norm": 17.446186065673828, "learning_rate": 5.679142511980176e-06, "loss": 0.3885, "step": 679 }, { "epoch": 0.039136690647482014, "grad_norm": 0.5651721358299255, "learning_rate": 5.647689006276727e-06, "loss": 0.0675, "step": 680 }, { "epoch": 0.039194244604316544, "grad_norm": 12.628124237060547, "learning_rate": 5.616288532109225e-06, "loss": 0.5707, "step": 681 }, { "epoch": 0.03925179856115108, "grad_norm": 1.9146260023117065, "learning_rate": 5.584941472082549e-06, "loss": 0.0913, "step": 682 }, { "epoch": 0.03930935251798561, "grad_norm": 13.525870323181152, "learning_rate": 5.553648208150728e-06, "loss": 0.4603, "step": 683 }, { "epoch": 0.03936690647482014, "grad_norm": 15.693315505981445, "learning_rate": 5.522409121612304e-06, "loss": 0.1253, "step": 684 }, { "epoch": 0.03942446043165468, "grad_norm": 2.555248975753784, "learning_rate": 5.491224593105695e-06, "loss": 0.065, "step": 685 }, { "epoch": 0.03948201438848921, "grad_norm": 13.866741180419922, "learning_rate": 5.460095002604533e-06, "loss": 0.2646, "step": 686 }, { "epoch": 0.03953956834532374, "grad_norm": 1.9787272214889526, "learning_rate": 5.429020729413062e-06, "loss": 0.0937, "step": 687 }, { "epoch": 0.039597122302158276, "grad_norm": 0.8634059429168701, "learning_rate": 5.398002152161484e-06, "loss": 0.0827, "step": 688 }, { "epoch": 0.039654676258992806, "grad_norm": 7.688801288604736, "learning_rate": 5.367039648801386e-06, "loss": 0.187, "step": 689 }, { "epoch": 0.039712230215827336, "grad_norm": 3.1192588806152344, "learning_rate": 5.336133596601089e-06, "loss": 0.1103, "step": 690 }, { "epoch": 0.03976978417266187, "grad_norm": 0.4575349688529968, "learning_rate": 5.305284372141095e-06, "loss": 0.0628, "step": 691 }, { "epoch": 0.039827338129496404, "grad_norm": 3.0432817935943604, "learning_rate": 5.274492351309462e-06, "loss": 0.0893, "step": 692 }, { "epoch": 0.039884892086330934, "grad_norm": 7.722862243652344, "learning_rate": 5.243757909297247e-06, "loss": 0.1858, "step": 693 }, { "epoch": 0.03994244604316547, "grad_norm": 1.459063172340393, "learning_rate": 5.213081420593933e-06, "loss": 0.0857, "step": 694 }, { "epoch": 0.04, "grad_norm": 2.9894986152648926, "learning_rate": 5.1824632589828465e-06, "loss": 0.1136, "step": 695 }, { "epoch": 0.04005755395683453, "grad_norm": 0.5634929537773132, "learning_rate": 5.151903797536631e-06, "loss": 0.0685, "step": 696 }, { "epoch": 0.04011510791366907, "grad_norm": 4.808594703674316, "learning_rate": 5.121403408612672e-06, "loss": 0.1148, "step": 697 }, { "epoch": 0.0401726618705036, "grad_norm": 0.5084381103515625, "learning_rate": 5.090962463848592e-06, "loss": 0.0665, "step": 698 }, { "epoch": 0.04023021582733813, "grad_norm": 11.494372367858887, "learning_rate": 5.060581334157693e-06, "loss": 0.212, "step": 699 }, { "epoch": 0.04028776978417266, "grad_norm": 13.936042785644531, "learning_rate": 5.030260389724447e-06, "loss": 0.403, "step": 700 }, { "epoch": 0.040345323741007195, "grad_norm": 4.563905239105225, "learning_rate": 5.000000000000003e-06, "loss": 0.2219, "step": 701 }, { "epoch": 0.040402877697841726, "grad_norm": 1.8324753046035767, "learning_rate": 4.96980053369765e-06, "loss": 0.0575, "step": 702 }, { "epoch": 0.040460431654676256, "grad_norm": 0.6643631458282471, "learning_rate": 4.939662358788364e-06, "loss": 0.0747, "step": 703 }, { "epoch": 0.04051798561151079, "grad_norm": 0.9275362491607666, "learning_rate": 4.909585842496287e-06, "loss": 0.0876, "step": 704 }, { "epoch": 0.04057553956834532, "grad_norm": 13.703351020812988, "learning_rate": 4.879571351294287e-06, "loss": 0.3226, "step": 705 }, { "epoch": 0.04063309352517985, "grad_norm": 0.6237985491752625, "learning_rate": 4.849619250899458e-06, "loss": 0.0687, "step": 706 }, { "epoch": 0.04069064748201439, "grad_norm": 0.9500293135643005, "learning_rate": 4.8197299062687e-06, "loss": 0.0092, "step": 707 }, { "epoch": 0.04074820143884892, "grad_norm": 31.131847381591797, "learning_rate": 4.78990368159424e-06, "loss": 0.7294, "step": 708 }, { "epoch": 0.04080575539568345, "grad_norm": 2.998800277709961, "learning_rate": 4.76014094029921e-06, "loss": 0.0865, "step": 709 }, { "epoch": 0.04086330935251799, "grad_norm": 2.464053153991699, "learning_rate": 4.7304420450332244e-06, "loss": 0.0947, "step": 710 }, { "epoch": 0.04092086330935252, "grad_norm": 5.868266582489014, "learning_rate": 4.700807357667953e-06, "loss": 0.1665, "step": 711 }, { "epoch": 0.04097841726618705, "grad_norm": 0.5691702365875244, "learning_rate": 4.671237239292699e-06, "loss": 0.0655, "step": 712 }, { "epoch": 0.041035971223021585, "grad_norm": 2.4520723819732666, "learning_rate": 4.641732050210032e-06, "loss": 0.1552, "step": 713 }, { "epoch": 0.041093525179856115, "grad_norm": 0.49852254986763, "learning_rate": 4.612292149931369e-06, "loss": 0.0324, "step": 714 }, { "epoch": 0.041151079136690645, "grad_norm": 12.27933120727539, "learning_rate": 4.582917897172603e-06, "loss": 0.1786, "step": 715 }, { "epoch": 0.04120863309352518, "grad_norm": 5.295310020446777, "learning_rate": 4.5536096498497295e-06, "loss": 0.0978, "step": 716 }, { "epoch": 0.04126618705035971, "grad_norm": 0.554686427116394, "learning_rate": 4.524367765074499e-06, "loss": 0.0519, "step": 717 }, { "epoch": 0.04132374100719424, "grad_norm": 0.8725594282150269, "learning_rate": 4.495192599150045e-06, "loss": 0.037, "step": 718 }, { "epoch": 0.04138129496402878, "grad_norm": 1.3279333114624023, "learning_rate": 4.46608450756656e-06, "loss": 0.0527, "step": 719 }, { "epoch": 0.04143884892086331, "grad_norm": 7.157567501068115, "learning_rate": 4.437043844996952e-06, "loss": 0.1266, "step": 720 }, { "epoch": 0.04149640287769784, "grad_norm": 1.4394761323928833, "learning_rate": 4.408070965292534e-06, "loss": 0.1102, "step": 721 }, { "epoch": 0.04155395683453238, "grad_norm": 1.952061653137207, "learning_rate": 4.379166221478697e-06, "loss": 0.1155, "step": 722 }, { "epoch": 0.04161151079136691, "grad_norm": 3.076018810272217, "learning_rate": 4.350329965750622e-06, "loss": 0.0957, "step": 723 }, { "epoch": 0.04166906474820144, "grad_norm": 5.298429012298584, "learning_rate": 4.321562549468991e-06, "loss": 0.0811, "step": 724 }, { "epoch": 0.041726618705035974, "grad_norm": 1.0459011793136597, "learning_rate": 4.292864323155684e-06, "loss": 0.0744, "step": 725 }, { "epoch": 0.041784172661870504, "grad_norm": 3.071275234222412, "learning_rate": 4.264235636489542e-06, "loss": 0.1764, "step": 726 }, { "epoch": 0.041841726618705034, "grad_norm": 0.6313245296478271, "learning_rate": 4.235676838302069e-06, "loss": 0.0659, "step": 727 }, { "epoch": 0.04189928057553957, "grad_norm": 0.4360724687576294, "learning_rate": 4.207188276573214e-06, "loss": 0.0432, "step": 728 }, { "epoch": 0.0419568345323741, "grad_norm": 1.621596097946167, "learning_rate": 4.178770298427107e-06, "loss": 0.0963, "step": 729 }, { "epoch": 0.04201438848920863, "grad_norm": 7.333670139312744, "learning_rate": 4.150423250127846e-06, "loss": 0.1117, "step": 730 }, { "epoch": 0.04207194244604317, "grad_norm": 9.378557205200195, "learning_rate": 4.12214747707527e-06, "loss": 0.1928, "step": 731 }, { "epoch": 0.0421294964028777, "grad_norm": 7.514365196228027, "learning_rate": 4.093943323800746e-06, "loss": 0.1798, "step": 732 }, { "epoch": 0.04218705035971223, "grad_norm": 3.0120790004730225, "learning_rate": 4.065811133962987e-06, "loss": 0.1246, "step": 733 }, { "epoch": 0.04224460431654676, "grad_norm": 1.4734376668930054, "learning_rate": 4.037751250343841e-06, "loss": 0.0872, "step": 734 }, { "epoch": 0.042302158273381296, "grad_norm": 0.6325392723083496, "learning_rate": 4.009764014844143e-06, "loss": 0.0912, "step": 735 }, { "epoch": 0.042359712230215826, "grad_norm": 2.082659959793091, "learning_rate": 3.981849768479516e-06, "loss": 0.1113, "step": 736 }, { "epoch": 0.042417266187050356, "grad_norm": 3.347140312194824, "learning_rate": 3.954008851376252e-06, "loss": 0.1088, "step": 737 }, { "epoch": 0.042474820143884894, "grad_norm": 0.7007344365119934, "learning_rate": 3.9262416027671354e-06, "loss": 0.0753, "step": 738 }, { "epoch": 0.042532374100719424, "grad_norm": 6.338494300842285, "learning_rate": 3.898548360987325e-06, "loss": 0.1733, "step": 739 }, { "epoch": 0.042589928057553954, "grad_norm": 7.861074924468994, "learning_rate": 3.8709294634702374e-06, "loss": 0.1722, "step": 740 }, { "epoch": 0.04264748201438849, "grad_norm": 1.3125121593475342, "learning_rate": 3.8433852467434175e-06, "loss": 0.0739, "step": 741 }, { "epoch": 0.04270503597122302, "grad_norm": 2.1111292839050293, "learning_rate": 3.81591604642446e-06, "loss": 0.1041, "step": 742 }, { "epoch": 0.04276258992805755, "grad_norm": 2.2051801681518555, "learning_rate": 3.7885221972168974e-06, "loss": 0.0826, "step": 743 }, { "epoch": 0.04282014388489209, "grad_norm": 4.703516960144043, "learning_rate": 3.7612040329061405e-06, "loss": 0.1218, "step": 744 }, { "epoch": 0.04287769784172662, "grad_norm": 0.9855811595916748, "learning_rate": 3.7339618863553983e-06, "loss": 0.0765, "step": 745 }, { "epoch": 0.04293525179856115, "grad_norm": 0.4522510766983032, "learning_rate": 3.7067960895016277e-06, "loss": 0.0639, "step": 746 }, { "epoch": 0.042992805755395685, "grad_norm": 0.5402107834815979, "learning_rate": 3.679706973351491e-06, "loss": 0.0463, "step": 747 }, { "epoch": 0.043050359712230216, "grad_norm": 8.379833221435547, "learning_rate": 3.6526948679773256e-06, "loss": 0.2167, "step": 748 }, { "epoch": 0.043107913669064746, "grad_norm": 0.9145704507827759, "learning_rate": 3.625760102513103e-06, "loss": 0.098, "step": 749 }, { "epoch": 0.04316546762589928, "grad_norm": 1.1772428750991821, "learning_rate": 3.598903005150444e-06, "loss": 0.0908, "step": 750 }, { "epoch": 0.04322302158273381, "grad_norm": 1.089911699295044, "learning_rate": 3.5721239031346067e-06, "loss": 0.0803, "step": 751 }, { "epoch": 0.04328057553956834, "grad_norm": 0.725675642490387, "learning_rate": 3.545423122760493e-06, "loss": 0.0763, "step": 752 }, { "epoch": 0.04333812949640288, "grad_norm": 1.3648688793182373, "learning_rate": 3.5188009893686916e-06, "loss": 0.0807, "step": 753 }, { "epoch": 0.04339568345323741, "grad_norm": 5.510678768157959, "learning_rate": 3.492257827341492e-06, "loss": 0.1315, "step": 754 }, { "epoch": 0.04345323741007194, "grad_norm": 1.459609031677246, "learning_rate": 3.4657939600989453e-06, "loss": 0.1219, "step": 755 }, { "epoch": 0.04351079136690648, "grad_norm": 8.776249885559082, "learning_rate": 3.4394097100949286e-06, "loss": 0.1991, "step": 756 }, { "epoch": 0.04356834532374101, "grad_norm": 2.417057514190674, "learning_rate": 3.4131053988131947e-06, "loss": 0.0998, "step": 757 }, { "epoch": 0.04362589928057554, "grad_norm": 12.036551475524902, "learning_rate": 3.3868813467634833e-06, "loss": 0.2524, "step": 758 }, { "epoch": 0.043683453237410075, "grad_norm": 14.389498710632324, "learning_rate": 3.360737873477584e-06, "loss": 0.3022, "step": 759 }, { "epoch": 0.043741007194244605, "grad_norm": 12.136941909790039, "learning_rate": 3.3346752975054763e-06, "loss": 0.2324, "step": 760 }, { "epoch": 0.043798561151079135, "grad_norm": 5.893527507781982, "learning_rate": 3.308693936411421e-06, "loss": 0.1189, "step": 761 }, { "epoch": 0.04385611510791367, "grad_norm": 1.1870757341384888, "learning_rate": 3.2827941067700996e-06, "loss": 0.0794, "step": 762 }, { "epoch": 0.0439136690647482, "grad_norm": 0.639008641242981, "learning_rate": 3.2569761241627694e-06, "loss": 0.0606, "step": 763 }, { "epoch": 0.04397122302158273, "grad_norm": 2.8850464820861816, "learning_rate": 3.2312403031733943e-06, "loss": 0.1593, "step": 764 }, { "epoch": 0.04402877697841727, "grad_norm": 15.170232772827148, "learning_rate": 3.2055869573848374e-06, "loss": 0.2961, "step": 765 }, { "epoch": 0.0440863309352518, "grad_norm": 1.2211562395095825, "learning_rate": 3.1800163993750166e-06, "loss": 0.1089, "step": 766 }, { "epoch": 0.04414388489208633, "grad_norm": 0.4650379717350006, "learning_rate": 3.1545289407131128e-06, "loss": 0.0683, "step": 767 }, { "epoch": 0.04420143884892087, "grad_norm": 1.6909816265106201, "learning_rate": 3.1291248919557717e-06, "loss": 0.123, "step": 768 }, { "epoch": 0.0442589928057554, "grad_norm": 0.07863793522119522, "learning_rate": 3.103804562643302e-06, "loss": 0.0008, "step": 769 }, { "epoch": 0.04431654676258993, "grad_norm": 8.237005233764648, "learning_rate": 3.0785682612959334e-06, "loss": 0.1946, "step": 770 }, { "epoch": 0.04437410071942446, "grad_norm": 1.3069466352462769, "learning_rate": 3.0534162954100264e-06, "loss": 0.0659, "step": 771 }, { "epoch": 0.044431654676258994, "grad_norm": 0.9894955158233643, "learning_rate": 3.028348971454356e-06, "loss": 0.0554, "step": 772 }, { "epoch": 0.044489208633093524, "grad_norm": 1.0656694173812866, "learning_rate": 3.003366594866345e-06, "loss": 0.0713, "step": 773 }, { "epoch": 0.044546762589928054, "grad_norm": 2.17790150642395, "learning_rate": 2.978469470048376e-06, "loss": 0.1119, "step": 774 }, { "epoch": 0.04460431654676259, "grad_norm": 1.1125558614730835, "learning_rate": 2.953657900364053e-06, "loss": 0.0799, "step": 775 }, { "epoch": 0.04466187050359712, "grad_norm": 0.7158892154693604, "learning_rate": 2.9289321881345257e-06, "loss": 0.0711, "step": 776 }, { "epoch": 0.04471942446043165, "grad_norm": 1.9382431507110596, "learning_rate": 2.9042926346347932e-06, "loss": 0.0626, "step": 777 }, { "epoch": 0.04477697841726619, "grad_norm": 3.6688735485076904, "learning_rate": 2.8797395400900362e-06, "loss": 0.1775, "step": 778 }, { "epoch": 0.04483453237410072, "grad_norm": 0.9889999032020569, "learning_rate": 2.855273203671969e-06, "loss": 0.0765, "step": 779 }, { "epoch": 0.04489208633093525, "grad_norm": 14.821270942687988, "learning_rate": 2.830893923495173e-06, "loss": 0.5594, "step": 780 }, { "epoch": 0.044949640287769786, "grad_norm": 2.756847620010376, "learning_rate": 2.8066019966134907e-06, "loss": 0.0805, "step": 781 }, { "epoch": 0.045007194244604316, "grad_norm": 2.8205065727233887, "learning_rate": 2.7823977190163788e-06, "loss": 0.1263, "step": 782 }, { "epoch": 0.045064748201438846, "grad_norm": 12.99278736114502, "learning_rate": 2.7582813856253276e-06, "loss": 0.4152, "step": 783 }, { "epoch": 0.045122302158273384, "grad_norm": 2.810309410095215, "learning_rate": 2.7342532902902418e-06, "loss": 0.0943, "step": 784 }, { "epoch": 0.045179856115107914, "grad_norm": 1.1653300523757935, "learning_rate": 2.7103137257858867e-06, "loss": 0.0877, "step": 785 }, { "epoch": 0.045237410071942444, "grad_norm": 5.321506023406982, "learning_rate": 2.6864629838082957e-06, "loss": 0.1012, "step": 786 }, { "epoch": 0.04529496402877698, "grad_norm": 0.6735225915908813, "learning_rate": 2.6627013549712355e-06, "loss": 0.066, "step": 787 }, { "epoch": 0.04535251798561151, "grad_norm": 0.7565364241600037, "learning_rate": 2.639029128802657e-06, "loss": 0.0728, "step": 788 }, { "epoch": 0.04541007194244604, "grad_norm": 1.2969605922698975, "learning_rate": 2.615446593741161e-06, "loss": 0.0699, "step": 789 }, { "epoch": 0.04546762589928058, "grad_norm": 1.2415449619293213, "learning_rate": 2.5919540371325005e-06, "loss": 0.0802, "step": 790 }, { "epoch": 0.04552517985611511, "grad_norm": 3.416377067565918, "learning_rate": 2.5685517452260566e-06, "loss": 0.0328, "step": 791 }, { "epoch": 0.04558273381294964, "grad_norm": 3.912259340286255, "learning_rate": 2.5452400031713786e-06, "loss": 0.0989, "step": 792 }, { "epoch": 0.045640287769784176, "grad_norm": 4.516401767730713, "learning_rate": 2.522019095014683e-06, "loss": 0.1032, "step": 793 }, { "epoch": 0.045697841726618706, "grad_norm": 1.5598896741867065, "learning_rate": 2.4988893036954045e-06, "loss": 0.0709, "step": 794 }, { "epoch": 0.045755395683453236, "grad_norm": 4.611602306365967, "learning_rate": 2.4758509110427576e-06, "loss": 0.1643, "step": 795 }, { "epoch": 0.04581294964028777, "grad_norm": 5.493204593658447, "learning_rate": 2.45290419777228e-06, "loss": 0.0297, "step": 796 }, { "epoch": 0.0458705035971223, "grad_norm": 0.6497253775596619, "learning_rate": 2.4300494434824373e-06, "loss": 0.0486, "step": 797 }, { "epoch": 0.04592805755395683, "grad_norm": 1.2920628786087036, "learning_rate": 2.407286926651192e-06, "loss": 0.0787, "step": 798 }, { "epoch": 0.04598561151079137, "grad_norm": 2.440001964569092, "learning_rate": 2.3846169246326345e-06, "loss": 0.1111, "step": 799 }, { "epoch": 0.0460431654676259, "grad_norm": 1.9451384544372559, "learning_rate": 2.362039713653581e-06, "loss": 0.0868, "step": 800 }, { "epoch": 0.04610071942446043, "grad_norm": 2.6515021324157715, "learning_rate": 2.339555568810221e-06, "loss": 0.1016, "step": 801 }, { "epoch": 0.04615827338129497, "grad_norm": 2.210442304611206, "learning_rate": 2.317164764064769e-06, "loss": 0.0648, "step": 802 }, { "epoch": 0.0462158273381295, "grad_norm": 1.4524075984954834, "learning_rate": 2.2948675722421086e-06, "loss": 0.0752, "step": 803 }, { "epoch": 0.04627338129496403, "grad_norm": 16.323625564575195, "learning_rate": 2.27266426502649e-06, "loss": 0.2994, "step": 804 }, { "epoch": 0.04633093525179856, "grad_norm": 0.39062535762786865, "learning_rate": 2.2505551129582047e-06, "loss": 0.0519, "step": 805 }, { "epoch": 0.046388489208633095, "grad_norm": 1.672765851020813, "learning_rate": 2.2285403854302912e-06, "loss": 0.064, "step": 806 }, { "epoch": 0.046446043165467625, "grad_norm": 8.528423309326172, "learning_rate": 2.206620350685257e-06, "loss": 0.1726, "step": 807 }, { "epoch": 0.046503597122302155, "grad_norm": 2.00990629196167, "learning_rate": 2.1847952758118118e-06, "loss": 0.0689, "step": 808 }, { "epoch": 0.04656115107913669, "grad_norm": 1.1098554134368896, "learning_rate": 2.163065426741603e-06, "loss": 0.0746, "step": 809 }, { "epoch": 0.04661870503597122, "grad_norm": 6.2346014976501465, "learning_rate": 2.1414310682459805e-06, "loss": 0.128, "step": 810 }, { "epoch": 0.04667625899280575, "grad_norm": 20.44989776611328, "learning_rate": 2.119892463932781e-06, "loss": 0.2095, "step": 811 }, { "epoch": 0.04673381294964029, "grad_norm": 6.084557056427002, "learning_rate": 2.098449876243096e-06, "loss": 0.1533, "step": 812 }, { "epoch": 0.04679136690647482, "grad_norm": 18.029020309448242, "learning_rate": 2.0771035664480944e-06, "loss": 0.2013, "step": 813 }, { "epoch": 0.04684892086330935, "grad_norm": 2.0399179458618164, "learning_rate": 2.0558537946458177e-06, "loss": 0.0732, "step": 814 }, { "epoch": 0.04690647482014389, "grad_norm": 0.7458027005195618, "learning_rate": 2.0347008197580376e-06, "loss": 0.0815, "step": 815 }, { "epoch": 0.04696402877697842, "grad_norm": 1.1791083812713623, "learning_rate": 2.013644899527074e-06, "loss": 0.0646, "step": 816 }, { "epoch": 0.04702158273381295, "grad_norm": 1.5898396968841553, "learning_rate": 1.9926862905126663e-06, "loss": 0.0786, "step": 817 }, { "epoch": 0.047079136690647484, "grad_norm": 0.5349504947662354, "learning_rate": 1.9718252480888567e-06, "loss": 0.0573, "step": 818 }, { "epoch": 0.047136690647482014, "grad_norm": 1.0246250629425049, "learning_rate": 1.95106202644086e-06, "loss": 0.0941, "step": 819 }, { "epoch": 0.047194244604316545, "grad_norm": 2.1197612285614014, "learning_rate": 1.930396878561983e-06, "loss": 0.0905, "step": 820 }, { "epoch": 0.04725179856115108, "grad_norm": 4.380585670471191, "learning_rate": 1.9098300562505266e-06, "loss": 0.1047, "step": 821 }, { "epoch": 0.04730935251798561, "grad_norm": 0.4877808094024658, "learning_rate": 1.8893618101067357e-06, "loss": 0.0526, "step": 822 }, { "epoch": 0.04736690647482014, "grad_norm": 5.169129848480225, "learning_rate": 1.8689923895297247e-06, "loss": 0.1075, "step": 823 }, { "epoch": 0.04742446043165468, "grad_norm": 0.528556227684021, "learning_rate": 1.848722042714457e-06, "loss": 0.0662, "step": 824 }, { "epoch": 0.04748201438848921, "grad_norm": 1.4885525703430176, "learning_rate": 1.8285510166487154e-06, "loss": 0.0809, "step": 825 }, { "epoch": 0.04753956834532374, "grad_norm": 0.9235038161277771, "learning_rate": 1.808479557110081e-06, "loss": 0.049, "step": 826 }, { "epoch": 0.047597122302158276, "grad_norm": 3.030466079711914, "learning_rate": 1.7885079086629598e-06, "loss": 0.0674, "step": 827 }, { "epoch": 0.047654676258992806, "grad_norm": 0.3380258083343506, "learning_rate": 1.7686363146555807e-06, "loss": 0.0407, "step": 828 }, { "epoch": 0.047712230215827336, "grad_norm": 0.3373538553714752, "learning_rate": 1.7488650172170496e-06, "loss": 0.0557, "step": 829 }, { "epoch": 0.047769784172661874, "grad_norm": 8.12345027923584, "learning_rate": 1.7291942572543806e-06, "loss": 0.1242, "step": 830 }, { "epoch": 0.047827338129496404, "grad_norm": 18.61945915222168, "learning_rate": 1.709624274449584e-06, "loss": 0.1312, "step": 831 }, { "epoch": 0.047884892086330934, "grad_norm": 1.0393412113189697, "learning_rate": 1.6901553072567189e-06, "loss": 0.0669, "step": 832 }, { "epoch": 0.04794244604316547, "grad_norm": 0.405079185962677, "learning_rate": 1.6707875928990059e-06, "loss": 0.0455, "step": 833 }, { "epoch": 0.048, "grad_norm": 1.6890630722045898, "learning_rate": 1.651521367365936e-06, "loss": 0.0977, "step": 834 }, { "epoch": 0.04805755395683453, "grad_norm": 2.578721523284912, "learning_rate": 1.6323568654103838e-06, "loss": 0.0692, "step": 835 }, { "epoch": 0.04811510791366907, "grad_norm": 1.0500586032867432, "learning_rate": 1.6132943205457607e-06, "loss": 0.1101, "step": 836 }, { "epoch": 0.0481726618705036, "grad_norm": 25.174041748046875, "learning_rate": 1.5943339650431578e-06, "loss": 0.341, "step": 837 }, { "epoch": 0.04823021582733813, "grad_norm": 0.7596455216407776, "learning_rate": 1.5754760299285255e-06, "loss": 0.0793, "step": 838 }, { "epoch": 0.04828776978417266, "grad_norm": 0.9107263684272766, "learning_rate": 1.5567207449798517e-06, "loss": 0.079, "step": 839 }, { "epoch": 0.048345323741007196, "grad_norm": 15.38805866241455, "learning_rate": 1.538068338724361e-06, "loss": 0.4072, "step": 840 }, { "epoch": 0.048402877697841726, "grad_norm": 0.3658386766910553, "learning_rate": 1.5195190384357405e-06, "loss": 0.0568, "step": 841 }, { "epoch": 0.048460431654676256, "grad_norm": 5.78350305557251, "learning_rate": 1.5010730701313626e-06, "loss": 0.078, "step": 842 }, { "epoch": 0.04851798561151079, "grad_norm": 0.8679736256599426, "learning_rate": 1.4827306585695234e-06, "loss": 0.0967, "step": 843 }, { "epoch": 0.04857553956834532, "grad_norm": 11.486621856689453, "learning_rate": 1.4644920272467245e-06, "loss": 0.1852, "step": 844 }, { "epoch": 0.04863309352517985, "grad_norm": 7.43509578704834, "learning_rate": 1.446357398394934e-06, "loss": 0.1715, "step": 845 }, { "epoch": 0.04869064748201439, "grad_norm": 0.7901409864425659, "learning_rate": 1.4283269929788779e-06, "loss": 0.0405, "step": 846 }, { "epoch": 0.04874820143884892, "grad_norm": 1.712028980255127, "learning_rate": 1.4104010306933558e-06, "loss": 0.088, "step": 847 }, { "epoch": 0.04880575539568345, "grad_norm": 1.992058277130127, "learning_rate": 1.3925797299605649e-06, "loss": 0.1193, "step": 848 }, { "epoch": 0.04886330935251799, "grad_norm": 0.7998857498168945, "learning_rate": 1.3748633079274254e-06, "loss": 0.0473, "step": 849 }, { "epoch": 0.04892086330935252, "grad_norm": 0.4824025332927704, "learning_rate": 1.3572519804629537e-06, "loss": 0.0652, "step": 850 }, { "epoch": 0.04897841726618705, "grad_norm": 0.8086461424827576, "learning_rate": 1.339745962155613e-06, "loss": 0.0645, "step": 851 }, { "epoch": 0.049035971223021585, "grad_norm": 27.238941192626953, "learning_rate": 1.322345466310717e-06, "loss": 0.4208, "step": 852 }, { "epoch": 0.049093525179856115, "grad_norm": 1.820080041885376, "learning_rate": 1.30505070494781e-06, "loss": 0.1134, "step": 853 }, { "epoch": 0.049151079136690645, "grad_norm": 0.9083067178726196, "learning_rate": 1.2878618887981064e-06, "loss": 0.0782, "step": 854 }, { "epoch": 0.04920863309352518, "grad_norm": 0.7896227240562439, "learning_rate": 1.2707792273019049e-06, "loss": 0.0586, "step": 855 }, { "epoch": 0.04926618705035971, "grad_norm": 0.8508791923522949, "learning_rate": 1.2538029286060428e-06, "loss": 0.059, "step": 856 }, { "epoch": 0.04932374100719424, "grad_norm": 1.0554744005203247, "learning_rate": 1.2369331995613664e-06, "loss": 0.0841, "step": 857 }, { "epoch": 0.04938129496402878, "grad_norm": 1.7418127059936523, "learning_rate": 1.2201702457201948e-06, "loss": 0.0688, "step": 858 }, { "epoch": 0.04943884892086331, "grad_norm": 3.3948047161102295, "learning_rate": 1.2035142713338366e-06, "loss": 0.0954, "step": 859 }, { "epoch": 0.04949640287769784, "grad_norm": 1.1235921382904053, "learning_rate": 1.1869654793500784e-06, "loss": 0.0726, "step": 860 }, { "epoch": 0.04955395683453238, "grad_norm": 4.876044750213623, "learning_rate": 1.1705240714107301e-06, "loss": 0.1062, "step": 861 }, { "epoch": 0.04961151079136691, "grad_norm": 6.03566837310791, "learning_rate": 1.1541902478491607e-06, "loss": 0.1015, "step": 862 }, { "epoch": 0.04966906474820144, "grad_norm": 0.6971962451934814, "learning_rate": 1.1379642076878528e-06, "loss": 0.0514, "step": 863 }, { "epoch": 0.049726618705035974, "grad_norm": 11.001822471618652, "learning_rate": 1.1218461486359878e-06, "loss": 0.467, "step": 864 }, { "epoch": 0.049784172661870504, "grad_norm": 1.605398178100586, "learning_rate": 1.1058362670870248e-06, "loss": 0.1081, "step": 865 }, { "epoch": 0.049841726618705035, "grad_norm": 17.996734619140625, "learning_rate": 1.0899347581163222e-06, "loss": 0.3179, "step": 866 }, { "epoch": 0.04989928057553957, "grad_norm": 0.2730165123939514, "learning_rate": 1.0741418154787443e-06, "loss": 0.0261, "step": 867 }, { "epoch": 0.0499568345323741, "grad_norm": 5.960131645202637, "learning_rate": 1.058457631606319e-06, "loss": 0.1051, "step": 868 }, { "epoch": 0.05001438848920863, "grad_norm": 2.7779898643493652, "learning_rate": 1.042882397605871e-06, "loss": 0.0692, "step": 869 }, { "epoch": 0.05007194244604317, "grad_norm": 0.33091625571250916, "learning_rate": 1.0274163032567165e-06, "loss": 0.0534, "step": 870 }, { "epoch": 0.0501294964028777, "grad_norm": 0.6273438334465027, "learning_rate": 1.012059537008332e-06, "loss": 0.0617, "step": 871 }, { "epoch": 0.05018705035971223, "grad_norm": 0.8654516935348511, "learning_rate": 9.968122859780648e-07, "loss": 0.0568, "step": 872 }, { "epoch": 0.05024460431654676, "grad_norm": 13.922266960144043, "learning_rate": 9.816747359488632e-07, "loss": 0.1749, "step": 873 }, { "epoch": 0.050302158273381296, "grad_norm": 16.455718994140625, "learning_rate": 9.666470713669918e-07, "loss": 0.2171, "step": 874 }, { "epoch": 0.050359712230215826, "grad_norm": 11.806230545043945, "learning_rate": 9.517294753398066e-07, "loss": 0.25, "step": 875 }, { "epoch": 0.05041726618705036, "grad_norm": 0.3645191192626953, "learning_rate": 9.369221296335007e-07, "loss": 0.05, "step": 876 }, { "epoch": 0.050474820143884894, "grad_norm": 0.4936099648475647, "learning_rate": 9.222252146709143e-07, "loss": 0.0519, "step": 877 }, { "epoch": 0.050532374100719424, "grad_norm": 13.692974090576172, "learning_rate": 9.076389095293148e-07, "loss": 0.2554, "step": 878 }, { "epoch": 0.050589928057553954, "grad_norm": 0.6284224987030029, "learning_rate": 8.931633919382299e-07, "loss": 0.0659, "step": 879 }, { "epoch": 0.05064748201438849, "grad_norm": 0.7923092842102051, "learning_rate": 8.787988382772705e-07, "loss": 0.0568, "step": 880 }, { "epoch": 0.05070503597122302, "grad_norm": 22.334707260131836, "learning_rate": 8.645454235739903e-07, "loss": 0.2308, "step": 881 }, { "epoch": 0.05076258992805755, "grad_norm": 4.818454742431641, "learning_rate": 8.504033215017527e-07, "loss": 0.1232, "step": 882 }, { "epoch": 0.05082014388489209, "grad_norm": 1.418079137802124, "learning_rate": 8.363727043776037e-07, "loss": 0.0688, "step": 883 }, { "epoch": 0.05087769784172662, "grad_norm": 0.7452939748764038, "learning_rate": 8.224537431601886e-07, "loss": 0.0562, "step": 884 }, { "epoch": 0.05093525179856115, "grad_norm": 1.132005214691162, "learning_rate": 8.086466074476562e-07, "loss": 0.0804, "step": 885 }, { "epoch": 0.050992805755395686, "grad_norm": 1.1662274599075317, "learning_rate": 7.949514654755963e-07, "loss": 0.0778, "step": 886 }, { "epoch": 0.051050359712230216, "grad_norm": 0.6592736840248108, "learning_rate": 7.81368484114996e-07, "loss": 0.0662, "step": 887 }, { "epoch": 0.051107913669064746, "grad_norm": 9.107586860656738, "learning_rate": 7.678978288701911e-07, "loss": 0.1515, "step": 888 }, { "epoch": 0.05116546762589928, "grad_norm": 11.478974342346191, "learning_rate": 7.545396638768698e-07, "loss": 0.496, "step": 889 }, { "epoch": 0.05122302158273381, "grad_norm": 0.9263907670974731, "learning_rate": 7.412941519000527e-07, "loss": 0.0756, "step": 890 }, { "epoch": 0.05128057553956834, "grad_norm": 20.043466567993164, "learning_rate": 7.281614543321269e-07, "loss": 0.5612, "step": 891 }, { "epoch": 0.05133812949640288, "grad_norm": 9.131746292114258, "learning_rate": 7.151417311908648e-07, "loss": 0.1701, "step": 892 }, { "epoch": 0.05139568345323741, "grad_norm": 2.0880978107452393, "learning_rate": 7.022351411174866e-07, "loss": 0.1408, "step": 893 }, { "epoch": 0.05145323741007194, "grad_norm": 0.7419898509979248, "learning_rate": 6.894418413747183e-07, "loss": 0.0609, "step": 894 }, { "epoch": 0.05151079136690648, "grad_norm": 18.692054748535156, "learning_rate": 6.767619878448783e-07, "loss": 0.3365, "step": 895 }, { "epoch": 0.05156834532374101, "grad_norm": 1.1873613595962524, "learning_rate": 6.641957350279838e-07, "loss": 0.0662, "step": 896 }, { "epoch": 0.05162589928057554, "grad_norm": 1.0309622287750244, "learning_rate": 6.517432360398556e-07, "loss": 0.062, "step": 897 }, { "epoch": 0.051683453237410075, "grad_norm": 1.502915620803833, "learning_rate": 6.394046426102673e-07, "loss": 0.0679, "step": 898 }, { "epoch": 0.051741007194244605, "grad_norm": 14.485411643981934, "learning_rate": 6.271801050810856e-07, "loss": 0.2849, "step": 899 }, { "epoch": 0.051798561151079135, "grad_norm": 0.6515581607818604, "learning_rate": 6.150697724044407e-07, "loss": 0.0497, "step": 900 }, { "epoch": 0.05185611510791367, "grad_norm": 1.2064189910888672, "learning_rate": 6.030737921409169e-07, "loss": 0.0801, "step": 901 }, { "epoch": 0.0519136690647482, "grad_norm": 9.05174446105957, "learning_rate": 5.911923104577455e-07, "loss": 0.1349, "step": 902 }, { "epoch": 0.05197122302158273, "grad_norm": 0.8396304845809937, "learning_rate": 5.794254721270331e-07, "loss": 0.0493, "step": 903 }, { "epoch": 0.05202877697841727, "grad_norm": 22.91259002685547, "learning_rate": 5.677734205239904e-07, "loss": 0.4679, "step": 904 }, { "epoch": 0.0520863309352518, "grad_norm": 0.31920236349105835, "learning_rate": 5.562362976251901e-07, "loss": 0.0485, "step": 905 }, { "epoch": 0.05214388489208633, "grad_norm": 0.7108873724937439, "learning_rate": 5.448142440068316e-07, "loss": 0.0509, "step": 906 }, { "epoch": 0.05220143884892086, "grad_norm": 1.8079339265823364, "learning_rate": 5.335073988430373e-07, "loss": 0.0969, "step": 907 }, { "epoch": 0.0522589928057554, "grad_norm": 10.256385803222656, "learning_rate": 5.223158999041444e-07, "loss": 0.2056, "step": 908 }, { "epoch": 0.05231654676258993, "grad_norm": 0.8384624123573303, "learning_rate": 5.112398835550348e-07, "loss": 0.0781, "step": 909 }, { "epoch": 0.05237410071942446, "grad_norm": 0.9950568079948425, "learning_rate": 5.002794847534765e-07, "loss": 0.0569, "step": 910 }, { "epoch": 0.052431654676258994, "grad_norm": 0.7972750067710876, "learning_rate": 4.894348370484648e-07, "loss": 0.072, "step": 911 }, { "epoch": 0.052489208633093525, "grad_norm": 8.564375877380371, "learning_rate": 4.787060725786141e-07, "loss": 0.1262, "step": 912 }, { "epoch": 0.052546762589928055, "grad_norm": 0.7976115942001343, "learning_rate": 4.6809332207053083e-07, "loss": 0.0669, "step": 913 }, { "epoch": 0.05260431654676259, "grad_norm": 0.6580243706703186, "learning_rate": 4.575967148372318e-07, "loss": 0.071, "step": 914 }, { "epoch": 0.05266187050359712, "grad_norm": 0.8931045532226562, "learning_rate": 4.4721637877656377e-07, "loss": 0.0581, "step": 915 }, { "epoch": 0.05271942446043165, "grad_norm": 0.581693172454834, "learning_rate": 4.3695244036964567e-07, "loss": 0.0569, "step": 916 }, { "epoch": 0.05277697841726619, "grad_norm": 14.89986801147461, "learning_rate": 4.268050246793276e-07, "loss": 0.3469, "step": 917 }, { "epoch": 0.05283453237410072, "grad_norm": 4.767395496368408, "learning_rate": 4.167742553486676e-07, "loss": 0.0892, "step": 918 }, { "epoch": 0.05289208633093525, "grad_norm": 1.3745571374893188, "learning_rate": 4.068602545994249e-07, "loss": 0.0874, "step": 919 }, { "epoch": 0.052949640287769786, "grad_norm": 7.833735466003418, "learning_rate": 3.9706314323056936e-07, "loss": 0.1356, "step": 920 }, { "epoch": 0.053007194244604317, "grad_norm": 1.1918085813522339, "learning_rate": 3.8738304061681107e-07, "loss": 0.0653, "step": 921 }, { "epoch": 0.05306474820143885, "grad_norm": 0.6890537142753601, "learning_rate": 3.7782006470714614e-07, "loss": 0.0692, "step": 922 }, { "epoch": 0.053122302158273384, "grad_norm": 0.6393564939498901, "learning_rate": 3.68374332023419e-07, "loss": 0.0206, "step": 923 }, { "epoch": 0.053179856115107914, "grad_norm": 1.7775517702102661, "learning_rate": 3.590459576589e-07, "loss": 0.1072, "step": 924 }, { "epoch": 0.053237410071942444, "grad_norm": 0.5203324556350708, "learning_rate": 3.498350552768859e-07, "loss": 0.0651, "step": 925 }, { "epoch": 0.05329496402877698, "grad_norm": 0.6219776272773743, "learning_rate": 3.4074173710931804e-07, "loss": 0.0653, "step": 926 }, { "epoch": 0.05335251798561151, "grad_norm": 1.8375036716461182, "learning_rate": 3.3176611395540625e-07, "loss": 0.0614, "step": 927 }, { "epoch": 0.05341007194244604, "grad_norm": 2.563394784927368, "learning_rate": 3.2290829518028867e-07, "loss": 0.1311, "step": 928 }, { "epoch": 0.05346762589928058, "grad_norm": 17.644411087036133, "learning_rate": 3.1416838871368925e-07, "loss": 0.3422, "step": 929 }, { "epoch": 0.05352517985611511, "grad_norm": 0.7539731860160828, "learning_rate": 3.0554650104861137e-07, "loss": 0.0837, "step": 930 }, { "epoch": 0.05358273381294964, "grad_norm": 5.77402400970459, "learning_rate": 2.970427372400353e-07, "loss": 0.1455, "step": 931 }, { "epoch": 0.053640287769784176, "grad_norm": 1.0641944408416748, "learning_rate": 2.8865720090364037e-07, "loss": 0.1101, "step": 932 }, { "epoch": 0.053697841726618706, "grad_norm": 1.1459119319915771, "learning_rate": 2.8038999421453827e-07, "loss": 0.0608, "step": 933 }, { "epoch": 0.053755395683453236, "grad_norm": 12.172619819641113, "learning_rate": 2.7224121790603517e-07, "loss": 0.3217, "step": 934 }, { "epoch": 0.05381294964028777, "grad_norm": 11.910560607910156, "learning_rate": 2.6421097126839714e-07, "loss": 0.5355, "step": 935 }, { "epoch": 0.0538705035971223, "grad_norm": 2.0426604747772217, "learning_rate": 2.5629935214764866e-07, "loss": 0.1398, "step": 936 }, { "epoch": 0.05392805755395683, "grad_norm": 1.1188652515411377, "learning_rate": 2.4850645694436736e-07, "loss": 0.1328, "step": 937 }, { "epoch": 0.05398561151079137, "grad_norm": 2.1541991233825684, "learning_rate": 2.4083238061252565e-07, "loss": 0.1437, "step": 938 }, { "epoch": 0.0540431654676259, "grad_norm": 0.8203990459442139, "learning_rate": 2.332772166583208e-07, "loss": 0.0617, "step": 939 }, { "epoch": 0.05410071942446043, "grad_norm": 14.203892707824707, "learning_rate": 2.2584105713904126e-07, "loss": 0.151, "step": 940 }, { "epoch": 0.05415827338129496, "grad_norm": 0.3919861912727356, "learning_rate": 2.1852399266194312e-07, "loss": 0.0057, "step": 941 }, { "epoch": 0.0542158273381295, "grad_norm": 3.8022513389587402, "learning_rate": 2.1132611238315004e-07, "loss": 0.118, "step": 942 }, { "epoch": 0.05427338129496403, "grad_norm": 15.274145126342773, "learning_rate": 2.0424750400655947e-07, "loss": 0.2713, "step": 943 }, { "epoch": 0.05433093525179856, "grad_norm": 1.0683073997497559, "learning_rate": 1.9728825378278248e-07, "loss": 0.0581, "step": 944 }, { "epoch": 0.054388489208633095, "grad_norm": 13.709763526916504, "learning_rate": 1.9044844650808468e-07, "loss": 0.2315, "step": 945 }, { "epoch": 0.054446043165467625, "grad_norm": 2.2819840908050537, "learning_rate": 1.8372816552336025e-07, "loss": 0.1143, "step": 946 }, { "epoch": 0.054503597122302155, "grad_norm": 1.1235344409942627, "learning_rate": 1.7712749271311392e-07, "loss": 0.0717, "step": 947 }, { "epoch": 0.05456115107913669, "grad_norm": 1.0503443479537964, "learning_rate": 1.706465085044584e-07, "loss": 0.0961, "step": 948 }, { "epoch": 0.05461870503597122, "grad_norm": 0.7241622805595398, "learning_rate": 1.6428529186614195e-07, "loss": 0.078, "step": 949 }, { "epoch": 0.05467625899280575, "grad_norm": 1.2095136642456055, "learning_rate": 1.580439203075812e-07, "loss": 0.0911, "step": 950 }, { "epoch": 0.05473381294964029, "grad_norm": 4.268467426300049, "learning_rate": 1.519224698779198e-07, "loss": 0.0737, "step": 951 }, { "epoch": 0.05479136690647482, "grad_norm": 0.36049625277519226, "learning_rate": 1.4592101516509916e-07, "loss": 0.0585, "step": 952 }, { "epoch": 0.05484892086330935, "grad_norm": 2.7988123893737793, "learning_rate": 1.400396292949513e-07, "loss": 0.0986, "step": 953 }, { "epoch": 0.05490647482014389, "grad_norm": 16.90985870361328, "learning_rate": 1.3427838393030634e-07, "loss": 0.5819, "step": 954 }, { "epoch": 0.05496402877697842, "grad_norm": 0.7780059576034546, "learning_rate": 1.2863734927012094e-07, "loss": 0.0765, "step": 955 }, { "epoch": 0.05502158273381295, "grad_norm": 1.1509917974472046, "learning_rate": 1.231165940486234e-07, "loss": 0.0709, "step": 956 }, { "epoch": 0.055079136690647484, "grad_norm": 1.6221928596496582, "learning_rate": 1.1771618553447217e-07, "loss": 0.0934, "step": 957 }, { "epoch": 0.055136690647482015, "grad_norm": 1.4992139339447021, "learning_rate": 1.1243618952994195e-07, "loss": 0.0649, "step": 958 }, { "epoch": 0.055194244604316545, "grad_norm": 19.954071044921875, "learning_rate": 1.0727667037011668e-07, "loss": 0.4697, "step": 959 }, { "epoch": 0.05525179856115108, "grad_norm": 1.0766866207122803, "learning_rate": 1.0223769092211012e-07, "loss": 0.0877, "step": 960 }, { "epoch": 0.05530935251798561, "grad_norm": 5.830136299133301, "learning_rate": 9.731931258429638e-08, "loss": 0.1061, "step": 961 }, { "epoch": 0.05536690647482014, "grad_norm": 0.7689708471298218, "learning_rate": 9.252159528556404e-08, "loss": 0.0681, "step": 962 }, { "epoch": 0.05542446043165468, "grad_norm": 2.8470983505249023, "learning_rate": 8.784459748458318e-08, "loss": 0.1102, "step": 963 }, { "epoch": 0.05548201438848921, "grad_norm": 13.154084205627441, "learning_rate": 8.328837616909612e-08, "loss": 0.3053, "step": 964 }, { "epoch": 0.05553956834532374, "grad_norm": 3.423447370529175, "learning_rate": 7.885298685522235e-08, "loss": 0.071, "step": 965 }, { "epoch": 0.055597122302158276, "grad_norm": 0.8389829397201538, "learning_rate": 7.453848358678018e-08, "loss": 0.0703, "step": 966 }, { "epoch": 0.055654676258992807, "grad_norm": 1.1691104173660278, "learning_rate": 7.034491893463059e-08, "loss": 0.0365, "step": 967 }, { "epoch": 0.05571223021582734, "grad_norm": 0.6704636216163635, "learning_rate": 6.627234399603554e-08, "loss": 0.0514, "step": 968 }, { "epoch": 0.055769784172661874, "grad_norm": 1.450444221496582, "learning_rate": 6.232080839403631e-08, "loss": 0.0943, "step": 969 }, { "epoch": 0.055827338129496404, "grad_norm": 1.0474416017532349, "learning_rate": 5.849036027684607e-08, "loss": 0.1006, "step": 970 }, { "epoch": 0.055884892086330934, "grad_norm": 0.4784161150455475, "learning_rate": 5.4781046317267103e-08, "loss": 0.0408, "step": 971 }, { "epoch": 0.05594244604316547, "grad_norm": 16.443510055541992, "learning_rate": 5.119291171211793e-08, "loss": 0.1611, "step": 972 }, { "epoch": 0.056, "grad_norm": 0.6649782657623291, "learning_rate": 4.772600018168816e-08, "loss": 0.0723, "step": 973 }, { "epoch": 0.05605755395683453, "grad_norm": 11.152069091796875, "learning_rate": 4.438035396920004e-08, "loss": 0.7074, "step": 974 }, { "epoch": 0.05611510791366906, "grad_norm": 0.5316632390022278, "learning_rate": 4.115601384029666e-08, "loss": 0.0642, "step": 975 }, { "epoch": 0.0561726618705036, "grad_norm": 2.629650354385376, "learning_rate": 3.805301908254455e-08, "loss": 0.1437, "step": 976 }, { "epoch": 0.05623021582733813, "grad_norm": 1.2182719707489014, "learning_rate": 3.50714075049563e-08, "loss": 0.094, "step": 977 }, { "epoch": 0.05628776978417266, "grad_norm": 1.433416485786438, "learning_rate": 3.22112154375287e-08, "loss": 0.1248, "step": 978 }, { "epoch": 0.056345323741007196, "grad_norm": 0.4791356027126312, "learning_rate": 2.947247773079753e-08, "loss": 0.0573, "step": 979 }, { "epoch": 0.056402877697841726, "grad_norm": 0.43710970878601074, "learning_rate": 2.6855227755419046e-08, "loss": 0.0632, "step": 980 }, { "epoch": 0.056460431654676256, "grad_norm": 1.2036529779434204, "learning_rate": 2.4359497401758026e-08, "loss": 0.0758, "step": 981 }, { "epoch": 0.05651798561151079, "grad_norm": 1.0879039764404297, "learning_rate": 2.1985317079500358e-08, "loss": 0.0864, "step": 982 }, { "epoch": 0.05657553956834532, "grad_norm": 2.7998852729797363, "learning_rate": 1.973271571728441e-08, "loss": 0.0904, "step": 983 }, { "epoch": 0.05663309352517985, "grad_norm": 0.8842798471450806, "learning_rate": 1.7601720762346895e-08, "loss": 0.072, "step": 984 }, { "epoch": 0.05669064748201439, "grad_norm": 30.26481819152832, "learning_rate": 1.5592358180189782e-08, "loss": 0.3083, "step": 985 }, { "epoch": 0.05674820143884892, "grad_norm": 1.791191577911377, "learning_rate": 1.370465245426167e-08, "loss": 0.0949, "step": 986 }, { "epoch": 0.05680575539568345, "grad_norm": 1.102792501449585, "learning_rate": 1.1938626585660252e-08, "loss": 0.078, "step": 987 }, { "epoch": 0.05686330935251799, "grad_norm": 1.305518627166748, "learning_rate": 1.0294302092853647e-08, "loss": 0.1071, "step": 988 }, { "epoch": 0.05692086330935252, "grad_norm": 1.006156325340271, "learning_rate": 8.771699011416169e-09, "loss": 0.0736, "step": 989 }, { "epoch": 0.05697841726618705, "grad_norm": 0.8825615048408508, "learning_rate": 7.370835893788508e-09, "loss": 0.0776, "step": 990 }, { "epoch": 0.057035971223021585, "grad_norm": 1.2495946884155273, "learning_rate": 6.091729809042379e-09, "loss": 0.0883, "step": 991 }, { "epoch": 0.057093525179856115, "grad_norm": 4.397529125213623, "learning_rate": 4.9343963426840006e-09, "loss": 0.1277, "step": 992 }, { "epoch": 0.057151079136690645, "grad_norm": 20.961580276489258, "learning_rate": 3.898849596456477e-09, "loss": 0.3655, "step": 993 }, { "epoch": 0.05720863309352518, "grad_norm": 0.7759275436401367, "learning_rate": 2.9851021881688314e-09, "loss": 0.0783, "step": 994 }, { "epoch": 0.05726618705035971, "grad_norm": 0.40887534618377686, "learning_rate": 2.193165251545004e-09, "loss": 0.059, "step": 995 }, { "epoch": 0.05732374100719424, "grad_norm": 1.2675024271011353, "learning_rate": 1.5230484360873043e-09, "loss": 0.0846, "step": 996 }, { "epoch": 0.05738129496402878, "grad_norm": 0.7930789589881897, "learning_rate": 9.74759906957612e-10, "loss": 0.0724, "step": 997 }, { "epoch": 0.05743884892086331, "grad_norm": 13.144028663635254, "learning_rate": 5.483063448785686e-10, "loss": 0.2582, "step": 998 }, { "epoch": 0.05749640287769784, "grad_norm": 2.8434152603149414, "learning_rate": 2.436929460525317e-10, "loss": 0.0955, "step": 999 }, { "epoch": 0.05755395683453238, "grad_norm": 0.9370110630989075, "learning_rate": 6.092342209607083e-11, "loss": 0.0814, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }