diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,78371 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999329863515536, + "eval_steps": 500, + "global_step": 11191, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.9351531261867e-05, + "grad_norm": 0.7860376238822937, + "learning_rate": 2.976190476190476e-07, + "loss": 1.8548, + "step": 1 + }, + { + "epoch": 0.000178703062523734, + "grad_norm": 0.5748365521430969, + "learning_rate": 5.952380952380952e-07, + "loss": 1.8254, + "step": 2 + }, + { + "epoch": 0.000268054593785601, + "grad_norm": 0.8621116876602173, + "learning_rate": 8.928571428571428e-07, + "loss": 1.8688, + "step": 3 + }, + { + "epoch": 0.000357406125047468, + "grad_norm": 0.6630846261978149, + "learning_rate": 1.1904761904761904e-06, + "loss": 1.8504, + "step": 4 + }, + { + "epoch": 0.000446757656309335, + "grad_norm": 0.7190170884132385, + "learning_rate": 1.4880952380952381e-06, + "loss": 1.8197, + "step": 5 + }, + { + "epoch": 0.000536109187571202, + "grad_norm": 0.7450002431869507, + "learning_rate": 1.7857142857142857e-06, + "loss": 1.8078, + "step": 6 + }, + { + "epoch": 0.000625460718833069, + "grad_norm": 0.903852641582489, + "learning_rate": 2.0833333333333334e-06, + "loss": 1.9407, + "step": 7 + }, + { + "epoch": 0.000714812250094936, + "grad_norm": 0.8060555458068848, + "learning_rate": 2.3809523809523808e-06, + "loss": 1.7434, + "step": 8 + }, + { + "epoch": 0.000804163781356803, + "grad_norm": 1.1107157468795776, + "learning_rate": 2.6785714285714285e-06, + "loss": 1.9593, + "step": 9 + }, + { + "epoch": 0.00089351531261867, + "grad_norm": 0.7935706973075867, + "learning_rate": 2.9761904761904763e-06, + "loss": 1.8844, + "step": 10 + }, + { + "epoch": 0.000982866843880537, + "grad_norm": 0.825862467288971, + "learning_rate": 3.273809523809524e-06, + "loss": 1.8664, + "step": 11 + }, + { + "epoch": 0.001072218375142404, + "grad_norm": 0.8665326833724976, + "learning_rate": 3.5714285714285714e-06, + "loss": 1.8477, + "step": 12 + }, + { + "epoch": 0.001161569906404271, + "grad_norm": 0.7275350689888, + "learning_rate": 3.869047619047619e-06, + "loss": 1.862, + "step": 13 + }, + { + "epoch": 0.001250921437666138, + "grad_norm": 0.9508900046348572, + "learning_rate": 4.166666666666667e-06, + "loss": 1.8362, + "step": 14 + }, + { + "epoch": 0.001340272968928005, + "grad_norm": 0.9345436096191406, + "learning_rate": 4.464285714285715e-06, + "loss": 1.9073, + "step": 15 + }, + { + "epoch": 0.001429624500189872, + "grad_norm": 0.8254266977310181, + "learning_rate": 4.7619047619047615e-06, + "loss": 1.8047, + "step": 16 + }, + { + "epoch": 0.001518976031451739, + "grad_norm": 0.8926609754562378, + "learning_rate": 5.05952380952381e-06, + "loss": 1.8944, + "step": 17 + }, + { + "epoch": 0.001608327562713606, + "grad_norm": 0.8045806288719177, + "learning_rate": 5.357142857142857e-06, + "loss": 1.8269, + "step": 18 + }, + { + "epoch": 0.001697679093975473, + "grad_norm": 0.7670357823371887, + "learning_rate": 5.654761904761905e-06, + "loss": 1.7618, + "step": 19 + }, + { + "epoch": 0.00178703062523734, + "grad_norm": 0.8601244688034058, + "learning_rate": 5.9523809523809525e-06, + "loss": 1.8391, + "step": 20 + }, + { + "epoch": 0.001876382156499207, + "grad_norm": 0.9491680264472961, + "learning_rate": 6.25e-06, + "loss": 1.8631, + "step": 21 + }, + { + "epoch": 0.001965733687761074, + "grad_norm": 0.8884113430976868, + "learning_rate": 6.547619047619048e-06, + "loss": 1.786, + "step": 22 + }, + { + "epoch": 0.002055085219022941, + "grad_norm": 0.8622804880142212, + "learning_rate": 6.845238095238096e-06, + "loss": 1.7653, + "step": 23 + }, + { + "epoch": 0.002144436750284808, + "grad_norm": 0.8542262315750122, + "learning_rate": 7.142857142857143e-06, + "loss": 1.7977, + "step": 24 + }, + { + "epoch": 0.002233788281546675, + "grad_norm": 0.6878840923309326, + "learning_rate": 7.4404761904761905e-06, + "loss": 1.7566, + "step": 25 + }, + { + "epoch": 0.002323139812808542, + "grad_norm": 1.2377084493637085, + "learning_rate": 7.738095238095238e-06, + "loss": 1.7302, + "step": 26 + }, + { + "epoch": 0.002412491344070409, + "grad_norm": 0.9201034903526306, + "learning_rate": 8.035714285714286e-06, + "loss": 1.7652, + "step": 27 + }, + { + "epoch": 0.002501842875332276, + "grad_norm": 0.9445549249649048, + "learning_rate": 8.333333333333334e-06, + "loss": 1.7671, + "step": 28 + }, + { + "epoch": 0.002591194406594143, + "grad_norm": 0.7296178340911865, + "learning_rate": 8.630952380952381e-06, + "loss": 1.6999, + "step": 29 + }, + { + "epoch": 0.00268054593785601, + "grad_norm": 0.6263100504875183, + "learning_rate": 8.92857142857143e-06, + "loss": 1.7825, + "step": 30 + }, + { + "epoch": 0.002769897469117877, + "grad_norm": 0.5757091045379639, + "learning_rate": 9.226190476190477e-06, + "loss": 1.6878, + "step": 31 + }, + { + "epoch": 0.002859249000379744, + "grad_norm": 0.5896580219268799, + "learning_rate": 9.523809523809523e-06, + "loss": 1.658, + "step": 32 + }, + { + "epoch": 0.002948600531641611, + "grad_norm": 0.4904349446296692, + "learning_rate": 9.821428571428573e-06, + "loss": 1.6135, + "step": 33 + }, + { + "epoch": 0.003037952062903478, + "grad_norm": 0.628957211971283, + "learning_rate": 1.011904761904762e-05, + "loss": 1.617, + "step": 34 + }, + { + "epoch": 0.003127303594165345, + "grad_norm": 0.5395067930221558, + "learning_rate": 1.0416666666666668e-05, + "loss": 1.6439, + "step": 35 + }, + { + "epoch": 0.003216655125427212, + "grad_norm": 0.4463937282562256, + "learning_rate": 1.0714285714285714e-05, + "loss": 1.5962, + "step": 36 + }, + { + "epoch": 0.003306006656689079, + "grad_norm": 0.4546874463558197, + "learning_rate": 1.1011904761904762e-05, + "loss": 1.6277, + "step": 37 + }, + { + "epoch": 0.003395358187950946, + "grad_norm": 0.45821845531463623, + "learning_rate": 1.130952380952381e-05, + "loss": 1.6352, + "step": 38 + }, + { + "epoch": 0.003484709719212813, + "grad_norm": 0.59076327085495, + "learning_rate": 1.1607142857142857e-05, + "loss": 1.6098, + "step": 39 + }, + { + "epoch": 0.00357406125047468, + "grad_norm": 0.39401739835739136, + "learning_rate": 1.1904761904761905e-05, + "loss": 1.6404, + "step": 40 + }, + { + "epoch": 0.003663412781736547, + "grad_norm": 0.3501727879047394, + "learning_rate": 1.2202380952380953e-05, + "loss": 1.6279, + "step": 41 + }, + { + "epoch": 0.003752764312998414, + "grad_norm": 0.3471863269805908, + "learning_rate": 1.25e-05, + "loss": 1.5823, + "step": 42 + }, + { + "epoch": 0.003842115844260281, + "grad_norm": 0.3801725506782532, + "learning_rate": 1.2797619047619047e-05, + "loss": 1.5969, + "step": 43 + }, + { + "epoch": 0.003931467375522148, + "grad_norm": 0.3517632782459259, + "learning_rate": 1.3095238095238096e-05, + "loss": 1.6075, + "step": 44 + }, + { + "epoch": 0.004020818906784015, + "grad_norm": 0.32643967866897583, + "learning_rate": 1.3392857142857144e-05, + "loss": 1.537, + "step": 45 + }, + { + "epoch": 0.004110170438045882, + "grad_norm": 0.36205053329467773, + "learning_rate": 1.3690476190476192e-05, + "loss": 1.5646, + "step": 46 + }, + { + "epoch": 0.004199521969307749, + "grad_norm": 0.3534471094608307, + "learning_rate": 1.398809523809524e-05, + "loss": 1.5713, + "step": 47 + }, + { + "epoch": 0.004288873500569616, + "grad_norm": 0.349733829498291, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.5545, + "step": 48 + }, + { + "epoch": 0.004378225031831483, + "grad_norm": 0.3604637682437897, + "learning_rate": 1.4583333333333335e-05, + "loss": 1.569, + "step": 49 + }, + { + "epoch": 0.00446757656309335, + "grad_norm": 0.3466920256614685, + "learning_rate": 1.4880952380952381e-05, + "loss": 1.571, + "step": 50 + }, + { + "epoch": 0.004556928094355217, + "grad_norm": 0.3057480752468109, + "learning_rate": 1.5178571428571429e-05, + "loss": 1.4928, + "step": 51 + }, + { + "epoch": 0.004646279625617084, + "grad_norm": 0.3168238401412964, + "learning_rate": 1.5476190476190476e-05, + "loss": 1.5952, + "step": 52 + }, + { + "epoch": 0.004735631156878951, + "grad_norm": 0.33002302050590515, + "learning_rate": 1.5773809523809524e-05, + "loss": 1.5885, + "step": 53 + }, + { + "epoch": 0.004824982688140818, + "grad_norm": 0.35049816966056824, + "learning_rate": 1.6071428571428572e-05, + "loss": 1.5679, + "step": 54 + }, + { + "epoch": 0.004914334219402685, + "grad_norm": 0.398250937461853, + "learning_rate": 1.636904761904762e-05, + "loss": 1.4671, + "step": 55 + }, + { + "epoch": 0.005003685750664552, + "grad_norm": 0.34120944142341614, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.5189, + "step": 56 + }, + { + "epoch": 0.005093037281926419, + "grad_norm": 0.3141593635082245, + "learning_rate": 1.6964285714285715e-05, + "loss": 1.5387, + "step": 57 + }, + { + "epoch": 0.005182388813188286, + "grad_norm": 0.2942512035369873, + "learning_rate": 1.7261904761904763e-05, + "loss": 1.5823, + "step": 58 + }, + { + "epoch": 0.005271740344450153, + "grad_norm": 0.3249220550060272, + "learning_rate": 1.755952380952381e-05, + "loss": 1.4426, + "step": 59 + }, + { + "epoch": 0.00536109187571202, + "grad_norm": 0.3305152356624603, + "learning_rate": 1.785714285714286e-05, + "loss": 1.4518, + "step": 60 + }, + { + "epoch": 0.005450443406973887, + "grad_norm": 0.31893137097358704, + "learning_rate": 1.8154761904761906e-05, + "loss": 1.4131, + "step": 61 + }, + { + "epoch": 0.005539794938235754, + "grad_norm": 0.35763436555862427, + "learning_rate": 1.8452380952380954e-05, + "loss": 1.4497, + "step": 62 + }, + { + "epoch": 0.005629146469497621, + "grad_norm": 0.34308096766471863, + "learning_rate": 1.8750000000000002e-05, + "loss": 1.445, + "step": 63 + }, + { + "epoch": 0.005718498000759488, + "grad_norm": 0.3230386972427368, + "learning_rate": 1.9047619047619046e-05, + "loss": 1.4297, + "step": 64 + }, + { + "epoch": 0.005807849532021355, + "grad_norm": 0.3689243495464325, + "learning_rate": 1.9345238095238097e-05, + "loss": 1.4096, + "step": 65 + }, + { + "epoch": 0.005897201063283222, + "grad_norm": 0.2907058894634247, + "learning_rate": 1.9642857142857145e-05, + "loss": 1.49, + "step": 66 + }, + { + "epoch": 0.005986552594545089, + "grad_norm": 0.3283701539039612, + "learning_rate": 1.9940476190476193e-05, + "loss": 1.4467, + "step": 67 + }, + { + "epoch": 0.006075904125806956, + "grad_norm": 0.29510149359703064, + "learning_rate": 2.023809523809524e-05, + "loss": 1.4465, + "step": 68 + }, + { + "epoch": 0.006165255657068823, + "grad_norm": 0.3028480112552643, + "learning_rate": 2.0535714285714285e-05, + "loss": 1.5013, + "step": 69 + }, + { + "epoch": 0.00625460718833069, + "grad_norm": 0.30100324749946594, + "learning_rate": 2.0833333333333336e-05, + "loss": 1.3521, + "step": 70 + }, + { + "epoch": 0.006343958719592557, + "grad_norm": 0.2899802029132843, + "learning_rate": 2.113095238095238e-05, + "loss": 1.4652, + "step": 71 + }, + { + "epoch": 0.006433310250854424, + "grad_norm": 0.37080496549606323, + "learning_rate": 2.1428571428571428e-05, + "loss": 1.3875, + "step": 72 + }, + { + "epoch": 0.006522661782116291, + "grad_norm": 0.3363150656223297, + "learning_rate": 2.172619047619048e-05, + "loss": 1.355, + "step": 73 + }, + { + "epoch": 0.006612013313378158, + "grad_norm": 0.3124528229236603, + "learning_rate": 2.2023809523809524e-05, + "loss": 1.3817, + "step": 74 + }, + { + "epoch": 0.006701364844640025, + "grad_norm": 0.293714702129364, + "learning_rate": 2.2321428571428575e-05, + "loss": 1.3825, + "step": 75 + }, + { + "epoch": 0.006790716375901892, + "grad_norm": 0.3099130690097809, + "learning_rate": 2.261904761904762e-05, + "loss": 1.4276, + "step": 76 + }, + { + "epoch": 0.006880067907163759, + "grad_norm": 0.32042503356933594, + "learning_rate": 2.2916666666666667e-05, + "loss": 1.4544, + "step": 77 + }, + { + "epoch": 0.006969419438425626, + "grad_norm": 0.3349694311618805, + "learning_rate": 2.3214285714285715e-05, + "loss": 1.3263, + "step": 78 + }, + { + "epoch": 0.007058770969687493, + "grad_norm": 0.3454136848449707, + "learning_rate": 2.3511904761904762e-05, + "loss": 1.3386, + "step": 79 + }, + { + "epoch": 0.00714812250094936, + "grad_norm": 0.30371707677841187, + "learning_rate": 2.380952380952381e-05, + "loss": 1.3713, + "step": 80 + }, + { + "epoch": 0.007237474032211227, + "grad_norm": 0.35178881883621216, + "learning_rate": 2.4107142857142858e-05, + "loss": 1.2754, + "step": 81 + }, + { + "epoch": 0.007326825563473094, + "grad_norm": 0.34617844223976135, + "learning_rate": 2.4404761904761906e-05, + "loss": 1.4094, + "step": 82 + }, + { + "epoch": 0.007416177094734961, + "grad_norm": 0.31313589215278625, + "learning_rate": 2.4702380952380953e-05, + "loss": 1.3358, + "step": 83 + }, + { + "epoch": 0.007505528625996828, + "grad_norm": 0.3196007311344147, + "learning_rate": 2.5e-05, + "loss": 1.4212, + "step": 84 + }, + { + "epoch": 0.007594880157258695, + "grad_norm": 0.3096368610858917, + "learning_rate": 2.529761904761905e-05, + "loss": 1.408, + "step": 85 + }, + { + "epoch": 0.007684231688520562, + "grad_norm": 0.4201064705848694, + "learning_rate": 2.5595238095238093e-05, + "loss": 1.4166, + "step": 86 + }, + { + "epoch": 0.007773583219782429, + "grad_norm": 0.2974705398082733, + "learning_rate": 2.5892857142857148e-05, + "loss": 1.4298, + "step": 87 + }, + { + "epoch": 0.007862934751044296, + "grad_norm": 0.29475000500679016, + "learning_rate": 2.6190476190476192e-05, + "loss": 1.363, + "step": 88 + }, + { + "epoch": 0.007952286282306162, + "grad_norm": 0.29409998655319214, + "learning_rate": 2.648809523809524e-05, + "loss": 1.3672, + "step": 89 + }, + { + "epoch": 0.00804163781356803, + "grad_norm": 0.33392786979675293, + "learning_rate": 2.6785714285714288e-05, + "loss": 1.3839, + "step": 90 + }, + { + "epoch": 0.008130989344829897, + "grad_norm": 0.35475441813468933, + "learning_rate": 2.7083333333333332e-05, + "loss": 1.3376, + "step": 91 + }, + { + "epoch": 0.008220340876091765, + "grad_norm": 0.3339526951313019, + "learning_rate": 2.7380952380952383e-05, + "loss": 1.3865, + "step": 92 + }, + { + "epoch": 0.00830969240735363, + "grad_norm": 0.35839566588401794, + "learning_rate": 2.767857142857143e-05, + "loss": 1.2942, + "step": 93 + }, + { + "epoch": 0.008399043938615498, + "grad_norm": 0.3491029739379883, + "learning_rate": 2.797619047619048e-05, + "loss": 1.2828, + "step": 94 + }, + { + "epoch": 0.008488395469877365, + "grad_norm": 0.3070695400238037, + "learning_rate": 2.8273809523809523e-05, + "loss": 1.3261, + "step": 95 + }, + { + "epoch": 0.008577747001139231, + "grad_norm": 0.3386940360069275, + "learning_rate": 2.857142857142857e-05, + "loss": 1.3626, + "step": 96 + }, + { + "epoch": 0.008667098532401099, + "grad_norm": 0.3212776184082031, + "learning_rate": 2.886904761904762e-05, + "loss": 1.3482, + "step": 97 + }, + { + "epoch": 0.008756450063662966, + "grad_norm": 0.3101659119129181, + "learning_rate": 2.916666666666667e-05, + "loss": 1.3217, + "step": 98 + }, + { + "epoch": 0.008845801594924834, + "grad_norm": 0.3669290542602539, + "learning_rate": 2.9464285714285718e-05, + "loss": 1.3461, + "step": 99 + }, + { + "epoch": 0.0089351531261867, + "grad_norm": 0.3357386291027069, + "learning_rate": 2.9761904761904762e-05, + "loss": 1.3613, + "step": 100 + }, + { + "epoch": 0.009024504657448567, + "grad_norm": 0.3187052607536316, + "learning_rate": 3.005952380952381e-05, + "loss": 1.2509, + "step": 101 + }, + { + "epoch": 0.009113856188710434, + "grad_norm": 0.36679479479789734, + "learning_rate": 3.0357142857142857e-05, + "loss": 1.3448, + "step": 102 + }, + { + "epoch": 0.0092032077199723, + "grad_norm": 0.3677942752838135, + "learning_rate": 3.0654761904761905e-05, + "loss": 1.3676, + "step": 103 + }, + { + "epoch": 0.009292559251234168, + "grad_norm": 0.36349645256996155, + "learning_rate": 3.095238095238095e-05, + "loss": 1.3115, + "step": 104 + }, + { + "epoch": 0.009381910782496035, + "grad_norm": 0.3469372093677521, + "learning_rate": 3.125e-05, + "loss": 1.3233, + "step": 105 + }, + { + "epoch": 0.009471262313757903, + "grad_norm": 0.556049644947052, + "learning_rate": 3.154761904761905e-05, + "loss": 1.299, + "step": 106 + }, + { + "epoch": 0.009560613845019768, + "grad_norm": 0.3648083508014679, + "learning_rate": 3.1845238095238096e-05, + "loss": 1.3484, + "step": 107 + }, + { + "epoch": 0.009649965376281636, + "grad_norm": 0.3116719424724579, + "learning_rate": 3.2142857142857144e-05, + "loss": 1.3059, + "step": 108 + }, + { + "epoch": 0.009739316907543503, + "grad_norm": 0.3249325454235077, + "learning_rate": 3.244047619047619e-05, + "loss": 1.2851, + "step": 109 + }, + { + "epoch": 0.00982866843880537, + "grad_norm": 0.3656911551952362, + "learning_rate": 3.273809523809524e-05, + "loss": 1.2504, + "step": 110 + }, + { + "epoch": 0.009918019970067237, + "grad_norm": 0.3634350895881653, + "learning_rate": 3.303571428571429e-05, + "loss": 1.4084, + "step": 111 + }, + { + "epoch": 0.010007371501329104, + "grad_norm": 0.39990171790122986, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.3367, + "step": 112 + }, + { + "epoch": 0.010096723032590972, + "grad_norm": 0.4223339855670929, + "learning_rate": 3.363095238095238e-05, + "loss": 1.2866, + "step": 113 + }, + { + "epoch": 0.010186074563852838, + "grad_norm": 0.3745315968990326, + "learning_rate": 3.392857142857143e-05, + "loss": 1.3021, + "step": 114 + }, + { + "epoch": 0.010275426095114705, + "grad_norm": 0.4239887297153473, + "learning_rate": 3.422619047619048e-05, + "loss": 1.2906, + "step": 115 + }, + { + "epoch": 0.010364777626376573, + "grad_norm": 0.3679508566856384, + "learning_rate": 3.4523809523809526e-05, + "loss": 1.2425, + "step": 116 + }, + { + "epoch": 0.010454129157638438, + "grad_norm": 0.38168981671333313, + "learning_rate": 3.4821428571428574e-05, + "loss": 1.213, + "step": 117 + }, + { + "epoch": 0.010543480688900306, + "grad_norm": 0.33976343274116516, + "learning_rate": 3.511904761904762e-05, + "loss": 1.2012, + "step": 118 + }, + { + "epoch": 0.010632832220162173, + "grad_norm": 0.3905802369117737, + "learning_rate": 3.541666666666667e-05, + "loss": 1.2856, + "step": 119 + }, + { + "epoch": 0.01072218375142404, + "grad_norm": 0.3847375810146332, + "learning_rate": 3.571428571428572e-05, + "loss": 1.3329, + "step": 120 + }, + { + "epoch": 0.010811535282685907, + "grad_norm": 0.38759222626686096, + "learning_rate": 3.6011904761904765e-05, + "loss": 1.3153, + "step": 121 + }, + { + "epoch": 0.010900886813947774, + "grad_norm": 0.36399710178375244, + "learning_rate": 3.630952380952381e-05, + "loss": 1.2413, + "step": 122 + }, + { + "epoch": 0.010990238345209642, + "grad_norm": 0.390902042388916, + "learning_rate": 3.6607142857142853e-05, + "loss": 1.2319, + "step": 123 + }, + { + "epoch": 0.011079589876471507, + "grad_norm": 0.37981510162353516, + "learning_rate": 3.690476190476191e-05, + "loss": 1.2977, + "step": 124 + }, + { + "epoch": 0.011168941407733375, + "grad_norm": 0.3691035807132721, + "learning_rate": 3.7202380952380956e-05, + "loss": 1.2927, + "step": 125 + }, + { + "epoch": 0.011258292938995242, + "grad_norm": 0.3708125948905945, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.2989, + "step": 126 + }, + { + "epoch": 0.01134764447025711, + "grad_norm": 0.3609068989753723, + "learning_rate": 3.779761904761905e-05, + "loss": 1.3181, + "step": 127 + }, + { + "epoch": 0.011436996001518976, + "grad_norm": 0.4091266095638275, + "learning_rate": 3.809523809523809e-05, + "loss": 1.2894, + "step": 128 + }, + { + "epoch": 0.011526347532780843, + "grad_norm": 0.41847166419029236, + "learning_rate": 3.839285714285715e-05, + "loss": 1.1952, + "step": 129 + }, + { + "epoch": 0.01161569906404271, + "grad_norm": 0.5045241713523865, + "learning_rate": 3.8690476190476195e-05, + "loss": 1.2808, + "step": 130 + }, + { + "epoch": 0.011705050595304576, + "grad_norm": 0.4047856032848358, + "learning_rate": 3.898809523809524e-05, + "loss": 1.2858, + "step": 131 + }, + { + "epoch": 0.011794402126566444, + "grad_norm": 0.4096450209617615, + "learning_rate": 3.928571428571429e-05, + "loss": 1.3116, + "step": 132 + }, + { + "epoch": 0.011883753657828311, + "grad_norm": 0.3629261553287506, + "learning_rate": 3.958333333333333e-05, + "loss": 1.2968, + "step": 133 + }, + { + "epoch": 0.011973105189090179, + "grad_norm": 0.40072211623191833, + "learning_rate": 3.9880952380952386e-05, + "loss": 1.257, + "step": 134 + }, + { + "epoch": 0.012062456720352045, + "grad_norm": 0.40854790806770325, + "learning_rate": 4.017857142857143e-05, + "loss": 1.2404, + "step": 135 + }, + { + "epoch": 0.012151808251613912, + "grad_norm": 0.43366968631744385, + "learning_rate": 4.047619047619048e-05, + "loss": 1.2719, + "step": 136 + }, + { + "epoch": 0.01224115978287578, + "grad_norm": 0.35978153347969055, + "learning_rate": 4.077380952380952e-05, + "loss": 1.2299, + "step": 137 + }, + { + "epoch": 0.012330511314137645, + "grad_norm": 0.3845697343349457, + "learning_rate": 4.107142857142857e-05, + "loss": 1.2342, + "step": 138 + }, + { + "epoch": 0.012419862845399513, + "grad_norm": 0.48100224137306213, + "learning_rate": 4.136904761904762e-05, + "loss": 1.2568, + "step": 139 + }, + { + "epoch": 0.01250921437666138, + "grad_norm": 0.40048471093177795, + "learning_rate": 4.166666666666667e-05, + "loss": 1.2981, + "step": 140 + }, + { + "epoch": 0.012598565907923248, + "grad_norm": 0.46120962500572205, + "learning_rate": 4.196428571428572e-05, + "loss": 1.2794, + "step": 141 + }, + { + "epoch": 0.012687917439185114, + "grad_norm": 0.4530184864997864, + "learning_rate": 4.226190476190476e-05, + "loss": 1.2063, + "step": 142 + }, + { + "epoch": 0.012777268970446981, + "grad_norm": 0.39693745970726013, + "learning_rate": 4.255952380952381e-05, + "loss": 1.3012, + "step": 143 + }, + { + "epoch": 0.012866620501708849, + "grad_norm": 0.40875282883644104, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.2265, + "step": 144 + }, + { + "epoch": 0.012955972032970714, + "grad_norm": 0.4045838415622711, + "learning_rate": 4.315476190476191e-05, + "loss": 1.2228, + "step": 145 + }, + { + "epoch": 0.013045323564232582, + "grad_norm": 0.4586333632469177, + "learning_rate": 4.345238095238096e-05, + "loss": 1.215, + "step": 146 + }, + { + "epoch": 0.01313467509549445, + "grad_norm": 0.4658620059490204, + "learning_rate": 4.375e-05, + "loss": 1.3559, + "step": 147 + }, + { + "epoch": 0.013224026626756317, + "grad_norm": 0.4266040325164795, + "learning_rate": 4.404761904761905e-05, + "loss": 1.2215, + "step": 148 + }, + { + "epoch": 0.013313378158018183, + "grad_norm": 0.39535924792289734, + "learning_rate": 4.4345238095238095e-05, + "loss": 1.3024, + "step": 149 + }, + { + "epoch": 0.01340272968928005, + "grad_norm": 0.4783201515674591, + "learning_rate": 4.464285714285715e-05, + "loss": 1.2651, + "step": 150 + }, + { + "epoch": 0.013492081220541918, + "grad_norm": 0.44912487268447876, + "learning_rate": 4.494047619047619e-05, + "loss": 1.2107, + "step": 151 + }, + { + "epoch": 0.013581432751803783, + "grad_norm": 0.40533971786499023, + "learning_rate": 4.523809523809524e-05, + "loss": 1.2386, + "step": 152 + }, + { + "epoch": 0.01367078428306565, + "grad_norm": 0.47869575023651123, + "learning_rate": 4.5535714285714286e-05, + "loss": 1.2709, + "step": 153 + }, + { + "epoch": 0.013760135814327518, + "grad_norm": 0.45831164717674255, + "learning_rate": 4.5833333333333334e-05, + "loss": 1.2435, + "step": 154 + }, + { + "epoch": 0.013849487345589386, + "grad_norm": 0.5027099847793579, + "learning_rate": 4.613095238095239e-05, + "loss": 1.2706, + "step": 155 + }, + { + "epoch": 0.013938838876851252, + "grad_norm": 0.4425562024116516, + "learning_rate": 4.642857142857143e-05, + "loss": 1.1868, + "step": 156 + }, + { + "epoch": 0.014028190408113119, + "grad_norm": 0.5197780728340149, + "learning_rate": 4.672619047619048e-05, + "loss": 1.2776, + "step": 157 + }, + { + "epoch": 0.014117541939374987, + "grad_norm": 0.4800921380519867, + "learning_rate": 4.7023809523809525e-05, + "loss": 1.2213, + "step": 158 + }, + { + "epoch": 0.014206893470636852, + "grad_norm": 0.4182106852531433, + "learning_rate": 4.732142857142857e-05, + "loss": 1.2358, + "step": 159 + }, + { + "epoch": 0.01429624500189872, + "grad_norm": 0.4423435926437378, + "learning_rate": 4.761904761904762e-05, + "loss": 1.263, + "step": 160 + }, + { + "epoch": 0.014385596533160587, + "grad_norm": 0.5085612535476685, + "learning_rate": 4.791666666666667e-05, + "loss": 1.206, + "step": 161 + }, + { + "epoch": 0.014474948064422455, + "grad_norm": 0.4774646461009979, + "learning_rate": 4.8214285714285716e-05, + "loss": 1.2295, + "step": 162 + }, + { + "epoch": 0.01456429959568432, + "grad_norm": 0.4879673719406128, + "learning_rate": 4.8511904761904764e-05, + "loss": 1.2492, + "step": 163 + }, + { + "epoch": 0.014653651126946188, + "grad_norm": 0.4564456343650818, + "learning_rate": 4.880952380952381e-05, + "loss": 1.196, + "step": 164 + }, + { + "epoch": 0.014743002658208056, + "grad_norm": 0.4189358353614807, + "learning_rate": 4.910714285714286e-05, + "loss": 1.2245, + "step": 165 + }, + { + "epoch": 0.014832354189469921, + "grad_norm": 0.48813703656196594, + "learning_rate": 4.940476190476191e-05, + "loss": 1.1321, + "step": 166 + }, + { + "epoch": 0.014921705720731789, + "grad_norm": 0.5083624124526978, + "learning_rate": 4.9702380952380955e-05, + "loss": 1.1557, + "step": 167 + }, + { + "epoch": 0.015011057251993656, + "grad_norm": 0.4980645179748535, + "learning_rate": 5e-05, + "loss": 1.2758, + "step": 168 + }, + { + "epoch": 0.015100408783255524, + "grad_norm": 0.4460103511810303, + "learning_rate": 5.029761904761905e-05, + "loss": 1.2135, + "step": 169 + }, + { + "epoch": 0.01518976031451739, + "grad_norm": 0.4435397684574127, + "learning_rate": 5.05952380952381e-05, + "loss": 1.295, + "step": 170 + }, + { + "epoch": 0.015279111845779257, + "grad_norm": 0.5360916256904602, + "learning_rate": 5.089285714285714e-05, + "loss": 1.2055, + "step": 171 + }, + { + "epoch": 0.015368463377041125, + "grad_norm": 0.5073984265327454, + "learning_rate": 5.119047619047619e-05, + "loss": 1.2104, + "step": 172 + }, + { + "epoch": 0.01545781490830299, + "grad_norm": 0.4850315451622009, + "learning_rate": 5.1488095238095234e-05, + "loss": 1.2246, + "step": 173 + }, + { + "epoch": 0.015547166439564858, + "grad_norm": 0.46031928062438965, + "learning_rate": 5.1785714285714296e-05, + "loss": 1.1649, + "step": 174 + }, + { + "epoch": 0.015636517970826724, + "grad_norm": 0.4688253104686737, + "learning_rate": 5.208333333333334e-05, + "loss": 1.2064, + "step": 175 + }, + { + "epoch": 0.015725869502088593, + "grad_norm": 0.44692277908325195, + "learning_rate": 5.2380952380952384e-05, + "loss": 1.2672, + "step": 176 + }, + { + "epoch": 0.01581522103335046, + "grad_norm": 0.5310292840003967, + "learning_rate": 5.267857142857143e-05, + "loss": 1.2706, + "step": 177 + }, + { + "epoch": 0.015904572564612324, + "grad_norm": 0.5005099177360535, + "learning_rate": 5.297619047619048e-05, + "loss": 1.2482, + "step": 178 + }, + { + "epoch": 0.015993924095874194, + "grad_norm": 0.45003390312194824, + "learning_rate": 5.327380952380953e-05, + "loss": 1.25, + "step": 179 + }, + { + "epoch": 0.01608327562713606, + "grad_norm": 0.4997723400592804, + "learning_rate": 5.3571428571428575e-05, + "loss": 1.1774, + "step": 180 + }, + { + "epoch": 0.01617262715839793, + "grad_norm": 0.4725038409233093, + "learning_rate": 5.3869047619047616e-05, + "loss": 1.226, + "step": 181 + }, + { + "epoch": 0.016261978689659794, + "grad_norm": 0.47115814685821533, + "learning_rate": 5.4166666666666664e-05, + "loss": 1.2809, + "step": 182 + }, + { + "epoch": 0.01635133022092166, + "grad_norm": 0.4939117133617401, + "learning_rate": 5.446428571428571e-05, + "loss": 1.2195, + "step": 183 + }, + { + "epoch": 0.01644068175218353, + "grad_norm": 0.4947628974914551, + "learning_rate": 5.4761904761904766e-05, + "loss": 1.1385, + "step": 184 + }, + { + "epoch": 0.016530033283445395, + "grad_norm": 0.43742480874061584, + "learning_rate": 5.5059523809523814e-05, + "loss": 1.2191, + "step": 185 + }, + { + "epoch": 0.01661938481470726, + "grad_norm": 0.48737379908561707, + "learning_rate": 5.535714285714286e-05, + "loss": 1.2033, + "step": 186 + }, + { + "epoch": 0.01670873634596913, + "grad_norm": 0.46898481249809265, + "learning_rate": 5.565476190476191e-05, + "loss": 1.2672, + "step": 187 + }, + { + "epoch": 0.016798087877230996, + "grad_norm": 0.5089002251625061, + "learning_rate": 5.595238095238096e-05, + "loss": 1.1853, + "step": 188 + }, + { + "epoch": 0.01688743940849286, + "grad_norm": 0.5069641470909119, + "learning_rate": 5.6250000000000005e-05, + "loss": 1.1838, + "step": 189 + }, + { + "epoch": 0.01697679093975473, + "grad_norm": 0.48622408509254456, + "learning_rate": 5.6547619047619046e-05, + "loss": 1.2445, + "step": 190 + }, + { + "epoch": 0.017066142471016597, + "grad_norm": 0.48085153102874756, + "learning_rate": 5.6845238095238094e-05, + "loss": 1.1636, + "step": 191 + }, + { + "epoch": 0.017155494002278462, + "grad_norm": 0.4827207326889038, + "learning_rate": 5.714285714285714e-05, + "loss": 1.2002, + "step": 192 + }, + { + "epoch": 0.01724484553354033, + "grad_norm": 0.5341354608535767, + "learning_rate": 5.744047619047619e-05, + "loss": 1.2213, + "step": 193 + }, + { + "epoch": 0.017334197064802197, + "grad_norm": 0.4994199872016907, + "learning_rate": 5.773809523809524e-05, + "loss": 1.1627, + "step": 194 + }, + { + "epoch": 0.017423548596064067, + "grad_norm": 0.5125264525413513, + "learning_rate": 5.803571428571429e-05, + "loss": 1.2425, + "step": 195 + }, + { + "epoch": 0.017512900127325932, + "grad_norm": 0.534212589263916, + "learning_rate": 5.833333333333334e-05, + "loss": 1.1437, + "step": 196 + }, + { + "epoch": 0.017602251658587798, + "grad_norm": 0.4822109341621399, + "learning_rate": 5.863095238095239e-05, + "loss": 1.1876, + "step": 197 + }, + { + "epoch": 0.017691603189849667, + "grad_norm": 0.4595533311367035, + "learning_rate": 5.8928571428571435e-05, + "loss": 1.1951, + "step": 198 + }, + { + "epoch": 0.017780954721111533, + "grad_norm": 0.4773840606212616, + "learning_rate": 5.922619047619048e-05, + "loss": 1.2074, + "step": 199 + }, + { + "epoch": 0.0178703062523734, + "grad_norm": 0.4863610565662384, + "learning_rate": 5.9523809523809524e-05, + "loss": 1.1874, + "step": 200 + }, + { + "epoch": 0.017959657783635268, + "grad_norm": 0.5983034372329712, + "learning_rate": 5.982142857142857e-05, + "loss": 1.1794, + "step": 201 + }, + { + "epoch": 0.018049009314897134, + "grad_norm": 0.5169399380683899, + "learning_rate": 6.011904761904762e-05, + "loss": 1.25, + "step": 202 + }, + { + "epoch": 0.018138360846159, + "grad_norm": 0.49807149171829224, + "learning_rate": 6.041666666666667e-05, + "loss": 1.2069, + "step": 203 + }, + { + "epoch": 0.01822771237742087, + "grad_norm": 0.488290399312973, + "learning_rate": 6.0714285714285715e-05, + "loss": 1.2747, + "step": 204 + }, + { + "epoch": 0.018317063908682735, + "grad_norm": 0.49439024925231934, + "learning_rate": 6.101190476190477e-05, + "loss": 1.2401, + "step": 205 + }, + { + "epoch": 0.0184064154399446, + "grad_norm": 0.5511735081672668, + "learning_rate": 6.130952380952381e-05, + "loss": 1.1785, + "step": 206 + }, + { + "epoch": 0.01849576697120647, + "grad_norm": 0.5392850637435913, + "learning_rate": 6.160714285714286e-05, + "loss": 1.2753, + "step": 207 + }, + { + "epoch": 0.018585118502468335, + "grad_norm": 0.4930606484413147, + "learning_rate": 6.19047619047619e-05, + "loss": 1.2354, + "step": 208 + }, + { + "epoch": 0.018674470033730205, + "grad_norm": 0.49877017736434937, + "learning_rate": 6.220238095238095e-05, + "loss": 1.2199, + "step": 209 + }, + { + "epoch": 0.01876382156499207, + "grad_norm": 0.5193923711776733, + "learning_rate": 6.25e-05, + "loss": 1.1794, + "step": 210 + }, + { + "epoch": 0.018853173096253936, + "grad_norm": 0.5702071189880371, + "learning_rate": 6.279761904761905e-05, + "loss": 1.237, + "step": 211 + }, + { + "epoch": 0.018942524627515805, + "grad_norm": 0.542413055896759, + "learning_rate": 6.30952380952381e-05, + "loss": 1.2469, + "step": 212 + }, + { + "epoch": 0.01903187615877767, + "grad_norm": 0.518735945224762, + "learning_rate": 6.339285714285714e-05, + "loss": 1.1726, + "step": 213 + }, + { + "epoch": 0.019121227690039537, + "grad_norm": 0.49173060059547424, + "learning_rate": 6.369047619047619e-05, + "loss": 1.1974, + "step": 214 + }, + { + "epoch": 0.019210579221301406, + "grad_norm": 0.46574926376342773, + "learning_rate": 6.398809523809524e-05, + "loss": 1.2094, + "step": 215 + }, + { + "epoch": 0.019299930752563272, + "grad_norm": 0.5576440691947937, + "learning_rate": 6.428571428571429e-05, + "loss": 1.1892, + "step": 216 + }, + { + "epoch": 0.019389282283825138, + "grad_norm": 0.527204692363739, + "learning_rate": 6.458333333333334e-05, + "loss": 1.2032, + "step": 217 + }, + { + "epoch": 0.019478633815087007, + "grad_norm": 0.5345803499221802, + "learning_rate": 6.488095238095238e-05, + "loss": 1.1977, + "step": 218 + }, + { + "epoch": 0.019567985346348873, + "grad_norm": 0.5545653700828552, + "learning_rate": 6.517857142857143e-05, + "loss": 1.1988, + "step": 219 + }, + { + "epoch": 0.01965733687761074, + "grad_norm": 0.5376014113426208, + "learning_rate": 6.547619047619048e-05, + "loss": 1.1331, + "step": 220 + }, + { + "epoch": 0.019746688408872608, + "grad_norm": 0.5965554714202881, + "learning_rate": 6.577380952380953e-05, + "loss": 1.1333, + "step": 221 + }, + { + "epoch": 0.019836039940134473, + "grad_norm": 0.48786482214927673, + "learning_rate": 6.607142857142857e-05, + "loss": 1.2119, + "step": 222 + }, + { + "epoch": 0.019925391471396343, + "grad_norm": 0.5010102391242981, + "learning_rate": 6.636904761904762e-05, + "loss": 1.1576, + "step": 223 + }, + { + "epoch": 0.02001474300265821, + "grad_norm": 0.5798513293266296, + "learning_rate": 6.666666666666667e-05, + "loss": 1.2731, + "step": 224 + }, + { + "epoch": 0.020104094533920074, + "grad_norm": 0.600602388381958, + "learning_rate": 6.696428571428572e-05, + "loss": 1.1185, + "step": 225 + }, + { + "epoch": 0.020193446065181943, + "grad_norm": 0.5265418291091919, + "learning_rate": 6.726190476190477e-05, + "loss": 1.2328, + "step": 226 + }, + { + "epoch": 0.02028279759644381, + "grad_norm": 0.6028567552566528, + "learning_rate": 6.755952380952381e-05, + "loss": 1.222, + "step": 227 + }, + { + "epoch": 0.020372149127705675, + "grad_norm": 0.5283799171447754, + "learning_rate": 6.785714285714286e-05, + "loss": 1.2209, + "step": 228 + }, + { + "epoch": 0.020461500658967544, + "grad_norm": 0.5363395810127258, + "learning_rate": 6.815476190476191e-05, + "loss": 1.1309, + "step": 229 + }, + { + "epoch": 0.02055085219022941, + "grad_norm": 0.5541881918907166, + "learning_rate": 6.845238095238096e-05, + "loss": 1.1629, + "step": 230 + }, + { + "epoch": 0.020640203721491276, + "grad_norm": 0.5489634275436401, + "learning_rate": 6.875e-05, + "loss": 1.148, + "step": 231 + }, + { + "epoch": 0.020729555252753145, + "grad_norm": 0.582408607006073, + "learning_rate": 6.904761904761905e-05, + "loss": 1.1505, + "step": 232 + }, + { + "epoch": 0.02081890678401501, + "grad_norm": 0.5389354825019836, + "learning_rate": 6.93452380952381e-05, + "loss": 1.1644, + "step": 233 + }, + { + "epoch": 0.020908258315276877, + "grad_norm": 0.6120844483375549, + "learning_rate": 6.964285714285715e-05, + "loss": 1.1522, + "step": 234 + }, + { + "epoch": 0.020997609846538746, + "grad_norm": 0.5017001628875732, + "learning_rate": 6.99404761904762e-05, + "loss": 1.2153, + "step": 235 + }, + { + "epoch": 0.02108696137780061, + "grad_norm": 0.5250133275985718, + "learning_rate": 7.023809523809524e-05, + "loss": 1.2756, + "step": 236 + }, + { + "epoch": 0.02117631290906248, + "grad_norm": 0.5131645202636719, + "learning_rate": 7.053571428571429e-05, + "loss": 1.1909, + "step": 237 + }, + { + "epoch": 0.021265664440324347, + "grad_norm": 0.5468138456344604, + "learning_rate": 7.083333333333334e-05, + "loss": 1.1209, + "step": 238 + }, + { + "epoch": 0.021355015971586212, + "grad_norm": 0.5460432171821594, + "learning_rate": 7.113095238095239e-05, + "loss": 1.2472, + "step": 239 + }, + { + "epoch": 0.02144436750284808, + "grad_norm": 0.5693673491477966, + "learning_rate": 7.142857142857143e-05, + "loss": 1.1466, + "step": 240 + }, + { + "epoch": 0.021533719034109947, + "grad_norm": 0.5243647694587708, + "learning_rate": 7.172619047619048e-05, + "loss": 1.1108, + "step": 241 + }, + { + "epoch": 0.021623070565371813, + "grad_norm": 0.51932692527771, + "learning_rate": 7.202380952380953e-05, + "loss": 1.2149, + "step": 242 + }, + { + "epoch": 0.021712422096633682, + "grad_norm": 0.5603271722793579, + "learning_rate": 7.232142857142858e-05, + "loss": 1.1534, + "step": 243 + }, + { + "epoch": 0.021801773627895548, + "grad_norm": 0.5533414483070374, + "learning_rate": 7.261904761904762e-05, + "loss": 1.1692, + "step": 244 + }, + { + "epoch": 0.021891125159157414, + "grad_norm": 0.5640945434570312, + "learning_rate": 7.291666666666667e-05, + "loss": 1.191, + "step": 245 + }, + { + "epoch": 0.021980476690419283, + "grad_norm": 0.565290629863739, + "learning_rate": 7.321428571428571e-05, + "loss": 1.1163, + "step": 246 + }, + { + "epoch": 0.02206982822168115, + "grad_norm": 0.5642583966255188, + "learning_rate": 7.351190476190477e-05, + "loss": 1.1447, + "step": 247 + }, + { + "epoch": 0.022159179752943015, + "grad_norm": 0.5096793174743652, + "learning_rate": 7.380952380952382e-05, + "loss": 1.1812, + "step": 248 + }, + { + "epoch": 0.022248531284204884, + "grad_norm": 0.49320727586746216, + "learning_rate": 7.410714285714286e-05, + "loss": 1.1238, + "step": 249 + }, + { + "epoch": 0.02233788281546675, + "grad_norm": 0.5757387280464172, + "learning_rate": 7.440476190476191e-05, + "loss": 1.2047, + "step": 250 + }, + { + "epoch": 0.02242723434672862, + "grad_norm": 0.5943542718887329, + "learning_rate": 7.470238095238096e-05, + "loss": 1.146, + "step": 251 + }, + { + "epoch": 0.022516585877990485, + "grad_norm": 0.5858287215232849, + "learning_rate": 7.500000000000001e-05, + "loss": 1.1685, + "step": 252 + }, + { + "epoch": 0.02260593740925235, + "grad_norm": 0.4847318232059479, + "learning_rate": 7.529761904761905e-05, + "loss": 1.1975, + "step": 253 + }, + { + "epoch": 0.02269528894051422, + "grad_norm": 0.5576103925704956, + "learning_rate": 7.55952380952381e-05, + "loss": 1.158, + "step": 254 + }, + { + "epoch": 0.022784640471776085, + "grad_norm": 0.5339264869689941, + "learning_rate": 7.589285714285714e-05, + "loss": 1.1694, + "step": 255 + }, + { + "epoch": 0.02287399200303795, + "grad_norm": 0.5801980495452881, + "learning_rate": 7.619047619047618e-05, + "loss": 1.1646, + "step": 256 + }, + { + "epoch": 0.02296334353429982, + "grad_norm": 0.5421757698059082, + "learning_rate": 7.648809523809523e-05, + "loss": 1.1546, + "step": 257 + }, + { + "epoch": 0.023052695065561686, + "grad_norm": 0.546415388584137, + "learning_rate": 7.67857142857143e-05, + "loss": 1.2262, + "step": 258 + }, + { + "epoch": 0.023142046596823552, + "grad_norm": 0.584190309047699, + "learning_rate": 7.708333333333334e-05, + "loss": 1.1528, + "step": 259 + }, + { + "epoch": 0.02323139812808542, + "grad_norm": 0.5467146635055542, + "learning_rate": 7.738095238095239e-05, + "loss": 1.1452, + "step": 260 + }, + { + "epoch": 0.023320749659347287, + "grad_norm": 0.5554835796356201, + "learning_rate": 7.767857142857144e-05, + "loss": 1.1936, + "step": 261 + }, + { + "epoch": 0.023410101190609153, + "grad_norm": 0.5491194725036621, + "learning_rate": 7.797619047619048e-05, + "loss": 1.1152, + "step": 262 + }, + { + "epoch": 0.023499452721871022, + "grad_norm": 0.5814348459243774, + "learning_rate": 7.827380952380953e-05, + "loss": 1.1432, + "step": 263 + }, + { + "epoch": 0.023588804253132888, + "grad_norm": 0.598092257976532, + "learning_rate": 7.857142857142858e-05, + "loss": 1.1661, + "step": 264 + }, + { + "epoch": 0.023678155784394753, + "grad_norm": 0.6232405304908752, + "learning_rate": 7.886904761904761e-05, + "loss": 1.1545, + "step": 265 + }, + { + "epoch": 0.023767507315656623, + "grad_norm": 0.53800368309021, + "learning_rate": 7.916666666666666e-05, + "loss": 1.2058, + "step": 266 + }, + { + "epoch": 0.02385685884691849, + "grad_norm": 0.5912578105926514, + "learning_rate": 7.946428571428571e-05, + "loss": 1.1638, + "step": 267 + }, + { + "epoch": 0.023946210378180358, + "grad_norm": 0.6035777926445007, + "learning_rate": 7.976190476190477e-05, + "loss": 1.1325, + "step": 268 + }, + { + "epoch": 0.024035561909442223, + "grad_norm": 0.5701507925987244, + "learning_rate": 8.005952380952382e-05, + "loss": 1.1693, + "step": 269 + }, + { + "epoch": 0.02412491344070409, + "grad_norm": 0.9521252512931824, + "learning_rate": 8.035714285714287e-05, + "loss": 1.1926, + "step": 270 + }, + { + "epoch": 0.02421426497196596, + "grad_norm": 0.5579119324684143, + "learning_rate": 8.065476190476191e-05, + "loss": 1.2114, + "step": 271 + }, + { + "epoch": 0.024303616503227824, + "grad_norm": 0.5646568536758423, + "learning_rate": 8.095238095238096e-05, + "loss": 1.1351, + "step": 272 + }, + { + "epoch": 0.02439296803448969, + "grad_norm": 0.5492066740989685, + "learning_rate": 8.125000000000001e-05, + "loss": 1.187, + "step": 273 + }, + { + "epoch": 0.02448231956575156, + "grad_norm": 0.5730810761451721, + "learning_rate": 8.154761904761904e-05, + "loss": 1.1687, + "step": 274 + }, + { + "epoch": 0.024571671097013425, + "grad_norm": 0.5157971978187561, + "learning_rate": 8.184523809523809e-05, + "loss": 1.1312, + "step": 275 + }, + { + "epoch": 0.02466102262827529, + "grad_norm": 0.5865328311920166, + "learning_rate": 8.214285714285714e-05, + "loss": 1.207, + "step": 276 + }, + { + "epoch": 0.02475037415953716, + "grad_norm": 0.47492775321006775, + "learning_rate": 8.244047619047619e-05, + "loss": 1.1525, + "step": 277 + }, + { + "epoch": 0.024839725690799026, + "grad_norm": 0.5046519041061401, + "learning_rate": 8.273809523809524e-05, + "loss": 1.1959, + "step": 278 + }, + { + "epoch": 0.02492907722206089, + "grad_norm": 0.5498790144920349, + "learning_rate": 8.30357142857143e-05, + "loss": 1.1969, + "step": 279 + }, + { + "epoch": 0.02501842875332276, + "grad_norm": 0.5528784394264221, + "learning_rate": 8.333333333333334e-05, + "loss": 1.0772, + "step": 280 + }, + { + "epoch": 0.025107780284584626, + "grad_norm": 0.5229126811027527, + "learning_rate": 8.363095238095239e-05, + "loss": 1.1485, + "step": 281 + }, + { + "epoch": 0.025197131815846496, + "grad_norm": 0.5191675424575806, + "learning_rate": 8.392857142857144e-05, + "loss": 1.1568, + "step": 282 + }, + { + "epoch": 0.02528648334710836, + "grad_norm": 0.5272664427757263, + "learning_rate": 8.422619047619049e-05, + "loss": 1.1794, + "step": 283 + }, + { + "epoch": 0.025375834878370227, + "grad_norm": 0.5842853784561157, + "learning_rate": 8.452380952380952e-05, + "loss": 1.101, + "step": 284 + }, + { + "epoch": 0.025465186409632096, + "grad_norm": 0.501756489276886, + "learning_rate": 8.482142857142857e-05, + "loss": 1.1639, + "step": 285 + }, + { + "epoch": 0.025554537940893962, + "grad_norm": 0.491220623254776, + "learning_rate": 8.511904761904762e-05, + "loss": 1.2092, + "step": 286 + }, + { + "epoch": 0.025643889472155828, + "grad_norm": 0.5032030940055847, + "learning_rate": 8.541666666666666e-05, + "loss": 1.181, + "step": 287 + }, + { + "epoch": 0.025733241003417697, + "grad_norm": 0.5295215249061584, + "learning_rate": 8.571428571428571e-05, + "loss": 1.1861, + "step": 288 + }, + { + "epoch": 0.025822592534679563, + "grad_norm": 0.5146911144256592, + "learning_rate": 8.601190476190477e-05, + "loss": 1.1854, + "step": 289 + }, + { + "epoch": 0.02591194406594143, + "grad_norm": 0.5277708172798157, + "learning_rate": 8.630952380952382e-05, + "loss": 1.1318, + "step": 290 + }, + { + "epoch": 0.026001295597203298, + "grad_norm": 0.5158389210700989, + "learning_rate": 8.660714285714287e-05, + "loss": 1.1603, + "step": 291 + }, + { + "epoch": 0.026090647128465164, + "grad_norm": 0.4982542395591736, + "learning_rate": 8.690476190476192e-05, + "loss": 1.185, + "step": 292 + }, + { + "epoch": 0.02617999865972703, + "grad_norm": 0.5195929408073425, + "learning_rate": 8.720238095238095e-05, + "loss": 1.2306, + "step": 293 + }, + { + "epoch": 0.0262693501909889, + "grad_norm": 0.48286932706832886, + "learning_rate": 8.75e-05, + "loss": 1.1665, + "step": 294 + }, + { + "epoch": 0.026358701722250764, + "grad_norm": 0.5617235898971558, + "learning_rate": 8.779761904761905e-05, + "loss": 1.1484, + "step": 295 + }, + { + "epoch": 0.026448053253512634, + "grad_norm": 0.5541561841964722, + "learning_rate": 8.80952380952381e-05, + "loss": 0.9718, + "step": 296 + }, + { + "epoch": 0.0265374047847745, + "grad_norm": 0.47326454520225525, + "learning_rate": 8.839285714285714e-05, + "loss": 1.1289, + "step": 297 + }, + { + "epoch": 0.026626756316036365, + "grad_norm": 0.5681378841400146, + "learning_rate": 8.869047619047619e-05, + "loss": 1.1569, + "step": 298 + }, + { + "epoch": 0.026716107847298234, + "grad_norm": 0.5514600276947021, + "learning_rate": 8.898809523809524e-05, + "loss": 1.1495, + "step": 299 + }, + { + "epoch": 0.0268054593785601, + "grad_norm": 0.5337786078453064, + "learning_rate": 8.92857142857143e-05, + "loss": 1.2366, + "step": 300 + }, + { + "epoch": 0.026894810909821966, + "grad_norm": 0.5156247019767761, + "learning_rate": 8.958333333333335e-05, + "loss": 1.1969, + "step": 301 + }, + { + "epoch": 0.026984162441083835, + "grad_norm": 0.5061919689178467, + "learning_rate": 8.988095238095238e-05, + "loss": 1.1633, + "step": 302 + }, + { + "epoch": 0.0270735139723457, + "grad_norm": 0.6192631721496582, + "learning_rate": 9.017857142857143e-05, + "loss": 1.1552, + "step": 303 + }, + { + "epoch": 0.027162865503607567, + "grad_norm": 0.5444059371948242, + "learning_rate": 9.047619047619048e-05, + "loss": 1.1532, + "step": 304 + }, + { + "epoch": 0.027252217034869436, + "grad_norm": 0.49790215492248535, + "learning_rate": 9.077380952380952e-05, + "loss": 1.1339, + "step": 305 + }, + { + "epoch": 0.0273415685661313, + "grad_norm": 0.5882278680801392, + "learning_rate": 9.107142857142857e-05, + "loss": 1.1195, + "step": 306 + }, + { + "epoch": 0.027430920097393167, + "grad_norm": 0.5879011154174805, + "learning_rate": 9.136904761904762e-05, + "loss": 1.1341, + "step": 307 + }, + { + "epoch": 0.027520271628655037, + "grad_norm": 0.5458969473838806, + "learning_rate": 9.166666666666667e-05, + "loss": 1.1498, + "step": 308 + }, + { + "epoch": 0.027609623159916902, + "grad_norm": 0.5539296865463257, + "learning_rate": 9.196428571428572e-05, + "loss": 1.1261, + "step": 309 + }, + { + "epoch": 0.02769897469117877, + "grad_norm": 0.508406400680542, + "learning_rate": 9.226190476190478e-05, + "loss": 1.1318, + "step": 310 + }, + { + "epoch": 0.027788326222440637, + "grad_norm": 0.5266230702400208, + "learning_rate": 9.255952380952382e-05, + "loss": 1.1463, + "step": 311 + }, + { + "epoch": 0.027877677753702503, + "grad_norm": 0.5170016288757324, + "learning_rate": 9.285714285714286e-05, + "loss": 1.1736, + "step": 312 + }, + { + "epoch": 0.027967029284964372, + "grad_norm": 0.5622848272323608, + "learning_rate": 9.31547619047619e-05, + "loss": 1.2113, + "step": 313 + }, + { + "epoch": 0.028056380816226238, + "grad_norm": 0.5831321477890015, + "learning_rate": 9.345238095238095e-05, + "loss": 1.1497, + "step": 314 + }, + { + "epoch": 0.028145732347488104, + "grad_norm": 0.46541884541511536, + "learning_rate": 9.375e-05, + "loss": 1.146, + "step": 315 + }, + { + "epoch": 0.028235083878749973, + "grad_norm": 0.5889435410499573, + "learning_rate": 9.404761904761905e-05, + "loss": 1.1694, + "step": 316 + }, + { + "epoch": 0.02832443541001184, + "grad_norm": 0.49173977971076965, + "learning_rate": 9.43452380952381e-05, + "loss": 1.1854, + "step": 317 + }, + { + "epoch": 0.028413786941273705, + "grad_norm": 0.6178017258644104, + "learning_rate": 9.464285714285715e-05, + "loss": 1.1072, + "step": 318 + }, + { + "epoch": 0.028503138472535574, + "grad_norm": 0.5283975005149841, + "learning_rate": 9.494047619047619e-05, + "loss": 1.0578, + "step": 319 + }, + { + "epoch": 0.02859249000379744, + "grad_norm": 0.5168601870536804, + "learning_rate": 9.523809523809524e-05, + "loss": 1.1003, + "step": 320 + }, + { + "epoch": 0.028681841535059305, + "grad_norm": 0.4885217845439911, + "learning_rate": 9.553571428571429e-05, + "loss": 1.1953, + "step": 321 + }, + { + "epoch": 0.028771193066321175, + "grad_norm": 0.4856249690055847, + "learning_rate": 9.583333333333334e-05, + "loss": 1.2207, + "step": 322 + }, + { + "epoch": 0.02886054459758304, + "grad_norm": 0.5465936660766602, + "learning_rate": 9.613095238095238e-05, + "loss": 1.1668, + "step": 323 + }, + { + "epoch": 0.02894989612884491, + "grad_norm": 0.5614081025123596, + "learning_rate": 9.642857142857143e-05, + "loss": 1.1091, + "step": 324 + }, + { + "epoch": 0.029039247660106775, + "grad_norm": 0.5728946328163147, + "learning_rate": 9.672619047619048e-05, + "loss": 1.1232, + "step": 325 + }, + { + "epoch": 0.02912859919136864, + "grad_norm": 0.6166178584098816, + "learning_rate": 9.702380952380953e-05, + "loss": 1.1756, + "step": 326 + }, + { + "epoch": 0.02921795072263051, + "grad_norm": 0.5272330641746521, + "learning_rate": 9.732142857142858e-05, + "loss": 1.1131, + "step": 327 + }, + { + "epoch": 0.029307302253892376, + "grad_norm": 0.4859834909439087, + "learning_rate": 9.761904761904762e-05, + "loss": 1.2047, + "step": 328 + }, + { + "epoch": 0.029396653785154242, + "grad_norm": 0.5186814069747925, + "learning_rate": 9.791666666666667e-05, + "loss": 1.0997, + "step": 329 + }, + { + "epoch": 0.02948600531641611, + "grad_norm": 0.5721623301506042, + "learning_rate": 9.821428571428572e-05, + "loss": 1.0986, + "step": 330 + }, + { + "epoch": 0.029575356847677977, + "grad_norm": 0.5345954895019531, + "learning_rate": 9.851190476190477e-05, + "loss": 1.1971, + "step": 331 + }, + { + "epoch": 0.029664708378939843, + "grad_norm": 0.5651283264160156, + "learning_rate": 9.880952380952381e-05, + "loss": 1.1753, + "step": 332 + }, + { + "epoch": 0.029754059910201712, + "grad_norm": 0.6897709369659424, + "learning_rate": 9.910714285714286e-05, + "loss": 1.1662, + "step": 333 + }, + { + "epoch": 0.029843411441463578, + "grad_norm": 0.5349520444869995, + "learning_rate": 9.940476190476191e-05, + "loss": 1.1446, + "step": 334 + }, + { + "epoch": 0.029932762972725444, + "grad_norm": 0.5659092664718628, + "learning_rate": 9.970238095238096e-05, + "loss": 1.1273, + "step": 335 + }, + { + "epoch": 0.030022114503987313, + "grad_norm": 0.4331722557544708, + "learning_rate": 0.0001, + "loss": 1.192, + "step": 336 + }, + { + "epoch": 0.03011146603524918, + "grad_norm": 0.5557327270507812, + "learning_rate": 9.999999790598352e-05, + "loss": 1.1184, + "step": 337 + }, + { + "epoch": 0.030200817566511048, + "grad_norm": 0.5717477798461914, + "learning_rate": 9.999999162393425e-05, + "loss": 1.1695, + "step": 338 + }, + { + "epoch": 0.030290169097772913, + "grad_norm": 0.6119928359985352, + "learning_rate": 9.999998115385273e-05, + "loss": 1.1202, + "step": 339 + }, + { + "epoch": 0.03037952062903478, + "grad_norm": 0.5220287442207336, + "learning_rate": 9.999996649573982e-05, + "loss": 1.0898, + "step": 340 + }, + { + "epoch": 0.03046887216029665, + "grad_norm": 0.5707947015762329, + "learning_rate": 9.999994764959675e-05, + "loss": 1.1645, + "step": 341 + }, + { + "epoch": 0.030558223691558514, + "grad_norm": 0.5686008930206299, + "learning_rate": 9.99999246154251e-05, + "loss": 1.1141, + "step": 342 + }, + { + "epoch": 0.03064757522282038, + "grad_norm": 0.551203727722168, + "learning_rate": 9.999989739322682e-05, + "loss": 1.1881, + "step": 343 + }, + { + "epoch": 0.03073692675408225, + "grad_norm": 0.5531251430511475, + "learning_rate": 9.999986598300417e-05, + "loss": 1.0881, + "step": 344 + }, + { + "epoch": 0.030826278285344115, + "grad_norm": 0.5076424479484558, + "learning_rate": 9.999983038475978e-05, + "loss": 1.1175, + "step": 345 + }, + { + "epoch": 0.03091562981660598, + "grad_norm": 0.5364487171173096, + "learning_rate": 9.999979059849662e-05, + "loss": 1.1934, + "step": 346 + }, + { + "epoch": 0.03100498134786785, + "grad_norm": 0.557384729385376, + "learning_rate": 9.999974662421805e-05, + "loss": 1.1748, + "step": 347 + }, + { + "epoch": 0.031094332879129716, + "grad_norm": 0.5250005125999451, + "learning_rate": 9.999969846192774e-05, + "loss": 1.1338, + "step": 348 + }, + { + "epoch": 0.03118368441039158, + "grad_norm": 0.4806252121925354, + "learning_rate": 9.999964611162974e-05, + "loss": 1.1639, + "step": 349 + }, + { + "epoch": 0.03127303594165345, + "grad_norm": 0.520580530166626, + "learning_rate": 9.99995895733284e-05, + "loss": 1.1534, + "step": 350 + }, + { + "epoch": 0.03136238747291532, + "grad_norm": 0.5017713308334351, + "learning_rate": 9.999952884702848e-05, + "loss": 1.1473, + "step": 351 + }, + { + "epoch": 0.031451739004177186, + "grad_norm": 0.5286357402801514, + "learning_rate": 9.999946393273506e-05, + "loss": 1.179, + "step": 352 + }, + { + "epoch": 0.03154109053543905, + "grad_norm": 0.5509297251701355, + "learning_rate": 9.999939483045359e-05, + "loss": 1.1269, + "step": 353 + }, + { + "epoch": 0.03163044206670092, + "grad_norm": 0.4697697162628174, + "learning_rate": 9.999932154018983e-05, + "loss": 1.1569, + "step": 354 + }, + { + "epoch": 0.031719793597962787, + "grad_norm": 0.4342000484466553, + "learning_rate": 9.999924406194996e-05, + "loss": 1.1648, + "step": 355 + }, + { + "epoch": 0.03180914512922465, + "grad_norm": 0.5409436225891113, + "learning_rate": 9.999916239574043e-05, + "loss": 1.0954, + "step": 356 + }, + { + "epoch": 0.03189849666048652, + "grad_norm": 0.529708981513977, + "learning_rate": 9.99990765415681e-05, + "loss": 1.1114, + "step": 357 + }, + { + "epoch": 0.03198784819174839, + "grad_norm": 0.5057774186134338, + "learning_rate": 9.999898649944016e-05, + "loss": 1.1176, + "step": 358 + }, + { + "epoch": 0.032077199723010257, + "grad_norm": 0.4980236291885376, + "learning_rate": 9.999889226936415e-05, + "loss": 1.1574, + "step": 359 + }, + { + "epoch": 0.03216655125427212, + "grad_norm": 0.49295690655708313, + "learning_rate": 9.999879385134797e-05, + "loss": 1.1922, + "step": 360 + }, + { + "epoch": 0.03225590278553399, + "grad_norm": 0.4822183847427368, + "learning_rate": 9.999869124539984e-05, + "loss": 1.0773, + "step": 361 + }, + { + "epoch": 0.03234525431679586, + "grad_norm": 0.5717249512672424, + "learning_rate": 9.999858445152839e-05, + "loss": 1.1254, + "step": 362 + }, + { + "epoch": 0.03243460584805772, + "grad_norm": 0.49593716859817505, + "learning_rate": 9.999847346974253e-05, + "loss": 1.1105, + "step": 363 + }, + { + "epoch": 0.03252395737931959, + "grad_norm": 0.5179683566093445, + "learning_rate": 9.999835830005158e-05, + "loss": 1.0686, + "step": 364 + }, + { + "epoch": 0.03261330891058146, + "grad_norm": 0.5655368566513062, + "learning_rate": 9.999823894246517e-05, + "loss": 1.0889, + "step": 365 + }, + { + "epoch": 0.03270266044184332, + "grad_norm": 0.49578356742858887, + "learning_rate": 9.999811539699331e-05, + "loss": 1.1124, + "step": 366 + }, + { + "epoch": 0.03279201197310519, + "grad_norm": 0.5003335475921631, + "learning_rate": 9.999798766364634e-05, + "loss": 1.1669, + "step": 367 + }, + { + "epoch": 0.03288136350436706, + "grad_norm": 0.5258259177207947, + "learning_rate": 9.999785574243496e-05, + "loss": 1.1333, + "step": 368 + }, + { + "epoch": 0.03297071503562892, + "grad_norm": 0.5132459998130798, + "learning_rate": 9.999771963337024e-05, + "loss": 1.1795, + "step": 369 + }, + { + "epoch": 0.03306006656689079, + "grad_norm": 0.48852595686912537, + "learning_rate": 9.999757933646354e-05, + "loss": 1.127, + "step": 370 + }, + { + "epoch": 0.03314941809815266, + "grad_norm": 0.530531108379364, + "learning_rate": 9.999743485172666e-05, + "loss": 1.1424, + "step": 371 + }, + { + "epoch": 0.03323876962941452, + "grad_norm": 0.45526745915412903, + "learning_rate": 9.999728617917165e-05, + "loss": 1.213, + "step": 372 + }, + { + "epoch": 0.03332812116067639, + "grad_norm": 0.5421516299247742, + "learning_rate": 9.9997133318811e-05, + "loss": 1.1793, + "step": 373 + }, + { + "epoch": 0.03341747269193826, + "grad_norm": 0.5866374373435974, + "learning_rate": 9.999697627065752e-05, + "loss": 1.0692, + "step": 374 + }, + { + "epoch": 0.03350682422320012, + "grad_norm": 0.5176992416381836, + "learning_rate": 9.999681503472433e-05, + "loss": 1.1502, + "step": 375 + }, + { + "epoch": 0.03359617575446199, + "grad_norm": 0.5438038110733032, + "learning_rate": 9.999664961102495e-05, + "loss": 1.1342, + "step": 376 + }, + { + "epoch": 0.03368552728572386, + "grad_norm": 0.5158547759056091, + "learning_rate": 9.999647999957325e-05, + "loss": 1.1954, + "step": 377 + }, + { + "epoch": 0.03377487881698572, + "grad_norm": 0.46927282214164734, + "learning_rate": 9.999630620038343e-05, + "loss": 1.1292, + "step": 378 + }, + { + "epoch": 0.03386423034824759, + "grad_norm": 0.5032052397727966, + "learning_rate": 9.999612821347003e-05, + "loss": 1.108, + "step": 379 + }, + { + "epoch": 0.03395358187950946, + "grad_norm": 0.48456865549087524, + "learning_rate": 9.999594603884798e-05, + "loss": 1.1719, + "step": 380 + }, + { + "epoch": 0.034042933410771324, + "grad_norm": 0.49805065989494324, + "learning_rate": 9.999575967653252e-05, + "loss": 1.2259, + "step": 381 + }, + { + "epoch": 0.03413228494203319, + "grad_norm": 0.5582330226898193, + "learning_rate": 9.999556912653929e-05, + "loss": 1.2235, + "step": 382 + }, + { + "epoch": 0.03422163647329506, + "grad_norm": 0.5083693265914917, + "learning_rate": 9.999537438888423e-05, + "loss": 1.1593, + "step": 383 + }, + { + "epoch": 0.034310988004556925, + "grad_norm": 0.49797725677490234, + "learning_rate": 9.999517546358364e-05, + "loss": 1.1255, + "step": 384 + }, + { + "epoch": 0.034400339535818794, + "grad_norm": 0.5025597214698792, + "learning_rate": 9.999497235065418e-05, + "loss": 1.093, + "step": 385 + }, + { + "epoch": 0.03448969106708066, + "grad_norm": 0.49305295944213867, + "learning_rate": 9.999476505011289e-05, + "loss": 1.1264, + "step": 386 + }, + { + "epoch": 0.034579042598342526, + "grad_norm": 0.46636465191841125, + "learning_rate": 9.999455356197713e-05, + "loss": 1.0769, + "step": 387 + }, + { + "epoch": 0.034668394129604395, + "grad_norm": 0.5070281028747559, + "learning_rate": 9.999433788626461e-05, + "loss": 1.0925, + "step": 388 + }, + { + "epoch": 0.034757745660866264, + "grad_norm": 0.5060571432113647, + "learning_rate": 9.999411802299339e-05, + "loss": 1.2208, + "step": 389 + }, + { + "epoch": 0.03484709719212813, + "grad_norm": 0.5558078289031982, + "learning_rate": 9.999389397218186e-05, + "loss": 1.1395, + "step": 390 + }, + { + "epoch": 0.034936448723389996, + "grad_norm": 0.49151378870010376, + "learning_rate": 9.999366573384884e-05, + "loss": 1.1426, + "step": 391 + }, + { + "epoch": 0.035025800254651865, + "grad_norm": 0.4773429036140442, + "learning_rate": 9.99934333080134e-05, + "loss": 1.1587, + "step": 392 + }, + { + "epoch": 0.035115151785913734, + "grad_norm": 0.43944311141967773, + "learning_rate": 9.999319669469505e-05, + "loss": 1.2301, + "step": 393 + }, + { + "epoch": 0.035204503317175596, + "grad_norm": 0.4840754270553589, + "learning_rate": 9.999295589391358e-05, + "loss": 1.1404, + "step": 394 + }, + { + "epoch": 0.035293854848437466, + "grad_norm": 0.49783623218536377, + "learning_rate": 9.999271090568918e-05, + "loss": 1.0676, + "step": 395 + }, + { + "epoch": 0.035383206379699335, + "grad_norm": 0.47931092977523804, + "learning_rate": 9.999246173004233e-05, + "loss": 1.1528, + "step": 396 + }, + { + "epoch": 0.0354725579109612, + "grad_norm": 0.47932660579681396, + "learning_rate": 9.999220836699395e-05, + "loss": 1.1604, + "step": 397 + }, + { + "epoch": 0.035561909442223066, + "grad_norm": 0.5339416861534119, + "learning_rate": 9.999195081656522e-05, + "loss": 1.097, + "step": 398 + }, + { + "epoch": 0.035651260973484936, + "grad_norm": 0.5252090096473694, + "learning_rate": 9.999168907877776e-05, + "loss": 1.085, + "step": 399 + }, + { + "epoch": 0.0357406125047468, + "grad_norm": 0.5167770981788635, + "learning_rate": 9.999142315365345e-05, + "loss": 1.1563, + "step": 400 + }, + { + "epoch": 0.03582996403600867, + "grad_norm": 0.6297523975372314, + "learning_rate": 9.999115304121457e-05, + "loss": 1.0762, + "step": 401 + }, + { + "epoch": 0.035919315567270536, + "grad_norm": 0.5326511859893799, + "learning_rate": 9.999087874148379e-05, + "loss": 1.1082, + "step": 402 + }, + { + "epoch": 0.0360086670985324, + "grad_norm": 0.5279747843742371, + "learning_rate": 9.999060025448403e-05, + "loss": 1.1412, + "step": 403 + }, + { + "epoch": 0.03609801862979427, + "grad_norm": 0.4827171564102173, + "learning_rate": 9.999031758023863e-05, + "loss": 1.1705, + "step": 404 + }, + { + "epoch": 0.03618737016105614, + "grad_norm": 0.5073494911193848, + "learning_rate": 9.999003071877129e-05, + "loss": 1.1579, + "step": 405 + }, + { + "epoch": 0.036276721692318, + "grad_norm": 0.4927610158920288, + "learning_rate": 9.9989739670106e-05, + "loss": 1.1158, + "step": 406 + }, + { + "epoch": 0.03636607322357987, + "grad_norm": 0.45825162529945374, + "learning_rate": 9.998944443426719e-05, + "loss": 1.1761, + "step": 407 + }, + { + "epoch": 0.03645542475484174, + "grad_norm": 0.5363956093788147, + "learning_rate": 9.998914501127954e-05, + "loss": 1.15, + "step": 408 + }, + { + "epoch": 0.0365447762861036, + "grad_norm": 0.5403926968574524, + "learning_rate": 9.998884140116816e-05, + "loss": 1.0831, + "step": 409 + }, + { + "epoch": 0.03663412781736547, + "grad_norm": 0.5043397545814514, + "learning_rate": 9.998853360395846e-05, + "loss": 1.0479, + "step": 410 + }, + { + "epoch": 0.03672347934862734, + "grad_norm": 0.5072253346443176, + "learning_rate": 9.998822161967623e-05, + "loss": 1.0596, + "step": 411 + }, + { + "epoch": 0.0368128308798892, + "grad_norm": 0.4440356492996216, + "learning_rate": 9.99879054483476e-05, + "loss": 1.1423, + "step": 412 + }, + { + "epoch": 0.03690218241115107, + "grad_norm": 0.5258128046989441, + "learning_rate": 9.998758508999906e-05, + "loss": 1.1072, + "step": 413 + }, + { + "epoch": 0.03699153394241294, + "grad_norm": 0.48225632309913635, + "learning_rate": 9.998726054465744e-05, + "loss": 1.1436, + "step": 414 + }, + { + "epoch": 0.0370808854736748, + "grad_norm": 0.49311602115631104, + "learning_rate": 9.998693181234992e-05, + "loss": 1.0847, + "step": 415 + }, + { + "epoch": 0.03717023700493667, + "grad_norm": 0.46585437655448914, + "learning_rate": 9.998659889310406e-05, + "loss": 1.1562, + "step": 416 + }, + { + "epoch": 0.03725958853619854, + "grad_norm": 0.5066165924072266, + "learning_rate": 9.99862617869477e-05, + "loss": 1.1278, + "step": 417 + }, + { + "epoch": 0.03734894006746041, + "grad_norm": 0.4618039131164551, + "learning_rate": 9.998592049390911e-05, + "loss": 1.1738, + "step": 418 + }, + { + "epoch": 0.03743829159872227, + "grad_norm": 0.4884462356567383, + "learning_rate": 9.998557501401687e-05, + "loss": 1.1826, + "step": 419 + }, + { + "epoch": 0.03752764312998414, + "grad_norm": 0.4807124137878418, + "learning_rate": 9.99852253472999e-05, + "loss": 1.1397, + "step": 420 + }, + { + "epoch": 0.03761699466124601, + "grad_norm": 0.4974918067455292, + "learning_rate": 9.998487149378752e-05, + "loss": 1.1481, + "step": 421 + }, + { + "epoch": 0.03770634619250787, + "grad_norm": 0.42765894532203674, + "learning_rate": 9.998451345350935e-05, + "loss": 1.1967, + "step": 422 + }, + { + "epoch": 0.03779569772376974, + "grad_norm": 0.509459912776947, + "learning_rate": 9.998415122649537e-05, + "loss": 1.1086, + "step": 423 + }, + { + "epoch": 0.03788504925503161, + "grad_norm": 0.4979933202266693, + "learning_rate": 9.998378481277593e-05, + "loss": 1.1003, + "step": 424 + }, + { + "epoch": 0.03797440078629347, + "grad_norm": 0.4624142348766327, + "learning_rate": 9.998341421238173e-05, + "loss": 1.1323, + "step": 425 + }, + { + "epoch": 0.03806375231755534, + "grad_norm": 0.4574481248855591, + "learning_rate": 9.998303942534382e-05, + "loss": 1.1195, + "step": 426 + }, + { + "epoch": 0.03815310384881721, + "grad_norm": 0.542635977268219, + "learning_rate": 9.998266045169356e-05, + "loss": 1.1185, + "step": 427 + }, + { + "epoch": 0.038242455380079074, + "grad_norm": 0.48302701115608215, + "learning_rate": 9.99822772914627e-05, + "loss": 1.13, + "step": 428 + }, + { + "epoch": 0.03833180691134094, + "grad_norm": 0.4780206084251404, + "learning_rate": 9.998188994468337e-05, + "loss": 1.0496, + "step": 429 + }, + { + "epoch": 0.03842115844260281, + "grad_norm": 0.5406165719032288, + "learning_rate": 9.998149841138797e-05, + "loss": 1.0936, + "step": 430 + }, + { + "epoch": 0.038510509973864675, + "grad_norm": 0.4520138204097748, + "learning_rate": 9.998110269160932e-05, + "loss": 1.2128, + "step": 431 + }, + { + "epoch": 0.038599861505126544, + "grad_norm": 0.4702879786491394, + "learning_rate": 9.998070278538057e-05, + "loss": 1.2563, + "step": 432 + }, + { + "epoch": 0.03868921303638841, + "grad_norm": 0.5223757028579712, + "learning_rate": 9.998029869273518e-05, + "loss": 1.091, + "step": 433 + }, + { + "epoch": 0.038778564567650275, + "grad_norm": 0.5411685705184937, + "learning_rate": 9.997989041370704e-05, + "loss": 1.1356, + "step": 434 + }, + { + "epoch": 0.038867916098912145, + "grad_norm": 0.48428136110305786, + "learning_rate": 9.997947794833034e-05, + "loss": 1.2075, + "step": 435 + }, + { + "epoch": 0.038957267630174014, + "grad_norm": 0.4824248254299164, + "learning_rate": 9.997906129663961e-05, + "loss": 1.1918, + "step": 436 + }, + { + "epoch": 0.039046619161435876, + "grad_norm": 0.5200150609016418, + "learning_rate": 9.997864045866975e-05, + "loss": 1.1364, + "step": 437 + }, + { + "epoch": 0.039135970692697745, + "grad_norm": 0.5119284391403198, + "learning_rate": 9.997821543445602e-05, + "loss": 1.1136, + "step": 438 + }, + { + "epoch": 0.039225322223959615, + "grad_norm": 0.4953431785106659, + "learning_rate": 9.997778622403402e-05, + "loss": 1.2217, + "step": 439 + }, + { + "epoch": 0.03931467375522148, + "grad_norm": 0.49554356932640076, + "learning_rate": 9.997735282743969e-05, + "loss": 1.1535, + "step": 440 + }, + { + "epoch": 0.039404025286483346, + "grad_norm": 0.5138264894485474, + "learning_rate": 9.997691524470936e-05, + "loss": 0.9905, + "step": 441 + }, + { + "epoch": 0.039493376817745215, + "grad_norm": 0.4627537727355957, + "learning_rate": 9.997647347587964e-05, + "loss": 1.1075, + "step": 442 + }, + { + "epoch": 0.03958272834900708, + "grad_norm": 0.5191687941551208, + "learning_rate": 9.997602752098758e-05, + "loss": 1.172, + "step": 443 + }, + { + "epoch": 0.03967207988026895, + "grad_norm": 0.4805525839328766, + "learning_rate": 9.997557738007049e-05, + "loss": 1.154, + "step": 444 + }, + { + "epoch": 0.039761431411530816, + "grad_norm": 0.5130792260169983, + "learning_rate": 9.99751230531661e-05, + "loss": 1.0842, + "step": 445 + }, + { + "epoch": 0.039850782942792685, + "grad_norm": 0.4610874354839325, + "learning_rate": 9.997466454031246e-05, + "loss": 1.0973, + "step": 446 + }, + { + "epoch": 0.03994013447405455, + "grad_norm": 0.43982231616973877, + "learning_rate": 9.997420184154798e-05, + "loss": 1.156, + "step": 447 + }, + { + "epoch": 0.04002948600531642, + "grad_norm": 0.49081990122795105, + "learning_rate": 9.99737349569114e-05, + "loss": 1.0796, + "step": 448 + }, + { + "epoch": 0.040118837536578286, + "grad_norm": 0.5369452238082886, + "learning_rate": 9.997326388644183e-05, + "loss": 1.0677, + "step": 449 + }, + { + "epoch": 0.04020818906784015, + "grad_norm": 0.4784664213657379, + "learning_rate": 9.997278863017874e-05, + "loss": 1.0792, + "step": 450 + }, + { + "epoch": 0.04029754059910202, + "grad_norm": 0.53533935546875, + "learning_rate": 9.997230918816191e-05, + "loss": 1.0963, + "step": 451 + }, + { + "epoch": 0.04038689213036389, + "grad_norm": 0.5231233239173889, + "learning_rate": 9.997182556043155e-05, + "loss": 1.1181, + "step": 452 + }, + { + "epoch": 0.04047624366162575, + "grad_norm": 0.4834752380847931, + "learning_rate": 9.997133774702812e-05, + "loss": 1.1174, + "step": 453 + }, + { + "epoch": 0.04056559519288762, + "grad_norm": 0.48885536193847656, + "learning_rate": 9.997084574799252e-05, + "loss": 1.0655, + "step": 454 + }, + { + "epoch": 0.04065494672414949, + "grad_norm": 0.4479488730430603, + "learning_rate": 9.99703495633659e-05, + "loss": 1.1568, + "step": 455 + }, + { + "epoch": 0.04074429825541135, + "grad_norm": 0.5108731985092163, + "learning_rate": 9.996984919318989e-05, + "loss": 1.1712, + "step": 456 + }, + { + "epoch": 0.04083364978667322, + "grad_norm": 0.5030118227005005, + "learning_rate": 9.996934463750636e-05, + "loss": 1.0666, + "step": 457 + }, + { + "epoch": 0.04092300131793509, + "grad_norm": 0.5227623581886292, + "learning_rate": 9.996883589635757e-05, + "loss": 1.1652, + "step": 458 + }, + { + "epoch": 0.04101235284919695, + "grad_norm": 0.4760054349899292, + "learning_rate": 9.996832296978616e-05, + "loss": 1.151, + "step": 459 + }, + { + "epoch": 0.04110170438045882, + "grad_norm": 0.42174920439720154, + "learning_rate": 9.996780585783508e-05, + "loss": 1.1306, + "step": 460 + }, + { + "epoch": 0.04119105591172069, + "grad_norm": 0.42818742990493774, + "learning_rate": 9.996728456054762e-05, + "loss": 1.1775, + "step": 461 + }, + { + "epoch": 0.04128040744298255, + "grad_norm": 0.48023608326911926, + "learning_rate": 9.996675907796749e-05, + "loss": 1.1809, + "step": 462 + }, + { + "epoch": 0.04136975897424442, + "grad_norm": 0.45978084206581116, + "learning_rate": 9.996622941013867e-05, + "loss": 1.2312, + "step": 463 + }, + { + "epoch": 0.04145911050550629, + "grad_norm": 0.4810321629047394, + "learning_rate": 9.996569555710553e-05, + "loss": 1.0946, + "step": 464 + }, + { + "epoch": 0.04154846203676815, + "grad_norm": 0.43682757019996643, + "learning_rate": 9.996515751891279e-05, + "loss": 1.1556, + "step": 465 + }, + { + "epoch": 0.04163781356803002, + "grad_norm": 0.4703795313835144, + "learning_rate": 9.996461529560553e-05, + "loss": 1.1268, + "step": 466 + }, + { + "epoch": 0.04172716509929189, + "grad_norm": 0.44790583848953247, + "learning_rate": 9.996406888722914e-05, + "loss": 1.0907, + "step": 467 + }, + { + "epoch": 0.04181651663055375, + "grad_norm": 0.5048056840896606, + "learning_rate": 9.996351829382941e-05, + "loss": 1.0709, + "step": 468 + }, + { + "epoch": 0.04190586816181562, + "grad_norm": 0.4316783845424652, + "learning_rate": 9.996296351545244e-05, + "loss": 1.1355, + "step": 469 + }, + { + "epoch": 0.04199521969307749, + "grad_norm": 0.4551528990268707, + "learning_rate": 9.996240455214472e-05, + "loss": 1.0943, + "step": 470 + }, + { + "epoch": 0.042084571224339354, + "grad_norm": 0.4864242970943451, + "learning_rate": 9.996184140395306e-05, + "loss": 1.0194, + "step": 471 + }, + { + "epoch": 0.04217392275560122, + "grad_norm": 0.5039882659912109, + "learning_rate": 9.996127407092462e-05, + "loss": 1.1392, + "step": 472 + }, + { + "epoch": 0.04226327428686309, + "grad_norm": 0.5886643528938293, + "learning_rate": 9.996070255310692e-05, + "loss": 1.0797, + "step": 473 + }, + { + "epoch": 0.04235262581812496, + "grad_norm": 0.5036435723304749, + "learning_rate": 9.996012685054786e-05, + "loss": 1.1886, + "step": 474 + }, + { + "epoch": 0.042441977349386824, + "grad_norm": 0.4894576370716095, + "learning_rate": 9.995954696329562e-05, + "loss": 1.1631, + "step": 475 + }, + { + "epoch": 0.04253132888064869, + "grad_norm": 0.4920431673526764, + "learning_rate": 9.99589628913988e-05, + "loss": 1.1382, + "step": 476 + }, + { + "epoch": 0.04262068041191056, + "grad_norm": 0.42358994483947754, + "learning_rate": 9.995837463490632e-05, + "loss": 1.1086, + "step": 477 + }, + { + "epoch": 0.042710031943172425, + "grad_norm": 0.4317459464073181, + "learning_rate": 9.995778219386744e-05, + "loss": 1.132, + "step": 478 + }, + { + "epoch": 0.042799383474434294, + "grad_norm": 0.4853540062904358, + "learning_rate": 9.995718556833178e-05, + "loss": 1.0694, + "step": 479 + }, + { + "epoch": 0.04288873500569616, + "grad_norm": 0.49975448846817017, + "learning_rate": 9.995658475834933e-05, + "loss": 1.1211, + "step": 480 + }, + { + "epoch": 0.042978086536958025, + "grad_norm": 0.4265710115432739, + "learning_rate": 9.995597976397042e-05, + "loss": 1.1266, + "step": 481 + }, + { + "epoch": 0.043067438068219895, + "grad_norm": 0.5126653909683228, + "learning_rate": 9.995537058524569e-05, + "loss": 1.1113, + "step": 482 + }, + { + "epoch": 0.043156789599481764, + "grad_norm": 0.5130075812339783, + "learning_rate": 9.99547572222262e-05, + "loss": 1.1382, + "step": 483 + }, + { + "epoch": 0.043246141130743626, + "grad_norm": 0.46531277894973755, + "learning_rate": 9.995413967496333e-05, + "loss": 1.0806, + "step": 484 + }, + { + "epoch": 0.043335492662005495, + "grad_norm": 0.4815559983253479, + "learning_rate": 9.995351794350876e-05, + "loss": 1.136, + "step": 485 + }, + { + "epoch": 0.043424844193267365, + "grad_norm": 0.417111873626709, + "learning_rate": 9.99528920279146e-05, + "loss": 1.158, + "step": 486 + }, + { + "epoch": 0.04351419572452923, + "grad_norm": 0.4570912718772888, + "learning_rate": 9.995226192823329e-05, + "loss": 1.134, + "step": 487 + }, + { + "epoch": 0.043603547255791096, + "grad_norm": 0.5166110992431641, + "learning_rate": 9.995162764451758e-05, + "loss": 1.1111, + "step": 488 + }, + { + "epoch": 0.043692898787052965, + "grad_norm": 0.4371122419834137, + "learning_rate": 9.99509891768206e-05, + "loss": 1.1687, + "step": 489 + }, + { + "epoch": 0.04378225031831483, + "grad_norm": 0.5123320817947388, + "learning_rate": 9.995034652519586e-05, + "loss": 1.1159, + "step": 490 + }, + { + "epoch": 0.0438716018495767, + "grad_norm": 0.5270020961761475, + "learning_rate": 9.994969968969715e-05, + "loss": 1.0321, + "step": 491 + }, + { + "epoch": 0.043960953380838566, + "grad_norm": 0.43343213200569153, + "learning_rate": 9.994904867037867e-05, + "loss": 1.1311, + "step": 492 + }, + { + "epoch": 0.04405030491210043, + "grad_norm": 0.5779858827590942, + "learning_rate": 9.994839346729495e-05, + "loss": 1.0312, + "step": 493 + }, + { + "epoch": 0.0441396564433623, + "grad_norm": 0.4757930338382721, + "learning_rate": 9.994773408050084e-05, + "loss": 1.1148, + "step": 494 + }, + { + "epoch": 0.04422900797462417, + "grad_norm": 0.483888179063797, + "learning_rate": 9.994707051005164e-05, + "loss": 1.1387, + "step": 495 + }, + { + "epoch": 0.04431835950588603, + "grad_norm": 0.4878624677658081, + "learning_rate": 9.994640275600285e-05, + "loss": 1.1294, + "step": 496 + }, + { + "epoch": 0.0444077110371479, + "grad_norm": 0.5229454040527344, + "learning_rate": 9.994573081841046e-05, + "loss": 1.0685, + "step": 497 + }, + { + "epoch": 0.04449706256840977, + "grad_norm": 0.49779602885246277, + "learning_rate": 9.994505469733071e-05, + "loss": 1.1596, + "step": 498 + }, + { + "epoch": 0.04458641409967163, + "grad_norm": 0.5035576224327087, + "learning_rate": 9.994437439282027e-05, + "loss": 1.1966, + "step": 499 + }, + { + "epoch": 0.0446757656309335, + "grad_norm": 0.4735001027584076, + "learning_rate": 9.99436899049361e-05, + "loss": 1.1213, + "step": 500 + }, + { + "epoch": 0.04476511716219537, + "grad_norm": 0.5072652697563171, + "learning_rate": 9.994300123373554e-05, + "loss": 1.1671, + "step": 501 + }, + { + "epoch": 0.04485446869345724, + "grad_norm": 0.4931294620037079, + "learning_rate": 9.994230837927627e-05, + "loss": 1.1065, + "step": 502 + }, + { + "epoch": 0.0449438202247191, + "grad_norm": 0.4530344307422638, + "learning_rate": 9.994161134161634e-05, + "loss": 1.1528, + "step": 503 + }, + { + "epoch": 0.04503317175598097, + "grad_norm": 0.5151768326759338, + "learning_rate": 9.99409101208141e-05, + "loss": 0.9932, + "step": 504 + }, + { + "epoch": 0.04512252328724284, + "grad_norm": 0.5038042068481445, + "learning_rate": 9.994020471692833e-05, + "loss": 1.1132, + "step": 505 + }, + { + "epoch": 0.0452118748185047, + "grad_norm": 0.47841110825538635, + "learning_rate": 9.993949513001807e-05, + "loss": 1.1563, + "step": 506 + }, + { + "epoch": 0.04530122634976657, + "grad_norm": 0.5167407989501953, + "learning_rate": 9.993878136014278e-05, + "loss": 1.1383, + "step": 507 + }, + { + "epoch": 0.04539057788102844, + "grad_norm": 0.5208450555801392, + "learning_rate": 9.993806340736225e-05, + "loss": 1.1225, + "step": 508 + }, + { + "epoch": 0.0454799294122903, + "grad_norm": 0.5085687637329102, + "learning_rate": 9.99373412717366e-05, + "loss": 1.1747, + "step": 509 + }, + { + "epoch": 0.04556928094355217, + "grad_norm": 0.5266485214233398, + "learning_rate": 9.993661495332633e-05, + "loss": 1.1311, + "step": 510 + }, + { + "epoch": 0.04565863247481404, + "grad_norm": 0.5289018154144287, + "learning_rate": 9.993588445219227e-05, + "loss": 1.114, + "step": 511 + }, + { + "epoch": 0.0457479840060759, + "grad_norm": 0.4422353208065033, + "learning_rate": 9.99351497683956e-05, + "loss": 1.178, + "step": 512 + }, + { + "epoch": 0.04583733553733777, + "grad_norm": 0.5575171113014221, + "learning_rate": 9.993441090199787e-05, + "loss": 1.0279, + "step": 513 + }, + { + "epoch": 0.04592668706859964, + "grad_norm": 0.53556227684021, + "learning_rate": 9.993366785306097e-05, + "loss": 1.1212, + "step": 514 + }, + { + "epoch": 0.0460160385998615, + "grad_norm": 0.5543893575668335, + "learning_rate": 9.993292062164714e-05, + "loss": 1.0113, + "step": 515 + }, + { + "epoch": 0.04610539013112337, + "grad_norm": 0.5223544239997864, + "learning_rate": 9.993216920781894e-05, + "loss": 1.0224, + "step": 516 + }, + { + "epoch": 0.04619474166238524, + "grad_norm": 0.5060791969299316, + "learning_rate": 9.993141361163935e-05, + "loss": 1.1628, + "step": 517 + }, + { + "epoch": 0.046284093193647104, + "grad_norm": 0.4724702537059784, + "learning_rate": 9.993065383317163e-05, + "loss": 1.1255, + "step": 518 + }, + { + "epoch": 0.04637344472490897, + "grad_norm": 0.47285956144332886, + "learning_rate": 9.992988987247944e-05, + "loss": 1.1659, + "step": 519 + }, + { + "epoch": 0.04646279625617084, + "grad_norm": 0.4982796013355255, + "learning_rate": 9.992912172962674e-05, + "loss": 1.1952, + "step": 520 + }, + { + "epoch": 0.046552147787432704, + "grad_norm": 0.48061710596084595, + "learning_rate": 9.99283494046779e-05, + "loss": 1.1388, + "step": 521 + }, + { + "epoch": 0.046641499318694574, + "grad_norm": 0.45872175693511963, + "learning_rate": 9.99275728976976e-05, + "loss": 1.1262, + "step": 522 + }, + { + "epoch": 0.04673085084995644, + "grad_norm": 0.4480443000793457, + "learning_rate": 9.992679220875088e-05, + "loss": 1.1235, + "step": 523 + }, + { + "epoch": 0.046820202381218305, + "grad_norm": 0.43175145983695984, + "learning_rate": 9.992600733790314e-05, + "loss": 1.1185, + "step": 524 + }, + { + "epoch": 0.046909553912480174, + "grad_norm": 0.41958916187286377, + "learning_rate": 9.99252182852201e-05, + "loss": 1.1661, + "step": 525 + }, + { + "epoch": 0.046998905443742044, + "grad_norm": 0.4344329237937927, + "learning_rate": 9.992442505076787e-05, + "loss": 1.1364, + "step": 526 + }, + { + "epoch": 0.047088256975003906, + "grad_norm": 0.4704360067844391, + "learning_rate": 9.992362763461287e-05, + "loss": 1.0588, + "step": 527 + }, + { + "epoch": 0.047177608506265775, + "grad_norm": 0.4403749704360962, + "learning_rate": 9.992282603682192e-05, + "loss": 1.1517, + "step": 528 + }, + { + "epoch": 0.047266960037527644, + "grad_norm": 0.4906516969203949, + "learning_rate": 9.992202025746215e-05, + "loss": 1.0916, + "step": 529 + }, + { + "epoch": 0.04735631156878951, + "grad_norm": 0.5123149752616882, + "learning_rate": 9.992121029660106e-05, + "loss": 1.0794, + "step": 530 + }, + { + "epoch": 0.047445663100051376, + "grad_norm": 0.46089616417884827, + "learning_rate": 9.992039615430648e-05, + "loss": 1.1846, + "step": 531 + }, + { + "epoch": 0.047535014631313245, + "grad_norm": 0.5317028760910034, + "learning_rate": 9.99195778306466e-05, + "loss": 1.1289, + "step": 532 + }, + { + "epoch": 0.047624366162575114, + "grad_norm": 0.48291924595832825, + "learning_rate": 9.991875532568999e-05, + "loss": 1.1427, + "step": 533 + }, + { + "epoch": 0.04771371769383698, + "grad_norm": 0.46745729446411133, + "learning_rate": 9.991792863950552e-05, + "loss": 1.1027, + "step": 534 + }, + { + "epoch": 0.047803069225098846, + "grad_norm": 0.4555657207965851, + "learning_rate": 9.991709777216242e-05, + "loss": 1.0926, + "step": 535 + }, + { + "epoch": 0.047892420756360715, + "grad_norm": 0.4779694080352783, + "learning_rate": 9.991626272373033e-05, + "loss": 1.0919, + "step": 536 + }, + { + "epoch": 0.04798177228762258, + "grad_norm": 0.4468933045864105, + "learning_rate": 9.991542349427916e-05, + "loss": 1.1903, + "step": 537 + }, + { + "epoch": 0.04807112381888445, + "grad_norm": 0.5170602202415466, + "learning_rate": 9.99145800838792e-05, + "loss": 1.0183, + "step": 538 + }, + { + "epoch": 0.048160475350146316, + "grad_norm": 0.4570893347263336, + "learning_rate": 9.991373249260112e-05, + "loss": 1.0834, + "step": 539 + }, + { + "epoch": 0.04824982688140818, + "grad_norm": 0.4547278881072998, + "learning_rate": 9.99128807205159e-05, + "loss": 1.1534, + "step": 540 + }, + { + "epoch": 0.04833917841267005, + "grad_norm": 0.47675079107284546, + "learning_rate": 9.991202476769488e-05, + "loss": 1.0934, + "step": 541 + }, + { + "epoch": 0.04842852994393192, + "grad_norm": 0.4400666654109955, + "learning_rate": 9.991116463420976e-05, + "loss": 1.0891, + "step": 542 + }, + { + "epoch": 0.04851788147519378, + "grad_norm": 0.4726406931877136, + "learning_rate": 9.99103003201326e-05, + "loss": 1.0805, + "step": 543 + }, + { + "epoch": 0.04860723300645565, + "grad_norm": 0.484070748090744, + "learning_rate": 9.990943182553579e-05, + "loss": 1.0694, + "step": 544 + }, + { + "epoch": 0.04869658453771752, + "grad_norm": 0.5411748290061951, + "learning_rate": 9.990855915049204e-05, + "loss": 1.0275, + "step": 545 + }, + { + "epoch": 0.04878593606897938, + "grad_norm": 0.46557214856147766, + "learning_rate": 9.990768229507447e-05, + "loss": 1.2306, + "step": 546 + }, + { + "epoch": 0.04887528760024125, + "grad_norm": 0.5048271417617798, + "learning_rate": 9.990680125935657e-05, + "loss": 1.063, + "step": 547 + }, + { + "epoch": 0.04896463913150312, + "grad_norm": 0.4694403409957886, + "learning_rate": 9.990591604341206e-05, + "loss": 1.117, + "step": 548 + }, + { + "epoch": 0.04905399066276498, + "grad_norm": 0.5101834535598755, + "learning_rate": 9.990502664731515e-05, + "loss": 1.096, + "step": 549 + }, + { + "epoch": 0.04914334219402685, + "grad_norm": 0.4212850332260132, + "learning_rate": 9.99041330711403e-05, + "loss": 1.2322, + "step": 550 + }, + { + "epoch": 0.04923269372528872, + "grad_norm": 0.4730430245399475, + "learning_rate": 9.990323531496235e-05, + "loss": 1.1706, + "step": 551 + }, + { + "epoch": 0.04932204525655058, + "grad_norm": 0.4167949855327606, + "learning_rate": 9.990233337885652e-05, + "loss": 1.1796, + "step": 552 + }, + { + "epoch": 0.04941139678781245, + "grad_norm": 0.44390869140625, + "learning_rate": 9.990142726289837e-05, + "loss": 1.1358, + "step": 553 + }, + { + "epoch": 0.04950074831907432, + "grad_norm": 0.47111496329307556, + "learning_rate": 9.990051696716375e-05, + "loss": 1.1188, + "step": 554 + }, + { + "epoch": 0.04959009985033618, + "grad_norm": 0.40082886815071106, + "learning_rate": 9.989960249172894e-05, + "loss": 1.1225, + "step": 555 + }, + { + "epoch": 0.04967945138159805, + "grad_norm": 0.42682352662086487, + "learning_rate": 9.989868383667054e-05, + "loss": 1.0989, + "step": 556 + }, + { + "epoch": 0.04976880291285992, + "grad_norm": 0.45663878321647644, + "learning_rate": 9.989776100206548e-05, + "loss": 1.1296, + "step": 557 + }, + { + "epoch": 0.04985815444412178, + "grad_norm": 0.5189902782440186, + "learning_rate": 9.989683398799106e-05, + "loss": 1.0356, + "step": 558 + }, + { + "epoch": 0.04994750597538365, + "grad_norm": 0.41676539182662964, + "learning_rate": 9.989590279452492e-05, + "loss": 1.0858, + "step": 559 + }, + { + "epoch": 0.05003685750664552, + "grad_norm": 0.47391507029533386, + "learning_rate": 9.989496742174509e-05, + "loss": 1.1989, + "step": 560 + }, + { + "epoch": 0.05012620903790739, + "grad_norm": 0.48219195008277893, + "learning_rate": 9.989402786972988e-05, + "loss": 1.0994, + "step": 561 + }, + { + "epoch": 0.05021556056916925, + "grad_norm": 0.5182317495346069, + "learning_rate": 9.989308413855802e-05, + "loss": 1.068, + "step": 562 + }, + { + "epoch": 0.05030491210043112, + "grad_norm": 0.44291019439697266, + "learning_rate": 9.989213622830853e-05, + "loss": 1.0893, + "step": 563 + }, + { + "epoch": 0.05039426363169299, + "grad_norm": 0.4342930018901825, + "learning_rate": 9.989118413906082e-05, + "loss": 1.1266, + "step": 564 + }, + { + "epoch": 0.050483615162954854, + "grad_norm": 0.4785180985927582, + "learning_rate": 9.989022787089463e-05, + "loss": 1.1128, + "step": 565 + }, + { + "epoch": 0.05057296669421672, + "grad_norm": 0.4673145115375519, + "learning_rate": 9.988926742389009e-05, + "loss": 1.087, + "step": 566 + }, + { + "epoch": 0.05066231822547859, + "grad_norm": 0.42585358023643494, + "learning_rate": 9.98883027981276e-05, + "loss": 1.1635, + "step": 567 + }, + { + "epoch": 0.050751669756740454, + "grad_norm": 0.4596778154373169, + "learning_rate": 9.988733399368799e-05, + "loss": 1.1818, + "step": 568 + }, + { + "epoch": 0.050841021288002324, + "grad_norm": 0.4845656454563141, + "learning_rate": 9.988636101065239e-05, + "loss": 1.1218, + "step": 569 + }, + { + "epoch": 0.05093037281926419, + "grad_norm": 0.4579009413719177, + "learning_rate": 9.988538384910231e-05, + "loss": 1.0752, + "step": 570 + }, + { + "epoch": 0.051019724350526055, + "grad_norm": 0.4427371025085449, + "learning_rate": 9.988440250911959e-05, + "loss": 1.1323, + "step": 571 + }, + { + "epoch": 0.051109075881787924, + "grad_norm": 0.4617055654525757, + "learning_rate": 9.988341699078643e-05, + "loss": 1.1641, + "step": 572 + }, + { + "epoch": 0.051198427413049793, + "grad_norm": 0.4859091639518738, + "learning_rate": 9.988242729418538e-05, + "loss": 1.0782, + "step": 573 + }, + { + "epoch": 0.051287778944311656, + "grad_norm": 0.4534977674484253, + "learning_rate": 9.988143341939933e-05, + "loss": 1.1194, + "step": 574 + }, + { + "epoch": 0.051377130475573525, + "grad_norm": 0.46073511242866516, + "learning_rate": 9.988043536651153e-05, + "loss": 1.091, + "step": 575 + }, + { + "epoch": 0.051466482006835394, + "grad_norm": 0.4721985459327698, + "learning_rate": 9.98794331356056e-05, + "loss": 1.1074, + "step": 576 + }, + { + "epoch": 0.05155583353809726, + "grad_norm": 0.45086175203323364, + "learning_rate": 9.987842672676544e-05, + "loss": 1.0933, + "step": 577 + }, + { + "epoch": 0.051645185069359126, + "grad_norm": 0.4174632728099823, + "learning_rate": 9.98774161400754e-05, + "loss": 1.1387, + "step": 578 + }, + { + "epoch": 0.051734536600620995, + "grad_norm": 0.4772718548774719, + "learning_rate": 9.987640137562008e-05, + "loss": 1.1551, + "step": 579 + }, + { + "epoch": 0.05182388813188286, + "grad_norm": 0.4615418016910553, + "learning_rate": 9.987538243348453e-05, + "loss": 1.1593, + "step": 580 + }, + { + "epoch": 0.05191323966314473, + "grad_norm": 0.48848336935043335, + "learning_rate": 9.987435931375406e-05, + "loss": 1.1167, + "step": 581 + }, + { + "epoch": 0.052002591194406596, + "grad_norm": 0.4644308090209961, + "learning_rate": 9.987333201651436e-05, + "loss": 1.0925, + "step": 582 + }, + { + "epoch": 0.05209194272566846, + "grad_norm": 0.46790382266044617, + "learning_rate": 9.98723005418515e-05, + "loss": 1.1186, + "step": 583 + }, + { + "epoch": 0.05218129425693033, + "grad_norm": 0.4854678213596344, + "learning_rate": 9.987126488985188e-05, + "loss": 1.1301, + "step": 584 + }, + { + "epoch": 0.0522706457881922, + "grad_norm": 0.47811320424079895, + "learning_rate": 9.987022506060221e-05, + "loss": 1.1431, + "step": 585 + }, + { + "epoch": 0.05235999731945406, + "grad_norm": 0.5481966733932495, + "learning_rate": 9.986918105418963e-05, + "loss": 1.0918, + "step": 586 + }, + { + "epoch": 0.05244934885071593, + "grad_norm": 0.4821578562259674, + "learning_rate": 9.986813287070158e-05, + "loss": 1.0715, + "step": 587 + }, + { + "epoch": 0.0525387003819778, + "grad_norm": 0.46433693170547485, + "learning_rate": 9.986708051022583e-05, + "loss": 1.0865, + "step": 588 + }, + { + "epoch": 0.052628051913239667, + "grad_norm": 0.44410815834999084, + "learning_rate": 9.986602397285054e-05, + "loss": 1.0884, + "step": 589 + }, + { + "epoch": 0.05271740344450153, + "grad_norm": 0.4573810398578644, + "learning_rate": 9.986496325866422e-05, + "loss": 1.1896, + "step": 590 + }, + { + "epoch": 0.0528067549757634, + "grad_norm": 0.5605972409248352, + "learning_rate": 9.986389836775569e-05, + "loss": 1.0038, + "step": 591 + }, + { + "epoch": 0.05289610650702527, + "grad_norm": 0.4885507822036743, + "learning_rate": 9.986282930021418e-05, + "loss": 1.1346, + "step": 592 + }, + { + "epoch": 0.05298545803828713, + "grad_norm": 0.4384009838104248, + "learning_rate": 9.986175605612921e-05, + "loss": 1.1165, + "step": 593 + }, + { + "epoch": 0.053074809569549, + "grad_norm": 0.4354461133480072, + "learning_rate": 9.986067863559067e-05, + "loss": 1.0599, + "step": 594 + }, + { + "epoch": 0.05316416110081087, + "grad_norm": 0.503073513507843, + "learning_rate": 9.985959703868884e-05, + "loss": 1.1166, + "step": 595 + }, + { + "epoch": 0.05325351263207273, + "grad_norm": 0.5100207924842834, + "learning_rate": 9.985851126551428e-05, + "loss": 1.1165, + "step": 596 + }, + { + "epoch": 0.0533428641633346, + "grad_norm": 0.5106130838394165, + "learning_rate": 9.985742131615794e-05, + "loss": 1.0688, + "step": 597 + }, + { + "epoch": 0.05343221569459647, + "grad_norm": 0.46070772409439087, + "learning_rate": 9.985632719071113e-05, + "loss": 1.1599, + "step": 598 + }, + { + "epoch": 0.05352156722585833, + "grad_norm": 0.45849481225013733, + "learning_rate": 9.985522888926549e-05, + "loss": 1.0446, + "step": 599 + }, + { + "epoch": 0.0536109187571202, + "grad_norm": 0.4356798827648163, + "learning_rate": 9.985412641191301e-05, + "loss": 1.1269, + "step": 600 + }, + { + "epoch": 0.05370027028838207, + "grad_norm": 0.5090345144271851, + "learning_rate": 9.985301975874604e-05, + "loss": 1.0689, + "step": 601 + }, + { + "epoch": 0.05378962181964393, + "grad_norm": 0.45965901017189026, + "learning_rate": 9.985190892985726e-05, + "loss": 1.0531, + "step": 602 + }, + { + "epoch": 0.0538789733509058, + "grad_norm": 0.45686468482017517, + "learning_rate": 9.985079392533974e-05, + "loss": 1.0928, + "step": 603 + }, + { + "epoch": 0.05396832488216767, + "grad_norm": 0.46496498584747314, + "learning_rate": 9.984967474528684e-05, + "loss": 0.9913, + "step": 604 + }, + { + "epoch": 0.05405767641342953, + "grad_norm": 0.41646531224250793, + "learning_rate": 9.984855138979233e-05, + "loss": 1.1572, + "step": 605 + }, + { + "epoch": 0.0541470279446914, + "grad_norm": 0.46098536252975464, + "learning_rate": 9.984742385895029e-05, + "loss": 1.0705, + "step": 606 + }, + { + "epoch": 0.05423637947595327, + "grad_norm": 0.44525283575057983, + "learning_rate": 9.984629215285516e-05, + "loss": 1.1466, + "step": 607 + }, + { + "epoch": 0.05432573100721513, + "grad_norm": 0.5084784030914307, + "learning_rate": 9.984515627160176e-05, + "loss": 1.0186, + "step": 608 + }, + { + "epoch": 0.054415082538477, + "grad_norm": 0.4281807839870453, + "learning_rate": 9.98440162152852e-05, + "loss": 1.1293, + "step": 609 + }, + { + "epoch": 0.05450443406973887, + "grad_norm": 0.44532129168510437, + "learning_rate": 9.984287198400098e-05, + "loss": 1.1377, + "step": 610 + }, + { + "epoch": 0.054593785601000734, + "grad_norm": 0.5224083662033081, + "learning_rate": 9.984172357784495e-05, + "loss": 1.1273, + "step": 611 + }, + { + "epoch": 0.0546831371322626, + "grad_norm": 0.5680818557739258, + "learning_rate": 9.984057099691329e-05, + "loss": 1.1502, + "step": 612 + }, + { + "epoch": 0.05477248866352447, + "grad_norm": 0.46198832988739014, + "learning_rate": 9.983941424130255e-05, + "loss": 1.1101, + "step": 613 + }, + { + "epoch": 0.054861840194786335, + "grad_norm": 0.4595877528190613, + "learning_rate": 9.983825331110961e-05, + "loss": 1.1089, + "step": 614 + }, + { + "epoch": 0.054951191726048204, + "grad_norm": 0.45024290680885315, + "learning_rate": 9.983708820643173e-05, + "loss": 1.116, + "step": 615 + }, + { + "epoch": 0.05504054325731007, + "grad_norm": 0.4610383212566376, + "learning_rate": 9.983591892736647e-05, + "loss": 1.0447, + "step": 616 + }, + { + "epoch": 0.05512989478857194, + "grad_norm": 0.5474917888641357, + "learning_rate": 9.983474547401182e-05, + "loss": 1.0719, + "step": 617 + }, + { + "epoch": 0.055219246319833805, + "grad_norm": 0.3890456259250641, + "learning_rate": 9.9833567846466e-05, + "loss": 1.0726, + "step": 618 + }, + { + "epoch": 0.055308597851095674, + "grad_norm": 0.495962917804718, + "learning_rate": 9.983238604482771e-05, + "loss": 1.1493, + "step": 619 + }, + { + "epoch": 0.05539794938235754, + "grad_norm": 0.46767839789390564, + "learning_rate": 9.983120006919591e-05, + "loss": 1.1153, + "step": 620 + }, + { + "epoch": 0.055487300913619406, + "grad_norm": 0.49606284499168396, + "learning_rate": 9.983000991966993e-05, + "loss": 1.1023, + "step": 621 + }, + { + "epoch": 0.055576652444881275, + "grad_norm": 0.45764997601509094, + "learning_rate": 9.982881559634947e-05, + "loss": 1.0941, + "step": 622 + }, + { + "epoch": 0.055666003976143144, + "grad_norm": 0.5058903694152832, + "learning_rate": 9.982761709933457e-05, + "loss": 1.0503, + "step": 623 + }, + { + "epoch": 0.055755355507405006, + "grad_norm": 0.4385363757610321, + "learning_rate": 9.982641442872562e-05, + "loss": 1.1275, + "step": 624 + }, + { + "epoch": 0.055844707038666876, + "grad_norm": 0.43508297204971313, + "learning_rate": 9.982520758462335e-05, + "loss": 1.1353, + "step": 625 + }, + { + "epoch": 0.055934058569928745, + "grad_norm": 0.49032166600227356, + "learning_rate": 9.982399656712884e-05, + "loss": 1.0699, + "step": 626 + }, + { + "epoch": 0.05602341010119061, + "grad_norm": 0.5139771699905396, + "learning_rate": 9.982278137634353e-05, + "loss": 1.0745, + "step": 627 + }, + { + "epoch": 0.056112761632452476, + "grad_norm": 0.4978711009025574, + "learning_rate": 9.982156201236921e-05, + "loss": 1.0771, + "step": 628 + }, + { + "epoch": 0.056202113163714346, + "grad_norm": 0.45232483744621277, + "learning_rate": 9.9820338475308e-05, + "loss": 1.1367, + "step": 629 + }, + { + "epoch": 0.05629146469497621, + "grad_norm": 0.5391058921813965, + "learning_rate": 9.981911076526243e-05, + "loss": 0.9913, + "step": 630 + }, + { + "epoch": 0.05638081622623808, + "grad_norm": 0.4652855396270752, + "learning_rate": 9.981787888233527e-05, + "loss": 1.0855, + "step": 631 + }, + { + "epoch": 0.056470167757499946, + "grad_norm": 0.4794527590274811, + "learning_rate": 9.981664282662974e-05, + "loss": 1.2006, + "step": 632 + }, + { + "epoch": 0.05655951928876181, + "grad_norm": 0.4287259578704834, + "learning_rate": 9.981540259824938e-05, + "loss": 1.0524, + "step": 633 + }, + { + "epoch": 0.05664887082002368, + "grad_norm": 0.4706929922103882, + "learning_rate": 9.981415819729804e-05, + "loss": 0.9986, + "step": 634 + }, + { + "epoch": 0.05673822235128555, + "grad_norm": 0.5352094769477844, + "learning_rate": 9.981290962387998e-05, + "loss": 1.0363, + "step": 635 + }, + { + "epoch": 0.05682757388254741, + "grad_norm": 0.4566076695919037, + "learning_rate": 9.981165687809976e-05, + "loss": 1.089, + "step": 636 + }, + { + "epoch": 0.05691692541380928, + "grad_norm": 0.5029745697975159, + "learning_rate": 9.981039996006234e-05, + "loss": 0.9928, + "step": 637 + }, + { + "epoch": 0.05700627694507115, + "grad_norm": 0.49635255336761475, + "learning_rate": 9.980913886987296e-05, + "loss": 1.1236, + "step": 638 + }, + { + "epoch": 0.05709562847633301, + "grad_norm": 0.5090769529342651, + "learning_rate": 9.98078736076373e-05, + "loss": 1.0525, + "step": 639 + }, + { + "epoch": 0.05718498000759488, + "grad_norm": 0.46991288661956787, + "learning_rate": 9.980660417346129e-05, + "loss": 1.1371, + "step": 640 + }, + { + "epoch": 0.05727433153885675, + "grad_norm": 0.5006728172302246, + "learning_rate": 9.980533056745128e-05, + "loss": 1.0505, + "step": 641 + }, + { + "epoch": 0.05736368307011861, + "grad_norm": 0.45537087321281433, + "learning_rate": 9.980405278971396e-05, + "loss": 1.1296, + "step": 642 + }, + { + "epoch": 0.05745303460138048, + "grad_norm": 0.4972824454307556, + "learning_rate": 9.980277084035634e-05, + "loss": 1.1512, + "step": 643 + }, + { + "epoch": 0.05754238613264235, + "grad_norm": 0.6011673212051392, + "learning_rate": 9.980148471948581e-05, + "loss": 1.0242, + "step": 644 + }, + { + "epoch": 0.05763173766390421, + "grad_norm": 0.4560789167881012, + "learning_rate": 9.980019442721008e-05, + "loss": 1.1034, + "step": 645 + }, + { + "epoch": 0.05772108919516608, + "grad_norm": 0.46284574270248413, + "learning_rate": 9.979889996363723e-05, + "loss": 1.1125, + "step": 646 + }, + { + "epoch": 0.05781044072642795, + "grad_norm": 0.5078014731407166, + "learning_rate": 9.97976013288757e-05, + "loss": 1.0878, + "step": 647 + }, + { + "epoch": 0.05789979225768982, + "grad_norm": 0.4990587830543518, + "learning_rate": 9.979629852303426e-05, + "loss": 1.0623, + "step": 648 + }, + { + "epoch": 0.05798914378895168, + "grad_norm": 0.4506543278694153, + "learning_rate": 9.979499154622201e-05, + "loss": 1.1416, + "step": 649 + }, + { + "epoch": 0.05807849532021355, + "grad_norm": 0.4424755573272705, + "learning_rate": 9.979368039854847e-05, + "loss": 1.094, + "step": 650 + }, + { + "epoch": 0.05816784685147542, + "grad_norm": 0.4033477008342743, + "learning_rate": 9.979236508012341e-05, + "loss": 1.13, + "step": 651 + }, + { + "epoch": 0.05825719838273728, + "grad_norm": 0.43958303332328796, + "learning_rate": 9.979104559105703e-05, + "loss": 1.1477, + "step": 652 + }, + { + "epoch": 0.05834654991399915, + "grad_norm": 0.44301077723503113, + "learning_rate": 9.978972193145986e-05, + "loss": 1.1097, + "step": 653 + }, + { + "epoch": 0.05843590144526102, + "grad_norm": 0.44206470251083374, + "learning_rate": 9.978839410144274e-05, + "loss": 1.0702, + "step": 654 + }, + { + "epoch": 0.05852525297652288, + "grad_norm": 0.47731223702430725, + "learning_rate": 9.978706210111692e-05, + "loss": 1.109, + "step": 655 + }, + { + "epoch": 0.05861460450778475, + "grad_norm": 0.47083383798599243, + "learning_rate": 9.978572593059394e-05, + "loss": 1.1559, + "step": 656 + }, + { + "epoch": 0.05870395603904662, + "grad_norm": 0.5734443068504333, + "learning_rate": 9.978438558998575e-05, + "loss": 1.0478, + "step": 657 + }, + { + "epoch": 0.058793307570308484, + "grad_norm": 0.4939190745353699, + "learning_rate": 9.978304107940461e-05, + "loss": 1.0989, + "step": 658 + }, + { + "epoch": 0.05888265910157035, + "grad_norm": 0.42895951867103577, + "learning_rate": 9.978169239896311e-05, + "loss": 1.1302, + "step": 659 + }, + { + "epoch": 0.05897201063283222, + "grad_norm": 0.513512372970581, + "learning_rate": 9.978033954877425e-05, + "loss": 1.1085, + "step": 660 + }, + { + "epoch": 0.059061362164094085, + "grad_norm": 0.4882372319698334, + "learning_rate": 9.977898252895134e-05, + "loss": 1.0566, + "step": 661 + }, + { + "epoch": 0.059150713695355954, + "grad_norm": 0.4673426151275635, + "learning_rate": 9.977762133960802e-05, + "loss": 1.028, + "step": 662 + }, + { + "epoch": 0.05924006522661782, + "grad_norm": 0.523703932762146, + "learning_rate": 9.977625598085834e-05, + "loss": 1.0267, + "step": 663 + }, + { + "epoch": 0.059329416757879685, + "grad_norm": 0.5089814066886902, + "learning_rate": 9.977488645281662e-05, + "loss": 1.1286, + "step": 664 + }, + { + "epoch": 0.059418768289141555, + "grad_norm": 0.48868119716644287, + "learning_rate": 9.977351275559763e-05, + "loss": 1.0537, + "step": 665 + }, + { + "epoch": 0.059508119820403424, + "grad_norm": 0.5465951561927795, + "learning_rate": 9.977213488931638e-05, + "loss": 1.1084, + "step": 666 + }, + { + "epoch": 0.059597471351665286, + "grad_norm": 0.49999743700027466, + "learning_rate": 9.97707528540883e-05, + "loss": 1.094, + "step": 667 + }, + { + "epoch": 0.059686822882927155, + "grad_norm": 0.5092505216598511, + "learning_rate": 9.976936665002916e-05, + "loss": 1.0616, + "step": 668 + }, + { + "epoch": 0.059776174414189025, + "grad_norm": 0.5057411193847656, + "learning_rate": 9.976797627725505e-05, + "loss": 1.0719, + "step": 669 + }, + { + "epoch": 0.05986552594545089, + "grad_norm": 0.5061649084091187, + "learning_rate": 9.976658173588244e-05, + "loss": 1.1306, + "step": 670 + }, + { + "epoch": 0.059954877476712756, + "grad_norm": 0.4602451026439667, + "learning_rate": 9.976518302602813e-05, + "loss": 1.074, + "step": 671 + }, + { + "epoch": 0.060044229007974625, + "grad_norm": 0.47477665543556213, + "learning_rate": 9.97637801478093e-05, + "loss": 1.0751, + "step": 672 + }, + { + "epoch": 0.06013358053923649, + "grad_norm": 0.4398420751094818, + "learning_rate": 9.976237310134342e-05, + "loss": 1.1357, + "step": 673 + }, + { + "epoch": 0.06022293207049836, + "grad_norm": 0.49362048506736755, + "learning_rate": 9.976096188674837e-05, + "loss": 1.1307, + "step": 674 + }, + { + "epoch": 0.060312283601760226, + "grad_norm": 0.4223507344722748, + "learning_rate": 9.975954650414236e-05, + "loss": 1.122, + "step": 675 + }, + { + "epoch": 0.060401635133022095, + "grad_norm": 0.4884486496448517, + "learning_rate": 9.975812695364392e-05, + "loss": 1.1486, + "step": 676 + }, + { + "epoch": 0.06049098666428396, + "grad_norm": 0.4124647080898285, + "learning_rate": 9.975670323537197e-05, + "loss": 1.0908, + "step": 677 + }, + { + "epoch": 0.06058033819554583, + "grad_norm": 0.5047022700309753, + "learning_rate": 9.975527534944574e-05, + "loss": 0.9936, + "step": 678 + }, + { + "epoch": 0.060669689726807696, + "grad_norm": 0.509061872959137, + "learning_rate": 9.975384329598486e-05, + "loss": 1.0482, + "step": 679 + }, + { + "epoch": 0.06075904125806956, + "grad_norm": 0.5170226693153381, + "learning_rate": 9.975240707510926e-05, + "loss": 1.0872, + "step": 680 + }, + { + "epoch": 0.06084839278933143, + "grad_norm": 0.43989554047584534, + "learning_rate": 9.975096668693926e-05, + "loss": 1.1313, + "step": 681 + }, + { + "epoch": 0.0609377443205933, + "grad_norm": 0.43840011954307556, + "learning_rate": 9.974952213159547e-05, + "loss": 1.1161, + "step": 682 + }, + { + "epoch": 0.06102709585185516, + "grad_norm": 0.5931734442710876, + "learning_rate": 9.974807340919893e-05, + "loss": 0.9472, + "step": 683 + }, + { + "epoch": 0.06111644738311703, + "grad_norm": 0.4824213683605194, + "learning_rate": 9.974662051987096e-05, + "loss": 1.0452, + "step": 684 + }, + { + "epoch": 0.0612057989143789, + "grad_norm": 0.5604041218757629, + "learning_rate": 9.974516346373326e-05, + "loss": 1.1357, + "step": 685 + }, + { + "epoch": 0.06129515044564076, + "grad_norm": 0.42983385920524597, + "learning_rate": 9.974370224090788e-05, + "loss": 1.166, + "step": 686 + }, + { + "epoch": 0.06138450197690263, + "grad_norm": 0.4323766827583313, + "learning_rate": 9.97422368515172e-05, + "loss": 1.0949, + "step": 687 + }, + { + "epoch": 0.0614738535081645, + "grad_norm": 0.4329332113265991, + "learning_rate": 9.974076729568396e-05, + "loss": 1.1318, + "step": 688 + }, + { + "epoch": 0.06156320503942636, + "grad_norm": 0.48779281973838806, + "learning_rate": 9.973929357353126e-05, + "loss": 1.1396, + "step": 689 + }, + { + "epoch": 0.06165255657068823, + "grad_norm": 0.485921710729599, + "learning_rate": 9.973781568518256e-05, + "loss": 1.0039, + "step": 690 + }, + { + "epoch": 0.0617419081019501, + "grad_norm": 0.45170992612838745, + "learning_rate": 9.973633363076163e-05, + "loss": 1.0345, + "step": 691 + }, + { + "epoch": 0.06183125963321196, + "grad_norm": 0.5609152317047119, + "learning_rate": 9.973484741039258e-05, + "loss": 1.0191, + "step": 692 + }, + { + "epoch": 0.06192061116447383, + "grad_norm": 0.4308573007583618, + "learning_rate": 9.973335702419995e-05, + "loss": 1.0891, + "step": 693 + }, + { + "epoch": 0.0620099626957357, + "grad_norm": 0.4530807137489319, + "learning_rate": 9.973186247230855e-05, + "loss": 1.1436, + "step": 694 + }, + { + "epoch": 0.06209931422699756, + "grad_norm": 0.4708486795425415, + "learning_rate": 9.973036375484354e-05, + "loss": 1.0609, + "step": 695 + }, + { + "epoch": 0.06218866575825943, + "grad_norm": 0.4028553068637848, + "learning_rate": 9.97288608719305e-05, + "loss": 1.0699, + "step": 696 + }, + { + "epoch": 0.0622780172895213, + "grad_norm": 0.4526921510696411, + "learning_rate": 9.97273538236953e-05, + "loss": 1.1414, + "step": 697 + }, + { + "epoch": 0.06236736882078316, + "grad_norm": 0.46698683500289917, + "learning_rate": 9.972584261026413e-05, + "loss": 1.1178, + "step": 698 + }, + { + "epoch": 0.06245672035204503, + "grad_norm": 0.5231309533119202, + "learning_rate": 9.972432723176361e-05, + "loss": 1.1514, + "step": 699 + }, + { + "epoch": 0.0625460718833069, + "grad_norm": 0.38658031821250916, + "learning_rate": 9.972280768832068e-05, + "loss": 1.1243, + "step": 700 + }, + { + "epoch": 0.06263542341456876, + "grad_norm": 0.4694380760192871, + "learning_rate": 9.972128398006259e-05, + "loss": 1.1258, + "step": 701 + }, + { + "epoch": 0.06272477494583063, + "grad_norm": 0.4181990623474121, + "learning_rate": 9.971975610711697e-05, + "loss": 1.1512, + "step": 702 + }, + { + "epoch": 0.0628141264770925, + "grad_norm": 0.41654473543167114, + "learning_rate": 9.971822406961179e-05, + "loss": 1.1766, + "step": 703 + }, + { + "epoch": 0.06290347800835437, + "grad_norm": 0.44843196868896484, + "learning_rate": 9.971668786767541e-05, + "loss": 1.0496, + "step": 704 + }, + { + "epoch": 0.06299282953961624, + "grad_norm": 0.5500750541687012, + "learning_rate": 9.971514750143647e-05, + "loss": 1.0574, + "step": 705 + }, + { + "epoch": 0.0630821810708781, + "grad_norm": 0.4476911127567291, + "learning_rate": 9.971360297102401e-05, + "loss": 1.0639, + "step": 706 + }, + { + "epoch": 0.06317153260213997, + "grad_norm": 0.46156466007232666, + "learning_rate": 9.971205427656738e-05, + "loss": 1.1028, + "step": 707 + }, + { + "epoch": 0.06326088413340183, + "grad_norm": 0.45186087489128113, + "learning_rate": 9.971050141819632e-05, + "loss": 1.0948, + "step": 708 + }, + { + "epoch": 0.0633502356646637, + "grad_norm": 0.4499582052230835, + "learning_rate": 9.970894439604088e-05, + "loss": 1.0978, + "step": 709 + }, + { + "epoch": 0.06343958719592557, + "grad_norm": 0.4321632385253906, + "learning_rate": 9.970738321023149e-05, + "loss": 1.1102, + "step": 710 + }, + { + "epoch": 0.06352893872718744, + "grad_norm": 0.4565223455429077, + "learning_rate": 9.970581786089891e-05, + "loss": 1.0825, + "step": 711 + }, + { + "epoch": 0.0636182902584493, + "grad_norm": 0.4617339074611664, + "learning_rate": 9.970424834817428e-05, + "loss": 1.0773, + "step": 712 + }, + { + "epoch": 0.06370764178971117, + "grad_norm": 0.4778772294521332, + "learning_rate": 9.970267467218904e-05, + "loss": 1.0918, + "step": 713 + }, + { + "epoch": 0.06379699332097304, + "grad_norm": 0.46952787041664124, + "learning_rate": 9.970109683307498e-05, + "loss": 1.0939, + "step": 714 + }, + { + "epoch": 0.0638863448522349, + "grad_norm": 0.4276919364929199, + "learning_rate": 9.969951483096429e-05, + "loss": 1.0416, + "step": 715 + }, + { + "epoch": 0.06397569638349677, + "grad_norm": 0.45656710863113403, + "learning_rate": 9.969792866598948e-05, + "loss": 1.0634, + "step": 716 + }, + { + "epoch": 0.06406504791475864, + "grad_norm": 0.4644415080547333, + "learning_rate": 9.96963383382834e-05, + "loss": 1.1223, + "step": 717 + }, + { + "epoch": 0.06415439944602051, + "grad_norm": 0.5773541331291199, + "learning_rate": 9.969474384797926e-05, + "loss": 1.0547, + "step": 718 + }, + { + "epoch": 0.06424375097728237, + "grad_norm": 0.5227424502372742, + "learning_rate": 9.969314519521063e-05, + "loss": 1.1057, + "step": 719 + }, + { + "epoch": 0.06433310250854424, + "grad_norm": 0.4427839517593384, + "learning_rate": 9.969154238011138e-05, + "loss": 1.0599, + "step": 720 + }, + { + "epoch": 0.0644224540398061, + "grad_norm": 0.472834050655365, + "learning_rate": 9.968993540281579e-05, + "loss": 1.0368, + "step": 721 + }, + { + "epoch": 0.06451180557106798, + "grad_norm": 0.4237770438194275, + "learning_rate": 9.968832426345845e-05, + "loss": 1.1564, + "step": 722 + }, + { + "epoch": 0.06460115710232985, + "grad_norm": 0.3942003548145294, + "learning_rate": 9.968670896217431e-05, + "loss": 1.1415, + "step": 723 + }, + { + "epoch": 0.06469050863359171, + "grad_norm": 0.43824538588523865, + "learning_rate": 9.968508949909868e-05, + "loss": 1.118, + "step": 724 + }, + { + "epoch": 0.06477986016485357, + "grad_norm": 0.4967290461063385, + "learning_rate": 9.968346587436719e-05, + "loss": 1.0834, + "step": 725 + }, + { + "epoch": 0.06486921169611544, + "grad_norm": 0.5201587080955505, + "learning_rate": 9.968183808811586e-05, + "loss": 1.1773, + "step": 726 + }, + { + "epoch": 0.06495856322737731, + "grad_norm": 0.42590588331222534, + "learning_rate": 9.968020614048101e-05, + "loss": 1.1176, + "step": 727 + }, + { + "epoch": 0.06504791475863918, + "grad_norm": 0.4532450735569, + "learning_rate": 9.967857003159933e-05, + "loss": 1.0961, + "step": 728 + }, + { + "epoch": 0.06513726628990105, + "grad_norm": 0.49376237392425537, + "learning_rate": 9.96769297616079e-05, + "loss": 1.1232, + "step": 729 + }, + { + "epoch": 0.06522661782116292, + "grad_norm": 0.4753684401512146, + "learning_rate": 9.967528533064408e-05, + "loss": 1.0848, + "step": 730 + }, + { + "epoch": 0.06531596935242477, + "grad_norm": 0.478083997964859, + "learning_rate": 9.96736367388456e-05, + "loss": 1.1025, + "step": 731 + }, + { + "epoch": 0.06540532088368664, + "grad_norm": 0.46657711267471313, + "learning_rate": 9.967198398635056e-05, + "loss": 1.1299, + "step": 732 + }, + { + "epoch": 0.06549467241494851, + "grad_norm": 0.4510549306869507, + "learning_rate": 9.96703270732974e-05, + "loss": 1.1, + "step": 733 + }, + { + "epoch": 0.06558402394621038, + "grad_norm": 0.4335935115814209, + "learning_rate": 9.96686659998249e-05, + "loss": 1.0771, + "step": 734 + }, + { + "epoch": 0.06567337547747225, + "grad_norm": 0.39717864990234375, + "learning_rate": 9.96670007660722e-05, + "loss": 1.1836, + "step": 735 + }, + { + "epoch": 0.06576272700873412, + "grad_norm": 0.5292769074440002, + "learning_rate": 9.966533137217878e-05, + "loss": 1.0035, + "step": 736 + }, + { + "epoch": 0.06585207853999597, + "grad_norm": 0.4478752911090851, + "learning_rate": 9.966365781828443e-05, + "loss": 1.0829, + "step": 737 + }, + { + "epoch": 0.06594143007125784, + "grad_norm": 0.4209703207015991, + "learning_rate": 9.966198010452939e-05, + "loss": 1.1075, + "step": 738 + }, + { + "epoch": 0.06603078160251971, + "grad_norm": 0.5448471903800964, + "learning_rate": 9.966029823105416e-05, + "loss": 1.0948, + "step": 739 + }, + { + "epoch": 0.06612013313378158, + "grad_norm": 0.40626657009124756, + "learning_rate": 9.965861219799958e-05, + "loss": 1.1325, + "step": 740 + }, + { + "epoch": 0.06620948466504345, + "grad_norm": 0.41559332609176636, + "learning_rate": 9.965692200550693e-05, + "loss": 1.1513, + "step": 741 + }, + { + "epoch": 0.06629883619630532, + "grad_norm": 0.4277205169200897, + "learning_rate": 9.965522765371777e-05, + "loss": 1.1243, + "step": 742 + }, + { + "epoch": 0.06638818772756717, + "grad_norm": 0.4078103303909302, + "learning_rate": 9.965352914277399e-05, + "loss": 1.1174, + "step": 743 + }, + { + "epoch": 0.06647753925882904, + "grad_norm": 0.4964846968650818, + "learning_rate": 9.965182647281788e-05, + "loss": 1.0881, + "step": 744 + }, + { + "epoch": 0.06656689079009091, + "grad_norm": 0.442452609539032, + "learning_rate": 9.965011964399204e-05, + "loss": 1.0964, + "step": 745 + }, + { + "epoch": 0.06665624232135278, + "grad_norm": 0.4539310038089752, + "learning_rate": 9.964840865643948e-05, + "loss": 1.0969, + "step": 746 + }, + { + "epoch": 0.06674559385261465, + "grad_norm": 0.4463444948196411, + "learning_rate": 9.964669351030345e-05, + "loss": 1.1043, + "step": 747 + }, + { + "epoch": 0.06683494538387652, + "grad_norm": 0.4720766246318817, + "learning_rate": 9.964497420572765e-05, + "loss": 1.0618, + "step": 748 + }, + { + "epoch": 0.06692429691513839, + "grad_norm": 0.42916223406791687, + "learning_rate": 9.964325074285609e-05, + "loss": 1.1004, + "step": 749 + }, + { + "epoch": 0.06701364844640025, + "grad_norm": 0.4502628743648529, + "learning_rate": 9.96415231218331e-05, + "loss": 1.0862, + "step": 750 + }, + { + "epoch": 0.06710299997766211, + "grad_norm": 0.4129611551761627, + "learning_rate": 9.963979134280343e-05, + "loss": 1.1337, + "step": 751 + }, + { + "epoch": 0.06719235150892398, + "grad_norm": 0.4612719714641571, + "learning_rate": 9.963805540591211e-05, + "loss": 1.1208, + "step": 752 + }, + { + "epoch": 0.06728170304018585, + "grad_norm": 0.5239313840866089, + "learning_rate": 9.963631531130455e-05, + "loss": 1.1208, + "step": 753 + }, + { + "epoch": 0.06737105457144772, + "grad_norm": 0.4685198664665222, + "learning_rate": 9.963457105912647e-05, + "loss": 1.0893, + "step": 754 + }, + { + "epoch": 0.06746040610270959, + "grad_norm": 0.4528690278530121, + "learning_rate": 9.963282264952403e-05, + "loss": 1.042, + "step": 755 + }, + { + "epoch": 0.06754975763397145, + "grad_norm": 0.5180427432060242, + "learning_rate": 9.963107008264364e-05, + "loss": 1.0768, + "step": 756 + }, + { + "epoch": 0.06763910916523332, + "grad_norm": 0.4094788432121277, + "learning_rate": 9.96293133586321e-05, + "loss": 1.0914, + "step": 757 + }, + { + "epoch": 0.06772846069649519, + "grad_norm": 0.4870153069496155, + "learning_rate": 9.962755247763654e-05, + "loss": 1.019, + "step": 758 + }, + { + "epoch": 0.06781781222775705, + "grad_norm": 0.4720255434513092, + "learning_rate": 9.962578743980449e-05, + "loss": 1.0146, + "step": 759 + }, + { + "epoch": 0.06790716375901892, + "grad_norm": 0.45057880878448486, + "learning_rate": 9.962401824528376e-05, + "loss": 1.0702, + "step": 760 + }, + { + "epoch": 0.06799651529028079, + "grad_norm": 0.44907015562057495, + "learning_rate": 9.962224489422254e-05, + "loss": 1.0297, + "step": 761 + }, + { + "epoch": 0.06808586682154265, + "grad_norm": 0.4270571172237396, + "learning_rate": 9.962046738676938e-05, + "loss": 1.0743, + "step": 762 + }, + { + "epoch": 0.06817521835280452, + "grad_norm": 0.47250601649284363, + "learning_rate": 9.961868572307315e-05, + "loss": 1.1362, + "step": 763 + }, + { + "epoch": 0.06826456988406639, + "grad_norm": 0.5045757293701172, + "learning_rate": 9.96168999032831e-05, + "loss": 1.04, + "step": 764 + }, + { + "epoch": 0.06835392141532826, + "grad_norm": 0.4600796699523926, + "learning_rate": 9.961510992754882e-05, + "loss": 1.0663, + "step": 765 + }, + { + "epoch": 0.06844327294659013, + "grad_norm": 0.4397517442703247, + "learning_rate": 9.961331579602022e-05, + "loss": 1.1174, + "step": 766 + }, + { + "epoch": 0.068532624477852, + "grad_norm": 0.4315062165260315, + "learning_rate": 9.961151750884758e-05, + "loss": 1.0979, + "step": 767 + }, + { + "epoch": 0.06862197600911385, + "grad_norm": 0.41421815752983093, + "learning_rate": 9.960971506618152e-05, + "loss": 1.0429, + "step": 768 + }, + { + "epoch": 0.06871132754037572, + "grad_norm": 0.49965426325798035, + "learning_rate": 9.960790846817303e-05, + "loss": 1.1674, + "step": 769 + }, + { + "epoch": 0.06880067907163759, + "grad_norm": 0.4487605392932892, + "learning_rate": 9.960609771497341e-05, + "loss": 1.1274, + "step": 770 + }, + { + "epoch": 0.06889003060289946, + "grad_norm": 0.5031136870384216, + "learning_rate": 9.960428280673435e-05, + "loss": 1.0788, + "step": 771 + }, + { + "epoch": 0.06897938213416133, + "grad_norm": 0.4574654996395111, + "learning_rate": 9.960246374360787e-05, + "loss": 1.1115, + "step": 772 + }, + { + "epoch": 0.0690687336654232, + "grad_norm": 0.3996691405773163, + "learning_rate": 9.960064052574632e-05, + "loss": 1.1178, + "step": 773 + }, + { + "epoch": 0.06915808519668505, + "grad_norm": 0.45163261890411377, + "learning_rate": 9.959881315330241e-05, + "loss": 1.1403, + "step": 774 + }, + { + "epoch": 0.06924743672794692, + "grad_norm": 0.48119327425956726, + "learning_rate": 9.959698162642923e-05, + "loss": 1.1177, + "step": 775 + }, + { + "epoch": 0.06933678825920879, + "grad_norm": 0.4419304132461548, + "learning_rate": 9.959514594528018e-05, + "loss": 1.0816, + "step": 776 + }, + { + "epoch": 0.06942613979047066, + "grad_norm": 0.48990410566329956, + "learning_rate": 9.959330611000898e-05, + "loss": 1.1581, + "step": 777 + }, + { + "epoch": 0.06951549132173253, + "grad_norm": 0.4900878071784973, + "learning_rate": 9.95914621207698e-05, + "loss": 1.0604, + "step": 778 + }, + { + "epoch": 0.0696048428529944, + "grad_norm": 0.473408967256546, + "learning_rate": 9.958961397771704e-05, + "loss": 1.0345, + "step": 779 + }, + { + "epoch": 0.06969419438425627, + "grad_norm": 0.4891606867313385, + "learning_rate": 9.958776168100555e-05, + "loss": 1.0432, + "step": 780 + }, + { + "epoch": 0.06978354591551812, + "grad_norm": 0.5358079075813293, + "learning_rate": 9.958590523079041e-05, + "loss": 1.1014, + "step": 781 + }, + { + "epoch": 0.06987289744677999, + "grad_norm": 0.455711305141449, + "learning_rate": 9.95840446272272e-05, + "loss": 1.0498, + "step": 782 + }, + { + "epoch": 0.06996224897804186, + "grad_norm": 0.42493581771850586, + "learning_rate": 9.958217987047169e-05, + "loss": 1.0354, + "step": 783 + }, + { + "epoch": 0.07005160050930373, + "grad_norm": 0.46666839718818665, + "learning_rate": 9.958031096068012e-05, + "loss": 1.0204, + "step": 784 + }, + { + "epoch": 0.0701409520405656, + "grad_norm": 0.40961867570877075, + "learning_rate": 9.957843789800902e-05, + "loss": 1.1588, + "step": 785 + }, + { + "epoch": 0.07023030357182747, + "grad_norm": 0.4467061758041382, + "learning_rate": 9.957656068261527e-05, + "loss": 1.0933, + "step": 786 + }, + { + "epoch": 0.07031965510308932, + "grad_norm": 0.4334403872489929, + "learning_rate": 9.957467931465613e-05, + "loss": 1.1161, + "step": 787 + }, + { + "epoch": 0.07040900663435119, + "grad_norm": 0.5627568364143372, + "learning_rate": 9.957279379428917e-05, + "loss": 1.0551, + "step": 788 + }, + { + "epoch": 0.07049835816561306, + "grad_norm": 0.422397643327713, + "learning_rate": 9.95709041216723e-05, + "loss": 1.1089, + "step": 789 + }, + { + "epoch": 0.07058770969687493, + "grad_norm": 0.5722343325614929, + "learning_rate": 9.956901029696384e-05, + "loss": 1.0943, + "step": 790 + }, + { + "epoch": 0.0706770612281368, + "grad_norm": 0.4556920826435089, + "learning_rate": 9.95671123203224e-05, + "loss": 1.0813, + "step": 791 + }, + { + "epoch": 0.07076641275939867, + "grad_norm": 0.43123292922973633, + "learning_rate": 9.956521019190694e-05, + "loss": 1.0318, + "step": 792 + }, + { + "epoch": 0.07085576429066053, + "grad_norm": 0.5033695101737976, + "learning_rate": 9.956330391187682e-05, + "loss": 1.0647, + "step": 793 + }, + { + "epoch": 0.0709451158219224, + "grad_norm": 0.45806631445884705, + "learning_rate": 9.956139348039168e-05, + "loss": 1.1287, + "step": 794 + }, + { + "epoch": 0.07103446735318426, + "grad_norm": 0.43176013231277466, + "learning_rate": 9.955947889761155e-05, + "loss": 1.061, + "step": 795 + }, + { + "epoch": 0.07112381888444613, + "grad_norm": 0.4724084138870239, + "learning_rate": 9.95575601636968e-05, + "loss": 1.0868, + "step": 796 + }, + { + "epoch": 0.071213170415708, + "grad_norm": 0.4638439416885376, + "learning_rate": 9.955563727880814e-05, + "loss": 1.0874, + "step": 797 + }, + { + "epoch": 0.07130252194696987, + "grad_norm": 0.4180169403553009, + "learning_rate": 9.955371024310662e-05, + "loss": 1.1168, + "step": 798 + }, + { + "epoch": 0.07139187347823173, + "grad_norm": 0.455720454454422, + "learning_rate": 9.955177905675367e-05, + "loss": 1.1141, + "step": 799 + }, + { + "epoch": 0.0714812250094936, + "grad_norm": 0.4505751132965088, + "learning_rate": 9.954984371991105e-05, + "loss": 1.0927, + "step": 800 + }, + { + "epoch": 0.07157057654075547, + "grad_norm": 0.4087337553501129, + "learning_rate": 9.954790423274085e-05, + "loss": 1.104, + "step": 801 + }, + { + "epoch": 0.07165992807201733, + "grad_norm": 0.46492061018943787, + "learning_rate": 9.954596059540553e-05, + "loss": 1.0794, + "step": 802 + }, + { + "epoch": 0.0717492796032792, + "grad_norm": 0.4779999554157257, + "learning_rate": 9.954401280806789e-05, + "loss": 1.0552, + "step": 803 + }, + { + "epoch": 0.07183863113454107, + "grad_norm": 0.480494886636734, + "learning_rate": 9.954206087089107e-05, + "loss": 1.0328, + "step": 804 + }, + { + "epoch": 0.07192798266580294, + "grad_norm": 0.40479815006256104, + "learning_rate": 9.954010478403857e-05, + "loss": 1.0973, + "step": 805 + }, + { + "epoch": 0.0720173341970648, + "grad_norm": 0.49492841958999634, + "learning_rate": 9.953814454767423e-05, + "loss": 1.0628, + "step": 806 + }, + { + "epoch": 0.07210668572832667, + "grad_norm": 0.49477580189704895, + "learning_rate": 9.953618016196224e-05, + "loss": 1.083, + "step": 807 + }, + { + "epoch": 0.07219603725958854, + "grad_norm": 0.4487774670124054, + "learning_rate": 9.953421162706717e-05, + "loss": 1.0459, + "step": 808 + }, + { + "epoch": 0.0722853887908504, + "grad_norm": 0.4162748157978058, + "learning_rate": 9.953223894315386e-05, + "loss": 1.1345, + "step": 809 + }, + { + "epoch": 0.07237474032211227, + "grad_norm": 0.4790276288986206, + "learning_rate": 9.953026211038757e-05, + "loss": 1.1452, + "step": 810 + }, + { + "epoch": 0.07246409185337414, + "grad_norm": 0.4182452857494354, + "learning_rate": 9.952828112893388e-05, + "loss": 1.06, + "step": 811 + }, + { + "epoch": 0.072553443384636, + "grad_norm": 0.4895060360431671, + "learning_rate": 9.95262959989587e-05, + "loss": 1.0784, + "step": 812 + }, + { + "epoch": 0.07264279491589787, + "grad_norm": 0.44416457414627075, + "learning_rate": 9.952430672062831e-05, + "loss": 1.1275, + "step": 813 + }, + { + "epoch": 0.07273214644715974, + "grad_norm": 0.3927420377731323, + "learning_rate": 9.952231329410936e-05, + "loss": 1.1797, + "step": 814 + }, + { + "epoch": 0.0728214979784216, + "grad_norm": 0.43822696805000305, + "learning_rate": 9.952031571956878e-05, + "loss": 1.074, + "step": 815 + }, + { + "epoch": 0.07291084950968348, + "grad_norm": 0.44089022278785706, + "learning_rate": 9.951831399717394e-05, + "loss": 1.0785, + "step": 816 + }, + { + "epoch": 0.07300020104094535, + "grad_norm": 0.4124377369880676, + "learning_rate": 9.951630812709245e-05, + "loss": 1.065, + "step": 817 + }, + { + "epoch": 0.0730895525722072, + "grad_norm": 0.43494531512260437, + "learning_rate": 9.951429810949237e-05, + "loss": 1.0736, + "step": 818 + }, + { + "epoch": 0.07317890410346907, + "grad_norm": 0.4180351793766022, + "learning_rate": 9.951228394454201e-05, + "loss": 1.0566, + "step": 819 + }, + { + "epoch": 0.07326825563473094, + "grad_norm": 0.4254007637500763, + "learning_rate": 9.951026563241014e-05, + "loss": 1.079, + "step": 820 + }, + { + "epoch": 0.07335760716599281, + "grad_norm": 0.42448723316192627, + "learning_rate": 9.950824317326577e-05, + "loss": 1.1101, + "step": 821 + }, + { + "epoch": 0.07344695869725468, + "grad_norm": 0.4987415075302124, + "learning_rate": 9.95062165672783e-05, + "loss": 1.0266, + "step": 822 + }, + { + "epoch": 0.07353631022851655, + "grad_norm": 0.4084853529930115, + "learning_rate": 9.95041858146175e-05, + "loss": 1.0966, + "step": 823 + }, + { + "epoch": 0.0736256617597784, + "grad_norm": 0.4334554374217987, + "learning_rate": 9.950215091545347e-05, + "loss": 1.11, + "step": 824 + }, + { + "epoch": 0.07371501329104027, + "grad_norm": 0.5075578093528748, + "learning_rate": 9.950011186995665e-05, + "loss": 1.014, + "step": 825 + }, + { + "epoch": 0.07380436482230214, + "grad_norm": 0.4870114028453827, + "learning_rate": 9.94980686782978e-05, + "loss": 1.0139, + "step": 826 + }, + { + "epoch": 0.07389371635356401, + "grad_norm": 0.44856953620910645, + "learning_rate": 9.949602134064812e-05, + "loss": 1.089, + "step": 827 + }, + { + "epoch": 0.07398306788482588, + "grad_norm": 0.48632118105888367, + "learning_rate": 9.949396985717904e-05, + "loss": 1.1153, + "step": 828 + }, + { + "epoch": 0.07407241941608775, + "grad_norm": 0.44190341234207153, + "learning_rate": 9.949191422806244e-05, + "loss": 1.2339, + "step": 829 + }, + { + "epoch": 0.0741617709473496, + "grad_norm": 0.4298502504825592, + "learning_rate": 9.948985445347046e-05, + "loss": 1.1638, + "step": 830 + }, + { + "epoch": 0.07425112247861147, + "grad_norm": 0.4637526869773865, + "learning_rate": 9.948779053357564e-05, + "loss": 1.0537, + "step": 831 + }, + { + "epoch": 0.07434047400987334, + "grad_norm": 0.4532431364059448, + "learning_rate": 9.948572246855086e-05, + "loss": 1.0724, + "step": 832 + }, + { + "epoch": 0.07442982554113521, + "grad_norm": 0.52618807554245, + "learning_rate": 9.948365025856936e-05, + "loss": 1.0945, + "step": 833 + }, + { + "epoch": 0.07451917707239708, + "grad_norm": 0.4329455494880676, + "learning_rate": 9.948157390380468e-05, + "loss": 1.0166, + "step": 834 + }, + { + "epoch": 0.07460852860365895, + "grad_norm": 0.4594184160232544, + "learning_rate": 9.947949340443076e-05, + "loss": 1.1553, + "step": 835 + }, + { + "epoch": 0.07469788013492082, + "grad_norm": 0.4881901443004608, + "learning_rate": 9.947740876062185e-05, + "loss": 1.0782, + "step": 836 + }, + { + "epoch": 0.07478723166618267, + "grad_norm": 0.5016196370124817, + "learning_rate": 9.947531997255256e-05, + "loss": 1.0892, + "step": 837 + }, + { + "epoch": 0.07487658319744454, + "grad_norm": 0.4345821738243103, + "learning_rate": 9.947322704039785e-05, + "loss": 1.1057, + "step": 838 + }, + { + "epoch": 0.07496593472870641, + "grad_norm": 0.4423231780529022, + "learning_rate": 9.947112996433305e-05, + "loss": 1.0884, + "step": 839 + }, + { + "epoch": 0.07505528625996828, + "grad_norm": 0.5169707536697388, + "learning_rate": 9.946902874453376e-05, + "loss": 1.015, + "step": 840 + }, + { + "epoch": 0.07514463779123015, + "grad_norm": 0.44377315044403076, + "learning_rate": 9.946692338117603e-05, + "loss": 1.0713, + "step": 841 + }, + { + "epoch": 0.07523398932249202, + "grad_norm": 0.4438510239124298, + "learning_rate": 9.94648138744362e-05, + "loss": 1.0522, + "step": 842 + }, + { + "epoch": 0.07532334085375388, + "grad_norm": 0.5507922768592834, + "learning_rate": 9.946270022449093e-05, + "loss": 1.0294, + "step": 843 + }, + { + "epoch": 0.07541269238501574, + "grad_norm": 0.48594483733177185, + "learning_rate": 9.946058243151728e-05, + "loss": 1.0541, + "step": 844 + }, + { + "epoch": 0.07550204391627761, + "grad_norm": 0.4724047780036926, + "learning_rate": 9.945846049569265e-05, + "loss": 1.0744, + "step": 845 + }, + { + "epoch": 0.07559139544753948, + "grad_norm": 0.5080140233039856, + "learning_rate": 9.945633441719476e-05, + "loss": 1.0812, + "step": 846 + }, + { + "epoch": 0.07568074697880135, + "grad_norm": 0.44215860962867737, + "learning_rate": 9.945420419620171e-05, + "loss": 1.1124, + "step": 847 + }, + { + "epoch": 0.07577009851006322, + "grad_norm": 0.43543803691864014, + "learning_rate": 9.94520698328919e-05, + "loss": 1.108, + "step": 848 + }, + { + "epoch": 0.07585945004132508, + "grad_norm": 0.4514320194721222, + "learning_rate": 9.944993132744411e-05, + "loss": 1.0689, + "step": 849 + }, + { + "epoch": 0.07594880157258695, + "grad_norm": 0.4318578243255615, + "learning_rate": 9.94477886800375e-05, + "loss": 1.0977, + "step": 850 + }, + { + "epoch": 0.07603815310384882, + "grad_norm": 0.4266031086444855, + "learning_rate": 9.944564189085149e-05, + "loss": 1.0782, + "step": 851 + }, + { + "epoch": 0.07612750463511068, + "grad_norm": 0.3994167149066925, + "learning_rate": 9.944349096006593e-05, + "loss": 1.1211, + "step": 852 + }, + { + "epoch": 0.07621685616637255, + "grad_norm": 0.4137265980243683, + "learning_rate": 9.944133588786097e-05, + "loss": 1.0609, + "step": 853 + }, + { + "epoch": 0.07630620769763442, + "grad_norm": 0.4087986648082733, + "learning_rate": 9.943917667441712e-05, + "loss": 1.0968, + "step": 854 + }, + { + "epoch": 0.07639555922889628, + "grad_norm": 0.4582999646663666, + "learning_rate": 9.943701331991524e-05, + "loss": 1.0594, + "step": 855 + }, + { + "epoch": 0.07648491076015815, + "grad_norm": 0.44180095195770264, + "learning_rate": 9.943484582453653e-05, + "loss": 1.0959, + "step": 856 + }, + { + "epoch": 0.07657426229142002, + "grad_norm": 0.5081307291984558, + "learning_rate": 9.943267418846256e-05, + "loss": 1.0916, + "step": 857 + }, + { + "epoch": 0.07666361382268189, + "grad_norm": 0.4701635241508484, + "learning_rate": 9.94304984118752e-05, + "loss": 1.0602, + "step": 858 + }, + { + "epoch": 0.07675296535394376, + "grad_norm": 0.42284098267555237, + "learning_rate": 9.942831849495671e-05, + "loss": 1.1025, + "step": 859 + }, + { + "epoch": 0.07684231688520562, + "grad_norm": 0.4433005154132843, + "learning_rate": 9.942613443788967e-05, + "loss": 1.1403, + "step": 860 + }, + { + "epoch": 0.0769316684164675, + "grad_norm": 0.41836610436439514, + "learning_rate": 9.942394624085703e-05, + "loss": 1.0193, + "step": 861 + }, + { + "epoch": 0.07702101994772935, + "grad_norm": 0.5095844268798828, + "learning_rate": 9.942175390404208e-05, + "loss": 1.0305, + "step": 862 + }, + { + "epoch": 0.07711037147899122, + "grad_norm": 0.4428594410419464, + "learning_rate": 9.941955742762843e-05, + "loss": 1.0768, + "step": 863 + }, + { + "epoch": 0.07719972301025309, + "grad_norm": 0.5194300413131714, + "learning_rate": 9.941735681180009e-05, + "loss": 1.0837, + "step": 864 + }, + { + "epoch": 0.07728907454151496, + "grad_norm": 0.5022509098052979, + "learning_rate": 9.941515205674134e-05, + "loss": 1.0751, + "step": 865 + }, + { + "epoch": 0.07737842607277683, + "grad_norm": 0.45329055190086365, + "learning_rate": 9.94129431626369e-05, + "loss": 1.1309, + "step": 866 + }, + { + "epoch": 0.0774677776040387, + "grad_norm": 0.44875457882881165, + "learning_rate": 9.941073012967174e-05, + "loss": 0.9794, + "step": 867 + }, + { + "epoch": 0.07755712913530055, + "grad_norm": 0.5031179189682007, + "learning_rate": 9.940851295803128e-05, + "loss": 1.086, + "step": 868 + }, + { + "epoch": 0.07764648066656242, + "grad_norm": 0.42640596628189087, + "learning_rate": 9.94062916479012e-05, + "loss": 1.068, + "step": 869 + }, + { + "epoch": 0.07773583219782429, + "grad_norm": 0.39963454008102417, + "learning_rate": 9.940406619946754e-05, + "loss": 1.1031, + "step": 870 + }, + { + "epoch": 0.07782518372908616, + "grad_norm": 0.42264753580093384, + "learning_rate": 9.940183661291674e-05, + "loss": 1.0831, + "step": 871 + }, + { + "epoch": 0.07791453526034803, + "grad_norm": 0.45042434334754944, + "learning_rate": 9.939960288843553e-05, + "loss": 1.0708, + "step": 872 + }, + { + "epoch": 0.0780038867916099, + "grad_norm": 0.41273242235183716, + "learning_rate": 9.939736502621104e-05, + "loss": 1.122, + "step": 873 + }, + { + "epoch": 0.07809323832287175, + "grad_norm": 0.43225687742233276, + "learning_rate": 9.939512302643066e-05, + "loss": 1.1367, + "step": 874 + }, + { + "epoch": 0.07818258985413362, + "grad_norm": 0.4413902759552002, + "learning_rate": 9.939287688928223e-05, + "loss": 1.0814, + "step": 875 + }, + { + "epoch": 0.07827194138539549, + "grad_norm": 0.4078238308429718, + "learning_rate": 9.939062661495386e-05, + "loss": 1.0772, + "step": 876 + }, + { + "epoch": 0.07836129291665736, + "grad_norm": 0.3733663260936737, + "learning_rate": 9.938837220363406e-05, + "loss": 1.1371, + "step": 877 + }, + { + "epoch": 0.07845064444791923, + "grad_norm": 0.39089709520339966, + "learning_rate": 9.938611365551164e-05, + "loss": 1.0742, + "step": 878 + }, + { + "epoch": 0.0785399959791811, + "grad_norm": 0.41924816370010376, + "learning_rate": 9.938385097077579e-05, + "loss": 1.1673, + "step": 879 + }, + { + "epoch": 0.07862934751044295, + "grad_norm": 0.5209245681762695, + "learning_rate": 9.9381584149616e-05, + "loss": 1.0325, + "step": 880 + }, + { + "epoch": 0.07871869904170482, + "grad_norm": 0.38127946853637695, + "learning_rate": 9.937931319222218e-05, + "loss": 1.1191, + "step": 881 + }, + { + "epoch": 0.07880805057296669, + "grad_norm": 0.4516022503376007, + "learning_rate": 9.937703809878455e-05, + "loss": 1.0864, + "step": 882 + }, + { + "epoch": 0.07889740210422856, + "grad_norm": 0.5131936073303223, + "learning_rate": 9.937475886949364e-05, + "loss": 1.0033, + "step": 883 + }, + { + "epoch": 0.07898675363549043, + "grad_norm": 0.3989183008670807, + "learning_rate": 9.937247550454039e-05, + "loss": 1.1386, + "step": 884 + }, + { + "epoch": 0.0790761051667523, + "grad_norm": 0.4165286123752594, + "learning_rate": 9.937018800411604e-05, + "loss": 1.131, + "step": 885 + }, + { + "epoch": 0.07916545669801416, + "grad_norm": 0.49382612109184265, + "learning_rate": 9.936789636841219e-05, + "loss": 1.0344, + "step": 886 + }, + { + "epoch": 0.07925480822927602, + "grad_norm": 0.4288516938686371, + "learning_rate": 9.93656005976208e-05, + "loss": 1.0715, + "step": 887 + }, + { + "epoch": 0.0793441597605379, + "grad_norm": 0.5689799785614014, + "learning_rate": 9.936330069193415e-05, + "loss": 1.0104, + "step": 888 + }, + { + "epoch": 0.07943351129179976, + "grad_norm": 0.40826788544654846, + "learning_rate": 9.936099665154491e-05, + "loss": 1.1411, + "step": 889 + }, + { + "epoch": 0.07952286282306163, + "grad_norm": 0.4648611843585968, + "learning_rate": 9.935868847664605e-05, + "loss": 1.1341, + "step": 890 + }, + { + "epoch": 0.0796122143543235, + "grad_norm": 0.45681998133659363, + "learning_rate": 9.935637616743089e-05, + "loss": 1.1331, + "step": 891 + }, + { + "epoch": 0.07970156588558537, + "grad_norm": 0.4700930416584015, + "learning_rate": 9.935405972409313e-05, + "loss": 1.0357, + "step": 892 + }, + { + "epoch": 0.07979091741684723, + "grad_norm": 0.42509207129478455, + "learning_rate": 9.93517391468268e-05, + "loss": 1.1024, + "step": 893 + }, + { + "epoch": 0.0798802689481091, + "grad_norm": 0.46535390615463257, + "learning_rate": 9.934941443582626e-05, + "loss": 1.1248, + "step": 894 + }, + { + "epoch": 0.07996962047937096, + "grad_norm": 0.5609625577926636, + "learning_rate": 9.934708559128622e-05, + "loss": 0.947, + "step": 895 + }, + { + "epoch": 0.08005897201063283, + "grad_norm": 0.5007188320159912, + "learning_rate": 9.934475261340177e-05, + "loss": 1.1, + "step": 896 + }, + { + "epoch": 0.0801483235418947, + "grad_norm": 0.5065818428993225, + "learning_rate": 9.934241550236831e-05, + "loss": 1.0299, + "step": 897 + }, + { + "epoch": 0.08023767507315657, + "grad_norm": 0.44581353664398193, + "learning_rate": 9.934007425838161e-05, + "loss": 1.1251, + "step": 898 + }, + { + "epoch": 0.08032702660441843, + "grad_norm": 0.4115321636199951, + "learning_rate": 9.933772888163776e-05, + "loss": 1.1807, + "step": 899 + }, + { + "epoch": 0.0804163781356803, + "grad_norm": 0.47303566336631775, + "learning_rate": 9.933537937233321e-05, + "loss": 1.1472, + "step": 900 + }, + { + "epoch": 0.08050572966694217, + "grad_norm": 0.4314580261707306, + "learning_rate": 9.933302573066477e-05, + "loss": 1.089, + "step": 901 + }, + { + "epoch": 0.08059508119820404, + "grad_norm": 0.47710415720939636, + "learning_rate": 9.933066795682955e-05, + "loss": 1.0226, + "step": 902 + }, + { + "epoch": 0.0806844327294659, + "grad_norm": 0.4161222577095032, + "learning_rate": 9.932830605102508e-05, + "loss": 1.0913, + "step": 903 + }, + { + "epoch": 0.08077378426072777, + "grad_norm": 0.485124409198761, + "learning_rate": 9.932594001344918e-05, + "loss": 1.0963, + "step": 904 + }, + { + "epoch": 0.08086313579198963, + "grad_norm": 0.47519686818122864, + "learning_rate": 9.93235698443e-05, + "loss": 1.0747, + "step": 905 + }, + { + "epoch": 0.0809524873232515, + "grad_norm": 0.4680500328540802, + "learning_rate": 9.932119554377611e-05, + "loss": 1.1009, + "step": 906 + }, + { + "epoch": 0.08104183885451337, + "grad_norm": 0.49420708417892456, + "learning_rate": 9.931881711207638e-05, + "loss": 1.0773, + "step": 907 + }, + { + "epoch": 0.08113119038577524, + "grad_norm": 0.3618301749229431, + "learning_rate": 9.93164345494e-05, + "loss": 1.0759, + "step": 908 + }, + { + "epoch": 0.0812205419170371, + "grad_norm": 0.4082967936992645, + "learning_rate": 9.931404785594656e-05, + "loss": 1.133, + "step": 909 + }, + { + "epoch": 0.08130989344829898, + "grad_norm": 0.5318341851234436, + "learning_rate": 9.931165703191595e-05, + "loss": 1.0725, + "step": 910 + }, + { + "epoch": 0.08139924497956083, + "grad_norm": 0.46658989787101746, + "learning_rate": 9.930926207750845e-05, + "loss": 1.0376, + "step": 911 + }, + { + "epoch": 0.0814885965108227, + "grad_norm": 0.5218292474746704, + "learning_rate": 9.930686299292464e-05, + "loss": 1.0949, + "step": 912 + }, + { + "epoch": 0.08157794804208457, + "grad_norm": 0.48732948303222656, + "learning_rate": 9.930445977836548e-05, + "loss": 1.0491, + "step": 913 + }, + { + "epoch": 0.08166729957334644, + "grad_norm": 0.40489867329597473, + "learning_rate": 9.930205243403229e-05, + "loss": 1.0856, + "step": 914 + }, + { + "epoch": 0.08175665110460831, + "grad_norm": 0.5249372720718384, + "learning_rate": 9.929964096012668e-05, + "loss": 1.0358, + "step": 915 + }, + { + "epoch": 0.08184600263587018, + "grad_norm": 0.4804689288139343, + "learning_rate": 9.929722535685062e-05, + "loss": 1.0793, + "step": 916 + }, + { + "epoch": 0.08193535416713203, + "grad_norm": 0.43911850452423096, + "learning_rate": 9.929480562440649e-05, + "loss": 1.1816, + "step": 917 + }, + { + "epoch": 0.0820247056983939, + "grad_norm": 0.4654462933540344, + "learning_rate": 9.929238176299693e-05, + "loss": 1.1595, + "step": 918 + }, + { + "epoch": 0.08211405722965577, + "grad_norm": 0.44041964411735535, + "learning_rate": 9.928995377282498e-05, + "loss": 1.0732, + "step": 919 + }, + { + "epoch": 0.08220340876091764, + "grad_norm": 0.4381254315376282, + "learning_rate": 9.928752165409401e-05, + "loss": 1.0728, + "step": 920 + }, + { + "epoch": 0.08229276029217951, + "grad_norm": 0.5617743730545044, + "learning_rate": 9.928508540700774e-05, + "loss": 1.0313, + "step": 921 + }, + { + "epoch": 0.08238211182344138, + "grad_norm": 0.4664022922515869, + "learning_rate": 9.928264503177023e-05, + "loss": 1.0456, + "step": 922 + }, + { + "epoch": 0.08247146335470325, + "grad_norm": 0.4458063542842865, + "learning_rate": 9.928020052858587e-05, + "loss": 1.0782, + "step": 923 + }, + { + "epoch": 0.0825608148859651, + "grad_norm": 0.44314441084861755, + "learning_rate": 9.927775189765943e-05, + "loss": 1.0905, + "step": 924 + }, + { + "epoch": 0.08265016641722697, + "grad_norm": 0.4562947154045105, + "learning_rate": 9.927529913919601e-05, + "loss": 1.0686, + "step": 925 + }, + { + "epoch": 0.08273951794848884, + "grad_norm": 0.4639774560928345, + "learning_rate": 9.927284225340105e-05, + "loss": 1.0761, + "step": 926 + }, + { + "epoch": 0.08282886947975071, + "grad_norm": 0.4482592046260834, + "learning_rate": 9.927038124048034e-05, + "loss": 1.0584, + "step": 927 + }, + { + "epoch": 0.08291822101101258, + "grad_norm": 0.4356086254119873, + "learning_rate": 9.926791610064002e-05, + "loss": 1.0996, + "step": 928 + }, + { + "epoch": 0.08300757254227445, + "grad_norm": 0.3952994644641876, + "learning_rate": 9.926544683408656e-05, + "loss": 1.1176, + "step": 929 + }, + { + "epoch": 0.0830969240735363, + "grad_norm": 0.4380575716495514, + "learning_rate": 9.92629734410268e-05, + "loss": 1.0432, + "step": 930 + }, + { + "epoch": 0.08318627560479817, + "grad_norm": 0.54874587059021, + "learning_rate": 9.92604959216679e-05, + "loss": 0.9305, + "step": 931 + }, + { + "epoch": 0.08327562713606004, + "grad_norm": 0.5379965901374817, + "learning_rate": 9.925801427621739e-05, + "loss": 1.1302, + "step": 932 + }, + { + "epoch": 0.08336497866732191, + "grad_norm": 0.4604237377643585, + "learning_rate": 9.925552850488314e-05, + "loss": 1.0884, + "step": 933 + }, + { + "epoch": 0.08345433019858378, + "grad_norm": 0.4671541750431061, + "learning_rate": 9.925303860787335e-05, + "loss": 0.9864, + "step": 934 + }, + { + "epoch": 0.08354368172984565, + "grad_norm": 0.48653197288513184, + "learning_rate": 9.925054458539658e-05, + "loss": 1.1002, + "step": 935 + }, + { + "epoch": 0.0836330332611075, + "grad_norm": 0.4400951564311981, + "learning_rate": 9.924804643766172e-05, + "loss": 1.1034, + "step": 936 + }, + { + "epoch": 0.08372238479236938, + "grad_norm": 0.4999461770057678, + "learning_rate": 9.924554416487802e-05, + "loss": 1.1624, + "step": 937 + }, + { + "epoch": 0.08381173632363124, + "grad_norm": 0.45174461603164673, + "learning_rate": 9.92430377672551e-05, + "loss": 1.0431, + "step": 938 + }, + { + "epoch": 0.08390108785489311, + "grad_norm": 0.4591819643974304, + "learning_rate": 9.924052724500284e-05, + "loss": 1.0554, + "step": 939 + }, + { + "epoch": 0.08399043938615498, + "grad_norm": 0.4611966609954834, + "learning_rate": 9.923801259833159e-05, + "loss": 1.0504, + "step": 940 + }, + { + "epoch": 0.08407979091741685, + "grad_norm": 0.3897397518157959, + "learning_rate": 9.923549382745192e-05, + "loss": 1.0838, + "step": 941 + }, + { + "epoch": 0.08416914244867871, + "grad_norm": 0.3815537095069885, + "learning_rate": 9.923297093257485e-05, + "loss": 1.095, + "step": 942 + }, + { + "epoch": 0.08425849397994058, + "grad_norm": 0.4189860224723816, + "learning_rate": 9.923044391391165e-05, + "loss": 1.0924, + "step": 943 + }, + { + "epoch": 0.08434784551120245, + "grad_norm": 0.45325958728790283, + "learning_rate": 9.922791277167404e-05, + "loss": 1.0615, + "step": 944 + }, + { + "epoch": 0.08443719704246432, + "grad_norm": 0.4996306300163269, + "learning_rate": 9.9225377506074e-05, + "loss": 1.0995, + "step": 945 + }, + { + "epoch": 0.08452654857372618, + "grad_norm": 0.4180799424648285, + "learning_rate": 9.922283811732388e-05, + "loss": 1.1514, + "step": 946 + }, + { + "epoch": 0.08461590010498805, + "grad_norm": 0.433929443359375, + "learning_rate": 9.92202946056364e-05, + "loss": 1.056, + "step": 947 + }, + { + "epoch": 0.08470525163624992, + "grad_norm": 0.479571133852005, + "learning_rate": 9.921774697122459e-05, + "loss": 1.0351, + "step": 948 + }, + { + "epoch": 0.08479460316751178, + "grad_norm": 0.46040210127830505, + "learning_rate": 9.921519521430185e-05, + "loss": 0.9939, + "step": 949 + }, + { + "epoch": 0.08488395469877365, + "grad_norm": 0.43157997727394104, + "learning_rate": 9.92126393350819e-05, + "loss": 1.0588, + "step": 950 + }, + { + "epoch": 0.08497330623003552, + "grad_norm": 0.4109559655189514, + "learning_rate": 9.921007933377887e-05, + "loss": 1.1222, + "step": 951 + }, + { + "epoch": 0.08506265776129739, + "grad_norm": 0.4423241913318634, + "learning_rate": 9.920751521060712e-05, + "loss": 1.0276, + "step": 952 + }, + { + "epoch": 0.08515200929255926, + "grad_norm": 0.454683393239975, + "learning_rate": 9.920494696578146e-05, + "loss": 1.0437, + "step": 953 + }, + { + "epoch": 0.08524136082382112, + "grad_norm": 0.47767403721809387, + "learning_rate": 9.920237459951702e-05, + "loss": 1.0884, + "step": 954 + }, + { + "epoch": 0.08533071235508298, + "grad_norm": 0.47979000210762024, + "learning_rate": 9.919979811202923e-05, + "loss": 1.0598, + "step": 955 + }, + { + "epoch": 0.08542006388634485, + "grad_norm": 0.5115445852279663, + "learning_rate": 9.919721750353395e-05, + "loss": 1.024, + "step": 956 + }, + { + "epoch": 0.08550941541760672, + "grad_norm": 0.4317951798439026, + "learning_rate": 9.919463277424727e-05, + "loss": 1.1506, + "step": 957 + }, + { + "epoch": 0.08559876694886859, + "grad_norm": 0.4962863624095917, + "learning_rate": 9.919204392438573e-05, + "loss": 1.0262, + "step": 958 + }, + { + "epoch": 0.08568811848013046, + "grad_norm": 0.44821789860725403, + "learning_rate": 9.918945095416616e-05, + "loss": 1.0722, + "step": 959 + }, + { + "epoch": 0.08577747001139233, + "grad_norm": 0.44719305634498596, + "learning_rate": 9.918685386380573e-05, + "loss": 1.0362, + "step": 960 + }, + { + "epoch": 0.08586682154265418, + "grad_norm": 0.4721403121948242, + "learning_rate": 9.918425265352202e-05, + "loss": 0.972, + "step": 961 + }, + { + "epoch": 0.08595617307391605, + "grad_norm": 0.43076246976852417, + "learning_rate": 9.918164732353288e-05, + "loss": 1.0746, + "step": 962 + }, + { + "epoch": 0.08604552460517792, + "grad_norm": 0.4410285949707031, + "learning_rate": 9.917903787405653e-05, + "loss": 1.0802, + "step": 963 + }, + { + "epoch": 0.08613487613643979, + "grad_norm": 0.4716387689113617, + "learning_rate": 9.917642430531155e-05, + "loss": 1.165, + "step": 964 + }, + { + "epoch": 0.08622422766770166, + "grad_norm": 0.38331499695777893, + "learning_rate": 9.917380661751685e-05, + "loss": 1.0947, + "step": 965 + }, + { + "epoch": 0.08631357919896353, + "grad_norm": 0.47889336943626404, + "learning_rate": 9.917118481089169e-05, + "loss": 1.0482, + "step": 966 + }, + { + "epoch": 0.08640293073022538, + "grad_norm": 0.4541834890842438, + "learning_rate": 9.916855888565569e-05, + "loss": 1.0942, + "step": 967 + }, + { + "epoch": 0.08649228226148725, + "grad_norm": 0.4245148003101349, + "learning_rate": 9.916592884202878e-05, + "loss": 1.0635, + "step": 968 + }, + { + "epoch": 0.08658163379274912, + "grad_norm": 0.40217721462249756, + "learning_rate": 9.916329468023124e-05, + "loss": 1.0662, + "step": 969 + }, + { + "epoch": 0.08667098532401099, + "grad_norm": 0.4782769978046417, + "learning_rate": 9.916065640048374e-05, + "loss": 1.0232, + "step": 970 + }, + { + "epoch": 0.08676033685527286, + "grad_norm": 0.4472961127758026, + "learning_rate": 9.915801400300727e-05, + "loss": 1.0723, + "step": 971 + }, + { + "epoch": 0.08684968838653473, + "grad_norm": 0.4397634267807007, + "learning_rate": 9.91553674880231e-05, + "loss": 1.1551, + "step": 972 + }, + { + "epoch": 0.08693903991779658, + "grad_norm": 0.481342077255249, + "learning_rate": 9.915271685575297e-05, + "loss": 1.1133, + "step": 973 + }, + { + "epoch": 0.08702839144905845, + "grad_norm": 0.3950006365776062, + "learning_rate": 9.915006210641886e-05, + "loss": 1.0542, + "step": 974 + }, + { + "epoch": 0.08711774298032032, + "grad_norm": 0.46636098623275757, + "learning_rate": 9.914740324024316e-05, + "loss": 1.0893, + "step": 975 + }, + { + "epoch": 0.08720709451158219, + "grad_norm": 0.48313936591148376, + "learning_rate": 9.914474025744856e-05, + "loss": 1.0318, + "step": 976 + }, + { + "epoch": 0.08729644604284406, + "grad_norm": 0.480648934841156, + "learning_rate": 9.914207315825812e-05, + "loss": 1.1249, + "step": 977 + }, + { + "epoch": 0.08738579757410593, + "grad_norm": 0.4614575207233429, + "learning_rate": 9.913940194289524e-05, + "loss": 1.133, + "step": 978 + }, + { + "epoch": 0.0874751491053678, + "grad_norm": 0.4390462040901184, + "learning_rate": 9.913672661158364e-05, + "loss": 1.1004, + "step": 979 + }, + { + "epoch": 0.08756450063662966, + "grad_norm": 0.5013588070869446, + "learning_rate": 9.913404716454744e-05, + "loss": 1.1423, + "step": 980 + }, + { + "epoch": 0.08765385216789152, + "grad_norm": 0.40994441509246826, + "learning_rate": 9.913136360201106e-05, + "loss": 1.058, + "step": 981 + }, + { + "epoch": 0.0877432036991534, + "grad_norm": 0.4297054409980774, + "learning_rate": 9.912867592419928e-05, + "loss": 1.0135, + "step": 982 + }, + { + "epoch": 0.08783255523041526, + "grad_norm": 0.4328972399234772, + "learning_rate": 9.91259841313372e-05, + "loss": 1.1261, + "step": 983 + }, + { + "epoch": 0.08792190676167713, + "grad_norm": 0.42971837520599365, + "learning_rate": 9.912328822365033e-05, + "loss": 1.1578, + "step": 984 + }, + { + "epoch": 0.088011258292939, + "grad_norm": 0.4185878336429596, + "learning_rate": 9.912058820136443e-05, + "loss": 1.0351, + "step": 985 + }, + { + "epoch": 0.08810060982420086, + "grad_norm": 0.4461815655231476, + "learning_rate": 9.911788406470569e-05, + "loss": 1.0864, + "step": 986 + }, + { + "epoch": 0.08818996135546273, + "grad_norm": 0.402701199054718, + "learning_rate": 9.911517581390059e-05, + "loss": 1.1004, + "step": 987 + }, + { + "epoch": 0.0882793128867246, + "grad_norm": 0.4615456163883209, + "learning_rate": 9.9112463449176e-05, + "loss": 1.0427, + "step": 988 + }, + { + "epoch": 0.08836866441798646, + "grad_norm": 0.49826958775520325, + "learning_rate": 9.91097469707591e-05, + "loss": 1.0445, + "step": 989 + }, + { + "epoch": 0.08845801594924833, + "grad_norm": 0.5109001994132996, + "learning_rate": 9.91070263788774e-05, + "loss": 1.0151, + "step": 990 + }, + { + "epoch": 0.0885473674805102, + "grad_norm": 0.4601687788963318, + "learning_rate": 9.910430167375881e-05, + "loss": 0.9897, + "step": 991 + }, + { + "epoch": 0.08863671901177206, + "grad_norm": 0.48032835125923157, + "learning_rate": 9.910157285563154e-05, + "loss": 1.1166, + "step": 992 + }, + { + "epoch": 0.08872607054303393, + "grad_norm": 0.3616820275783539, + "learning_rate": 9.909883992472415e-05, + "loss": 1.1109, + "step": 993 + }, + { + "epoch": 0.0888154220742958, + "grad_norm": 0.5358753204345703, + "learning_rate": 9.909610288126557e-05, + "loss": 1.0476, + "step": 994 + }, + { + "epoch": 0.08890477360555767, + "grad_norm": 0.4149110019207001, + "learning_rate": 9.909336172548505e-05, + "loss": 1.068, + "step": 995 + }, + { + "epoch": 0.08899412513681954, + "grad_norm": 0.43261730670928955, + "learning_rate": 9.909061645761217e-05, + "loss": 1.0922, + "step": 996 + }, + { + "epoch": 0.0890834766680814, + "grad_norm": 0.504966139793396, + "learning_rate": 9.90878670778769e-05, + "loss": 1.0179, + "step": 997 + }, + { + "epoch": 0.08917282819934326, + "grad_norm": 0.42445361614227295, + "learning_rate": 9.908511358650953e-05, + "loss": 1.0927, + "step": 998 + }, + { + "epoch": 0.08926217973060513, + "grad_norm": 0.4488779306411743, + "learning_rate": 9.908235598374069e-05, + "loss": 1.0652, + "step": 999 + }, + { + "epoch": 0.089351531261867, + "grad_norm": 0.4371225833892822, + "learning_rate": 9.907959426980136e-05, + "loss": 1.0874, + "step": 1000 + }, + { + "epoch": 0.08944088279312887, + "grad_norm": 0.3913322389125824, + "learning_rate": 9.907682844492284e-05, + "loss": 1.0903, + "step": 1001 + }, + { + "epoch": 0.08953023432439074, + "grad_norm": 0.38679686188697815, + "learning_rate": 9.907405850933681e-05, + "loss": 1.0884, + "step": 1002 + }, + { + "epoch": 0.0896195858556526, + "grad_norm": 0.4290361702442169, + "learning_rate": 9.907128446327531e-05, + "loss": 1.087, + "step": 1003 + }, + { + "epoch": 0.08970893738691448, + "grad_norm": 0.42051953077316284, + "learning_rate": 9.906850630697068e-05, + "loss": 1.1921, + "step": 1004 + }, + { + "epoch": 0.08979828891817633, + "grad_norm": 0.4305054247379303, + "learning_rate": 9.90657240406556e-05, + "loss": 1.1087, + "step": 1005 + }, + { + "epoch": 0.0898876404494382, + "grad_norm": 0.4126817286014557, + "learning_rate": 9.906293766456312e-05, + "loss": 1.0889, + "step": 1006 + }, + { + "epoch": 0.08997699198070007, + "grad_norm": 0.4437747895717621, + "learning_rate": 9.906014717892666e-05, + "loss": 1.0852, + "step": 1007 + }, + { + "epoch": 0.09006634351196194, + "grad_norm": 0.40008100867271423, + "learning_rate": 9.905735258397993e-05, + "loss": 1.0639, + "step": 1008 + }, + { + "epoch": 0.09015569504322381, + "grad_norm": 0.4763084650039673, + "learning_rate": 9.905455387995699e-05, + "loss": 0.9696, + "step": 1009 + }, + { + "epoch": 0.09024504657448568, + "grad_norm": 0.5551483631134033, + "learning_rate": 9.905175106709228e-05, + "loss": 1.0921, + "step": 1010 + }, + { + "epoch": 0.09033439810574753, + "grad_norm": 0.398017555475235, + "learning_rate": 9.904894414562056e-05, + "loss": 1.0868, + "step": 1011 + }, + { + "epoch": 0.0904237496370094, + "grad_norm": 0.41361182928085327, + "learning_rate": 9.904613311577695e-05, + "loss": 1.0827, + "step": 1012 + }, + { + "epoch": 0.09051310116827127, + "grad_norm": 0.43735066056251526, + "learning_rate": 9.90433179777969e-05, + "loss": 1.0752, + "step": 1013 + }, + { + "epoch": 0.09060245269953314, + "grad_norm": 0.43464162945747375, + "learning_rate": 9.904049873191621e-05, + "loss": 1.1453, + "step": 1014 + }, + { + "epoch": 0.09069180423079501, + "grad_norm": 0.42488962411880493, + "learning_rate": 9.903767537837101e-05, + "loss": 1.0359, + "step": 1015 + }, + { + "epoch": 0.09078115576205688, + "grad_norm": 0.4419018626213074, + "learning_rate": 9.90348479173978e-05, + "loss": 1.0873, + "step": 1016 + }, + { + "epoch": 0.09087050729331873, + "grad_norm": 0.45977696776390076, + "learning_rate": 9.903201634923338e-05, + "loss": 1.0001, + "step": 1017 + }, + { + "epoch": 0.0909598588245806, + "grad_norm": 0.4522443115711212, + "learning_rate": 9.902918067411497e-05, + "loss": 1.0555, + "step": 1018 + }, + { + "epoch": 0.09104921035584247, + "grad_norm": 0.43287041783332825, + "learning_rate": 9.902634089228007e-05, + "loss": 1.0233, + "step": 1019 + }, + { + "epoch": 0.09113856188710434, + "grad_norm": 0.5111227035522461, + "learning_rate": 9.902349700396651e-05, + "loss": 0.9717, + "step": 1020 + }, + { + "epoch": 0.09122791341836621, + "grad_norm": 0.44105076789855957, + "learning_rate": 9.902064900941255e-05, + "loss": 0.9953, + "step": 1021 + }, + { + "epoch": 0.09131726494962808, + "grad_norm": 0.46010032296180725, + "learning_rate": 9.90177969088567e-05, + "loss": 1.0938, + "step": 1022 + }, + { + "epoch": 0.09140661648088994, + "grad_norm": 0.4434414803981781, + "learning_rate": 9.901494070253788e-05, + "loss": 1.1415, + "step": 1023 + }, + { + "epoch": 0.0914959680121518, + "grad_norm": 0.4490472972393036, + "learning_rate": 9.90120803906953e-05, + "loss": 1.0611, + "step": 1024 + }, + { + "epoch": 0.09158531954341367, + "grad_norm": 0.3900289833545685, + "learning_rate": 9.900921597356856e-05, + "loss": 1.1096, + "step": 1025 + }, + { + "epoch": 0.09167467107467554, + "grad_norm": 0.478982150554657, + "learning_rate": 9.900634745139758e-05, + "loss": 1.0677, + "step": 1026 + }, + { + "epoch": 0.09176402260593741, + "grad_norm": 0.37474942207336426, + "learning_rate": 9.900347482442262e-05, + "loss": 1.0386, + "step": 1027 + }, + { + "epoch": 0.09185337413719928, + "grad_norm": 0.5137802958488464, + "learning_rate": 9.900059809288431e-05, + "loss": 1.0332, + "step": 1028 + }, + { + "epoch": 0.09194272566846114, + "grad_norm": 0.4314192831516266, + "learning_rate": 9.899771725702362e-05, + "loss": 1.0801, + "step": 1029 + }, + { + "epoch": 0.092032077199723, + "grad_norm": 0.48988327383995056, + "learning_rate": 9.899483231708181e-05, + "loss": 1.0857, + "step": 1030 + }, + { + "epoch": 0.09212142873098488, + "grad_norm": 0.4243643283843994, + "learning_rate": 9.899194327330056e-05, + "loss": 1.096, + "step": 1031 + }, + { + "epoch": 0.09221078026224674, + "grad_norm": 0.4380730986595154, + "learning_rate": 9.898905012592183e-05, + "loss": 1.0709, + "step": 1032 + }, + { + "epoch": 0.09230013179350861, + "grad_norm": 0.4659455716609955, + "learning_rate": 9.898615287518798e-05, + "loss": 1.0621, + "step": 1033 + }, + { + "epoch": 0.09238948332477048, + "grad_norm": 0.46971428394317627, + "learning_rate": 9.898325152134167e-05, + "loss": 1.1165, + "step": 1034 + }, + { + "epoch": 0.09247883485603235, + "grad_norm": 0.47089582681655884, + "learning_rate": 9.898034606462592e-05, + "loss": 1.1352, + "step": 1035 + }, + { + "epoch": 0.09256818638729421, + "grad_norm": 0.44664955139160156, + "learning_rate": 9.897743650528408e-05, + "loss": 1.0728, + "step": 1036 + }, + { + "epoch": 0.09265753791855608, + "grad_norm": 0.4320451617240906, + "learning_rate": 9.897452284355989e-05, + "loss": 1.1007, + "step": 1037 + }, + { + "epoch": 0.09274688944981795, + "grad_norm": 0.4943344295024872, + "learning_rate": 9.897160507969738e-05, + "loss": 1.0915, + "step": 1038 + }, + { + "epoch": 0.09283624098107982, + "grad_norm": 0.43696850538253784, + "learning_rate": 9.896868321394093e-05, + "loss": 1.1057, + "step": 1039 + }, + { + "epoch": 0.09292559251234168, + "grad_norm": 0.39653998613357544, + "learning_rate": 9.896575724653529e-05, + "loss": 1.0422, + "step": 1040 + }, + { + "epoch": 0.09301494404360355, + "grad_norm": 0.4009700417518616, + "learning_rate": 9.896282717772556e-05, + "loss": 1.0917, + "step": 1041 + }, + { + "epoch": 0.09310429557486541, + "grad_norm": 0.4337768852710724, + "learning_rate": 9.895989300775714e-05, + "loss": 1.089, + "step": 1042 + }, + { + "epoch": 0.09319364710612728, + "grad_norm": 0.4776664078235626, + "learning_rate": 9.895695473687581e-05, + "loss": 1.0316, + "step": 1043 + }, + { + "epoch": 0.09328299863738915, + "grad_norm": 0.4160580337047577, + "learning_rate": 9.895401236532769e-05, + "loss": 1.0437, + "step": 1044 + }, + { + "epoch": 0.09337235016865102, + "grad_norm": 0.4131687879562378, + "learning_rate": 9.895106589335919e-05, + "loss": 1.0765, + "step": 1045 + }, + { + "epoch": 0.09346170169991289, + "grad_norm": 0.39853063225746155, + "learning_rate": 9.894811532121716e-05, + "loss": 1.0726, + "step": 1046 + }, + { + "epoch": 0.09355105323117476, + "grad_norm": 0.46733424067497253, + "learning_rate": 9.894516064914871e-05, + "loss": 1.0561, + "step": 1047 + }, + { + "epoch": 0.09364040476243661, + "grad_norm": 0.3866161108016968, + "learning_rate": 9.894220187740135e-05, + "loss": 1.0816, + "step": 1048 + }, + { + "epoch": 0.09372975629369848, + "grad_norm": 0.42076459527015686, + "learning_rate": 9.89392390062229e-05, + "loss": 0.9866, + "step": 1049 + }, + { + "epoch": 0.09381910782496035, + "grad_norm": 0.4648950695991516, + "learning_rate": 9.893627203586152e-05, + "loss": 1.0732, + "step": 1050 + }, + { + "epoch": 0.09390845935622222, + "grad_norm": 0.4551086723804474, + "learning_rate": 9.893330096656574e-05, + "loss": 1.1067, + "step": 1051 + }, + { + "epoch": 0.09399781088748409, + "grad_norm": 0.43443188071250916, + "learning_rate": 9.893032579858442e-05, + "loss": 1.0777, + "step": 1052 + }, + { + "epoch": 0.09408716241874596, + "grad_norm": 0.4357977509498596, + "learning_rate": 9.892734653216673e-05, + "loss": 0.995, + "step": 1053 + }, + { + "epoch": 0.09417651395000781, + "grad_norm": 0.4856785833835602, + "learning_rate": 9.892436316756226e-05, + "loss": 1.0041, + "step": 1054 + }, + { + "epoch": 0.09426586548126968, + "grad_norm": 0.4476493299007416, + "learning_rate": 9.892137570502087e-05, + "loss": 1.0459, + "step": 1055 + }, + { + "epoch": 0.09435521701253155, + "grad_norm": 0.4645354449748993, + "learning_rate": 9.89183841447928e-05, + "loss": 1.1058, + "step": 1056 + }, + { + "epoch": 0.09444456854379342, + "grad_norm": 0.45503050088882446, + "learning_rate": 9.891538848712863e-05, + "loss": 1.0721, + "step": 1057 + }, + { + "epoch": 0.09453392007505529, + "grad_norm": 0.4932888448238373, + "learning_rate": 9.891238873227925e-05, + "loss": 1.0064, + "step": 1058 + }, + { + "epoch": 0.09462327160631716, + "grad_norm": 0.43276292085647583, + "learning_rate": 9.890938488049597e-05, + "loss": 1.1481, + "step": 1059 + }, + { + "epoch": 0.09471262313757901, + "grad_norm": 0.4331585764884949, + "learning_rate": 9.890637693203038e-05, + "loss": 1.0468, + "step": 1060 + }, + { + "epoch": 0.09480197466884088, + "grad_norm": 0.5495041608810425, + "learning_rate": 9.89033648871344e-05, + "loss": 1.0436, + "step": 1061 + }, + { + "epoch": 0.09489132620010275, + "grad_norm": 0.47634175419807434, + "learning_rate": 9.890034874606033e-05, + "loss": 1.0175, + "step": 1062 + }, + { + "epoch": 0.09498067773136462, + "grad_norm": 0.400700181722641, + "learning_rate": 9.889732850906083e-05, + "loss": 1.1379, + "step": 1063 + }, + { + "epoch": 0.09507002926262649, + "grad_norm": 0.49066829681396484, + "learning_rate": 9.889430417638885e-05, + "loss": 1.115, + "step": 1064 + }, + { + "epoch": 0.09515938079388836, + "grad_norm": 0.3815631568431854, + "learning_rate": 9.889127574829773e-05, + "loss": 1.1274, + "step": 1065 + }, + { + "epoch": 0.09524873232515023, + "grad_norm": 0.5222175717353821, + "learning_rate": 9.88882432250411e-05, + "loss": 1.0232, + "step": 1066 + }, + { + "epoch": 0.09533808385641208, + "grad_norm": 0.4751056134700775, + "learning_rate": 9.888520660687302e-05, + "loss": 1.1066, + "step": 1067 + }, + { + "epoch": 0.09542743538767395, + "grad_norm": 0.5163382887840271, + "learning_rate": 9.888216589404779e-05, + "loss": 1.0504, + "step": 1068 + }, + { + "epoch": 0.09551678691893582, + "grad_norm": 0.5940535068511963, + "learning_rate": 9.887912108682011e-05, + "loss": 1.0346, + "step": 1069 + }, + { + "epoch": 0.09560613845019769, + "grad_norm": 0.38179200887680054, + "learning_rate": 9.887607218544503e-05, + "loss": 1.1381, + "step": 1070 + }, + { + "epoch": 0.09569548998145956, + "grad_norm": 0.3916855454444885, + "learning_rate": 9.887301919017794e-05, + "loss": 1.1062, + "step": 1071 + }, + { + "epoch": 0.09578484151272143, + "grad_norm": 0.48447951674461365, + "learning_rate": 9.886996210127452e-05, + "loss": 1.0219, + "step": 1072 + }, + { + "epoch": 0.09587419304398329, + "grad_norm": 0.4082842171192169, + "learning_rate": 9.886690091899088e-05, + "loss": 1.0546, + "step": 1073 + }, + { + "epoch": 0.09596354457524515, + "grad_norm": 0.4271855354309082, + "learning_rate": 9.886383564358339e-05, + "loss": 1.05, + "step": 1074 + }, + { + "epoch": 0.09605289610650702, + "grad_norm": 0.37129080295562744, + "learning_rate": 9.886076627530883e-05, + "loss": 1.0979, + "step": 1075 + }, + { + "epoch": 0.0961422476377689, + "grad_norm": 0.4285633862018585, + "learning_rate": 9.885769281442426e-05, + "loss": 1.0915, + "step": 1076 + }, + { + "epoch": 0.09623159916903076, + "grad_norm": 0.5488273501396179, + "learning_rate": 9.885461526118713e-05, + "loss": 0.9937, + "step": 1077 + }, + { + "epoch": 0.09632095070029263, + "grad_norm": 0.42456749081611633, + "learning_rate": 9.885153361585523e-05, + "loss": 1.1105, + "step": 1078 + }, + { + "epoch": 0.09641030223155449, + "grad_norm": 0.38298559188842773, + "learning_rate": 9.884844787868667e-05, + "loss": 1.1525, + "step": 1079 + }, + { + "epoch": 0.09649965376281636, + "grad_norm": 0.46398019790649414, + "learning_rate": 9.884535804993991e-05, + "loss": 1.0575, + "step": 1080 + }, + { + "epoch": 0.09658900529407823, + "grad_norm": 0.47431719303131104, + "learning_rate": 9.884226412987375e-05, + "loss": 1.0777, + "step": 1081 + }, + { + "epoch": 0.0966783568253401, + "grad_norm": 0.4811946451663971, + "learning_rate": 9.883916611874735e-05, + "loss": 1.0186, + "step": 1082 + }, + { + "epoch": 0.09676770835660196, + "grad_norm": 0.4855489730834961, + "learning_rate": 9.883606401682022e-05, + "loss": 1.1432, + "step": 1083 + }, + { + "epoch": 0.09685705988786383, + "grad_norm": 0.423043817281723, + "learning_rate": 9.883295782435216e-05, + "loss": 1.0769, + "step": 1084 + }, + { + "epoch": 0.09694641141912569, + "grad_norm": 0.5006414651870728, + "learning_rate": 9.882984754160334e-05, + "loss": 1.0417, + "step": 1085 + }, + { + "epoch": 0.09703576295038756, + "grad_norm": 0.41438305377960205, + "learning_rate": 9.882673316883432e-05, + "loss": 1.091, + "step": 1086 + }, + { + "epoch": 0.09712511448164943, + "grad_norm": 0.4482746124267578, + "learning_rate": 9.882361470630594e-05, + "loss": 1.0692, + "step": 1087 + }, + { + "epoch": 0.0972144660129113, + "grad_norm": 0.43151143193244934, + "learning_rate": 9.882049215427941e-05, + "loss": 1.0767, + "step": 1088 + }, + { + "epoch": 0.09730381754417317, + "grad_norm": 0.47808754444122314, + "learning_rate": 9.881736551301627e-05, + "loss": 1.1291, + "step": 1089 + }, + { + "epoch": 0.09739316907543503, + "grad_norm": 0.3970921039581299, + "learning_rate": 9.88142347827784e-05, + "loss": 1.0847, + "step": 1090 + }, + { + "epoch": 0.0974825206066969, + "grad_norm": 0.4331444501876831, + "learning_rate": 9.881109996382807e-05, + "loss": 1.0468, + "step": 1091 + }, + { + "epoch": 0.09757187213795876, + "grad_norm": 0.48726001381874084, + "learning_rate": 9.880796105642782e-05, + "loss": 1.0272, + "step": 1092 + }, + { + "epoch": 0.09766122366922063, + "grad_norm": 0.39687567949295044, + "learning_rate": 9.880481806084057e-05, + "loss": 1.0803, + "step": 1093 + }, + { + "epoch": 0.0977505752004825, + "grad_norm": 0.510676920413971, + "learning_rate": 9.880167097732957e-05, + "loss": 0.9923, + "step": 1094 + }, + { + "epoch": 0.09783992673174437, + "grad_norm": 0.4363247752189636, + "learning_rate": 9.879851980615847e-05, + "loss": 1.0961, + "step": 1095 + }, + { + "epoch": 0.09792927826300624, + "grad_norm": 0.4566675126552582, + "learning_rate": 9.879536454759115e-05, + "loss": 0.9882, + "step": 1096 + }, + { + "epoch": 0.0980186297942681, + "grad_norm": 0.3779030442237854, + "learning_rate": 9.879220520189195e-05, + "loss": 1.1436, + "step": 1097 + }, + { + "epoch": 0.09810798132552996, + "grad_norm": 0.45519158244132996, + "learning_rate": 9.878904176932546e-05, + "loss": 1.0497, + "step": 1098 + }, + { + "epoch": 0.09819733285679183, + "grad_norm": 0.3910900056362152, + "learning_rate": 9.878587425015668e-05, + "loss": 1.0654, + "step": 1099 + }, + { + "epoch": 0.0982866843880537, + "grad_norm": 0.45868462324142456, + "learning_rate": 9.878270264465091e-05, + "loss": 1.0361, + "step": 1100 + }, + { + "epoch": 0.09837603591931557, + "grad_norm": 0.435067743062973, + "learning_rate": 9.877952695307382e-05, + "loss": 1.0632, + "step": 1101 + }, + { + "epoch": 0.09846538745057744, + "grad_norm": 0.5049869418144226, + "learning_rate": 9.877634717569137e-05, + "loss": 0.9519, + "step": 1102 + }, + { + "epoch": 0.09855473898183931, + "grad_norm": 0.48381614685058594, + "learning_rate": 9.877316331276995e-05, + "loss": 1.0368, + "step": 1103 + }, + { + "epoch": 0.09864409051310116, + "grad_norm": 0.4501636326313019, + "learning_rate": 9.876997536457619e-05, + "loss": 0.9595, + "step": 1104 + }, + { + "epoch": 0.09873344204436303, + "grad_norm": 0.4924948811531067, + "learning_rate": 9.876678333137716e-05, + "loss": 1.0592, + "step": 1105 + }, + { + "epoch": 0.0988227935756249, + "grad_norm": 0.387658029794693, + "learning_rate": 9.876358721344022e-05, + "loss": 1.0821, + "step": 1106 + }, + { + "epoch": 0.09891214510688677, + "grad_norm": 0.5033579468727112, + "learning_rate": 9.876038701103305e-05, + "loss": 1.0504, + "step": 1107 + }, + { + "epoch": 0.09900149663814864, + "grad_norm": 0.47466421127319336, + "learning_rate": 9.875718272442373e-05, + "loss": 1.009, + "step": 1108 + }, + { + "epoch": 0.09909084816941051, + "grad_norm": 0.43417561054229736, + "learning_rate": 9.875397435388063e-05, + "loss": 1.0584, + "step": 1109 + }, + { + "epoch": 0.09918019970067236, + "grad_norm": 0.4432922899723053, + "learning_rate": 9.875076189967252e-05, + "loss": 1.0067, + "step": 1110 + }, + { + "epoch": 0.09926955123193423, + "grad_norm": 0.45428282022476196, + "learning_rate": 9.874754536206843e-05, + "loss": 1.1137, + "step": 1111 + }, + { + "epoch": 0.0993589027631961, + "grad_norm": 0.4388342797756195, + "learning_rate": 9.874432474133781e-05, + "loss": 1.0513, + "step": 1112 + }, + { + "epoch": 0.09944825429445797, + "grad_norm": 0.42276880145072937, + "learning_rate": 9.874110003775043e-05, + "loss": 1.0552, + "step": 1113 + }, + { + "epoch": 0.09953760582571984, + "grad_norm": 0.4311312735080719, + "learning_rate": 9.873787125157636e-05, + "loss": 1.0679, + "step": 1114 + }, + { + "epoch": 0.09962695735698171, + "grad_norm": 0.38914674520492554, + "learning_rate": 9.873463838308608e-05, + "loss": 1.0837, + "step": 1115 + }, + { + "epoch": 0.09971630888824357, + "grad_norm": 0.46259498596191406, + "learning_rate": 9.873140143255036e-05, + "loss": 1.0772, + "step": 1116 + }, + { + "epoch": 0.09980566041950543, + "grad_norm": 0.4010103642940521, + "learning_rate": 9.872816040024034e-05, + "loss": 1.0801, + "step": 1117 + }, + { + "epoch": 0.0998950119507673, + "grad_norm": 0.3827916383743286, + "learning_rate": 9.872491528642746e-05, + "loss": 1.1128, + "step": 1118 + }, + { + "epoch": 0.09998436348202917, + "grad_norm": 0.41605910658836365, + "learning_rate": 9.872166609138355e-05, + "loss": 1.0274, + "step": 1119 + }, + { + "epoch": 0.10007371501329104, + "grad_norm": 0.44363489747047424, + "learning_rate": 9.87184128153808e-05, + "loss": 1.0553, + "step": 1120 + }, + { + "epoch": 0.10016306654455291, + "grad_norm": 0.4547342360019684, + "learning_rate": 9.871515545869166e-05, + "loss": 1.0827, + "step": 1121 + }, + { + "epoch": 0.10025241807581478, + "grad_norm": 0.4331105649471283, + "learning_rate": 9.871189402158898e-05, + "loss": 1.0772, + "step": 1122 + }, + { + "epoch": 0.10034176960707664, + "grad_norm": 0.47358986735343933, + "learning_rate": 9.870862850434593e-05, + "loss": 1.124, + "step": 1123 + }, + { + "epoch": 0.1004311211383385, + "grad_norm": 0.4455796480178833, + "learning_rate": 9.870535890723607e-05, + "loss": 1.0435, + "step": 1124 + }, + { + "epoch": 0.10052047266960037, + "grad_norm": 0.42114776372909546, + "learning_rate": 9.870208523053323e-05, + "loss": 1.1176, + "step": 1125 + }, + { + "epoch": 0.10060982420086224, + "grad_norm": 0.4320701062679291, + "learning_rate": 9.869880747451164e-05, + "loss": 1.0214, + "step": 1126 + }, + { + "epoch": 0.10069917573212411, + "grad_norm": 0.41799092292785645, + "learning_rate": 9.86955256394458e-05, + "loss": 1.0928, + "step": 1127 + }, + { + "epoch": 0.10078852726338598, + "grad_norm": 0.4450794458389282, + "learning_rate": 9.869223972561066e-05, + "loss": 1.1019, + "step": 1128 + }, + { + "epoch": 0.10087787879464784, + "grad_norm": 0.42303231358528137, + "learning_rate": 9.86889497332814e-05, + "loss": 1.0618, + "step": 1129 + }, + { + "epoch": 0.10096723032590971, + "grad_norm": 0.5273473858833313, + "learning_rate": 9.868565566273363e-05, + "loss": 1.0161, + "step": 1130 + }, + { + "epoch": 0.10105658185717158, + "grad_norm": 0.4437713921070099, + "learning_rate": 9.868235751424324e-05, + "loss": 1.0227, + "step": 1131 + }, + { + "epoch": 0.10114593338843345, + "grad_norm": 0.41443753242492676, + "learning_rate": 9.86790552880865e-05, + "loss": 1.036, + "step": 1132 + }, + { + "epoch": 0.10123528491969531, + "grad_norm": 0.5034554600715637, + "learning_rate": 9.867574898453998e-05, + "loss": 0.9698, + "step": 1133 + }, + { + "epoch": 0.10132463645095718, + "grad_norm": 0.4698033630847931, + "learning_rate": 9.867243860388065e-05, + "loss": 1.0825, + "step": 1134 + }, + { + "epoch": 0.10141398798221904, + "grad_norm": 0.48983025550842285, + "learning_rate": 9.866912414638577e-05, + "loss": 1.0424, + "step": 1135 + }, + { + "epoch": 0.10150333951348091, + "grad_norm": 0.5005711317062378, + "learning_rate": 9.866580561233297e-05, + "loss": 1.0549, + "step": 1136 + }, + { + "epoch": 0.10159269104474278, + "grad_norm": 0.44725582003593445, + "learning_rate": 9.866248300200021e-05, + "loss": 1.0877, + "step": 1137 + }, + { + "epoch": 0.10168204257600465, + "grad_norm": 0.4655127227306366, + "learning_rate": 9.865915631566579e-05, + "loss": 1.1502, + "step": 1138 + }, + { + "epoch": 0.10177139410726652, + "grad_norm": 0.4426875114440918, + "learning_rate": 9.865582555360837e-05, + "loss": 1.1273, + "step": 1139 + }, + { + "epoch": 0.10186074563852839, + "grad_norm": 0.4345450699329376, + "learning_rate": 9.865249071610691e-05, + "loss": 1.1256, + "step": 1140 + }, + { + "epoch": 0.10195009716979024, + "grad_norm": 0.4371299743652344, + "learning_rate": 9.864915180344076e-05, + "loss": 1.0296, + "step": 1141 + }, + { + "epoch": 0.10203944870105211, + "grad_norm": 0.44082698225975037, + "learning_rate": 9.864580881588959e-05, + "loss": 1.0628, + "step": 1142 + }, + { + "epoch": 0.10212880023231398, + "grad_norm": 0.5174263715744019, + "learning_rate": 9.864246175373339e-05, + "loss": 0.9576, + "step": 1143 + }, + { + "epoch": 0.10221815176357585, + "grad_norm": 0.42104923725128174, + "learning_rate": 9.863911061725256e-05, + "loss": 1.017, + "step": 1144 + }, + { + "epoch": 0.10230750329483772, + "grad_norm": 0.39812588691711426, + "learning_rate": 9.863575540672772e-05, + "loss": 1.0241, + "step": 1145 + }, + { + "epoch": 0.10239685482609959, + "grad_norm": 0.5662054419517517, + "learning_rate": 9.863239612243997e-05, + "loss": 1.0541, + "step": 1146 + }, + { + "epoch": 0.10248620635736144, + "grad_norm": 0.4463253319263458, + "learning_rate": 9.862903276467066e-05, + "loss": 1.0583, + "step": 1147 + }, + { + "epoch": 0.10257555788862331, + "grad_norm": 0.3903212249279022, + "learning_rate": 9.86256653337015e-05, + "loss": 1.1279, + "step": 1148 + }, + { + "epoch": 0.10266490941988518, + "grad_norm": 0.518621563911438, + "learning_rate": 9.862229382981454e-05, + "loss": 1.0526, + "step": 1149 + }, + { + "epoch": 0.10275426095114705, + "grad_norm": 0.5142053365707397, + "learning_rate": 9.861891825329222e-05, + "loss": 1.0619, + "step": 1150 + }, + { + "epoch": 0.10284361248240892, + "grad_norm": 0.45500150322914124, + "learning_rate": 9.861553860441725e-05, + "loss": 1.0992, + "step": 1151 + }, + { + "epoch": 0.10293296401367079, + "grad_norm": 0.4826822578907013, + "learning_rate": 9.861215488347272e-05, + "loss": 1.0001, + "step": 1152 + }, + { + "epoch": 0.10302231554493266, + "grad_norm": 0.44567134976387024, + "learning_rate": 9.860876709074204e-05, + "loss": 1.1272, + "step": 1153 + }, + { + "epoch": 0.10311166707619451, + "grad_norm": 0.4271107614040375, + "learning_rate": 9.860537522650898e-05, + "loss": 1.0903, + "step": 1154 + }, + { + "epoch": 0.10320101860745638, + "grad_norm": 0.4095713198184967, + "learning_rate": 9.860197929105768e-05, + "loss": 1.0451, + "step": 1155 + }, + { + "epoch": 0.10329037013871825, + "grad_norm": 0.42766252160072327, + "learning_rate": 9.859857928467254e-05, + "loss": 1.0661, + "step": 1156 + }, + { + "epoch": 0.10337972166998012, + "grad_norm": 0.4731602072715759, + "learning_rate": 9.859517520763834e-05, + "loss": 1.0923, + "step": 1157 + }, + { + "epoch": 0.10346907320124199, + "grad_norm": 0.5062028169631958, + "learning_rate": 9.859176706024024e-05, + "loss": 0.9769, + "step": 1158 + }, + { + "epoch": 0.10355842473250386, + "grad_norm": 0.433315634727478, + "learning_rate": 9.858835484276369e-05, + "loss": 1.0987, + "step": 1159 + }, + { + "epoch": 0.10364777626376571, + "grad_norm": 0.4237388074398041, + "learning_rate": 9.858493855549453e-05, + "loss": 1.0466, + "step": 1160 + }, + { + "epoch": 0.10373712779502758, + "grad_norm": 0.45744040608406067, + "learning_rate": 9.858151819871888e-05, + "loss": 1.039, + "step": 1161 + }, + { + "epoch": 0.10382647932628945, + "grad_norm": 0.5162191390991211, + "learning_rate": 9.857809377272323e-05, + "loss": 1.0173, + "step": 1162 + }, + { + "epoch": 0.10391583085755132, + "grad_norm": 0.4239135682582855, + "learning_rate": 9.857466527779444e-05, + "loss": 1.044, + "step": 1163 + }, + { + "epoch": 0.10400518238881319, + "grad_norm": 0.5142119526863098, + "learning_rate": 9.857123271421964e-05, + "loss": 1.0666, + "step": 1164 + }, + { + "epoch": 0.10409453392007506, + "grad_norm": 0.49715521931648254, + "learning_rate": 9.856779608228638e-05, + "loss": 1.0999, + "step": 1165 + }, + { + "epoch": 0.10418388545133692, + "grad_norm": 0.4236762523651123, + "learning_rate": 9.856435538228251e-05, + "loss": 1.0383, + "step": 1166 + }, + { + "epoch": 0.10427323698259879, + "grad_norm": 0.4198687970638275, + "learning_rate": 9.85609106144962e-05, + "loss": 1.0291, + "step": 1167 + }, + { + "epoch": 0.10436258851386065, + "grad_norm": 0.4170401990413666, + "learning_rate": 9.8557461779216e-05, + "loss": 1.056, + "step": 1168 + }, + { + "epoch": 0.10445194004512252, + "grad_norm": 0.4177343547344208, + "learning_rate": 9.855400887673082e-05, + "loss": 1.0697, + "step": 1169 + }, + { + "epoch": 0.1045412915763844, + "grad_norm": 0.4959930181503296, + "learning_rate": 9.855055190732983e-05, + "loss": 1.1104, + "step": 1170 + }, + { + "epoch": 0.10463064310764626, + "grad_norm": 0.4635033905506134, + "learning_rate": 9.85470908713026e-05, + "loss": 0.9893, + "step": 1171 + }, + { + "epoch": 0.10471999463890812, + "grad_norm": 0.42118409276008606, + "learning_rate": 9.854362576893905e-05, + "loss": 1.0985, + "step": 1172 + }, + { + "epoch": 0.10480934617016999, + "grad_norm": 0.41519343852996826, + "learning_rate": 9.85401566005294e-05, + "loss": 1.0862, + "step": 1173 + }, + { + "epoch": 0.10489869770143186, + "grad_norm": 0.42314785718917847, + "learning_rate": 9.853668336636422e-05, + "loss": 1.1004, + "step": 1174 + }, + { + "epoch": 0.10498804923269373, + "grad_norm": 0.4355357885360718, + "learning_rate": 9.853320606673446e-05, + "loss": 1.0732, + "step": 1175 + }, + { + "epoch": 0.1050774007639556, + "grad_norm": 0.3647025525569916, + "learning_rate": 9.852972470193136e-05, + "loss": 1.0505, + "step": 1176 + }, + { + "epoch": 0.10516675229521746, + "grad_norm": 0.4869689643383026, + "learning_rate": 9.852623927224653e-05, + "loss": 1.0354, + "step": 1177 + }, + { + "epoch": 0.10525610382647933, + "grad_norm": 0.38133373856544495, + "learning_rate": 9.85227497779719e-05, + "loss": 1.0863, + "step": 1178 + }, + { + "epoch": 0.10534545535774119, + "grad_norm": 0.5018585920333862, + "learning_rate": 9.851925621939976e-05, + "loss": 1.093, + "step": 1179 + }, + { + "epoch": 0.10543480688900306, + "grad_norm": 0.44408929347991943, + "learning_rate": 9.851575859682274e-05, + "loss": 1.0619, + "step": 1180 + }, + { + "epoch": 0.10552415842026493, + "grad_norm": 0.4483299255371094, + "learning_rate": 9.85122569105338e-05, + "loss": 1.0438, + "step": 1181 + }, + { + "epoch": 0.1056135099515268, + "grad_norm": 0.3859767019748688, + "learning_rate": 9.850875116082623e-05, + "loss": 1.1045, + "step": 1182 + }, + { + "epoch": 0.10570286148278867, + "grad_norm": 0.4310990571975708, + "learning_rate": 9.850524134799371e-05, + "loss": 1.0604, + "step": 1183 + }, + { + "epoch": 0.10579221301405053, + "grad_norm": 0.5007196664810181, + "learning_rate": 9.850172747233018e-05, + "loss": 0.9943, + "step": 1184 + }, + { + "epoch": 0.10588156454531239, + "grad_norm": 0.4328349530696869, + "learning_rate": 9.849820953412997e-05, + "loss": 1.0766, + "step": 1185 + }, + { + "epoch": 0.10597091607657426, + "grad_norm": 0.4742787480354309, + "learning_rate": 9.849468753368777e-05, + "loss": 1.0821, + "step": 1186 + }, + { + "epoch": 0.10606026760783613, + "grad_norm": 0.43033096194267273, + "learning_rate": 9.849116147129857e-05, + "loss": 1.0192, + "step": 1187 + }, + { + "epoch": 0.106149619139098, + "grad_norm": 0.416620671749115, + "learning_rate": 9.848763134725771e-05, + "loss": 1.1052, + "step": 1188 + }, + { + "epoch": 0.10623897067035987, + "grad_norm": 0.44439324736595154, + "learning_rate": 9.848409716186091e-05, + "loss": 1.0822, + "step": 1189 + }, + { + "epoch": 0.10632832220162174, + "grad_norm": 0.4544062316417694, + "learning_rate": 9.848055891540416e-05, + "loss": 1.0642, + "step": 1190 + }, + { + "epoch": 0.10641767373288359, + "grad_norm": 0.5025431513786316, + "learning_rate": 9.847701660818381e-05, + "loss": 0.9832, + "step": 1191 + }, + { + "epoch": 0.10650702526414546, + "grad_norm": 0.4252472519874573, + "learning_rate": 9.847347024049662e-05, + "loss": 1.0422, + "step": 1192 + }, + { + "epoch": 0.10659637679540733, + "grad_norm": 0.43434327840805054, + "learning_rate": 9.84699198126396e-05, + "loss": 1.1426, + "step": 1193 + }, + { + "epoch": 0.1066857283266692, + "grad_norm": 0.40715593099594116, + "learning_rate": 9.846636532491014e-05, + "loss": 1.0338, + "step": 1194 + }, + { + "epoch": 0.10677507985793107, + "grad_norm": 0.47930577397346497, + "learning_rate": 9.846280677760596e-05, + "loss": 1.1011, + "step": 1195 + }, + { + "epoch": 0.10686443138919294, + "grad_norm": 0.45403239130973816, + "learning_rate": 9.845924417102514e-05, + "loss": 1.0613, + "step": 1196 + }, + { + "epoch": 0.10695378292045479, + "grad_norm": 0.3767177164554596, + "learning_rate": 9.84556775054661e-05, + "loss": 1.1216, + "step": 1197 + }, + { + "epoch": 0.10704313445171666, + "grad_norm": 0.41710513830184937, + "learning_rate": 9.845210678122756e-05, + "loss": 1.0588, + "step": 1198 + }, + { + "epoch": 0.10713248598297853, + "grad_norm": 0.49159350991249084, + "learning_rate": 9.844853199860861e-05, + "loss": 1.0458, + "step": 1199 + }, + { + "epoch": 0.1072218375142404, + "grad_norm": 0.4163805842399597, + "learning_rate": 9.844495315790869e-05, + "loss": 1.0478, + "step": 1200 + }, + { + "epoch": 0.10731118904550227, + "grad_norm": 0.3860858678817749, + "learning_rate": 9.844137025942754e-05, + "loss": 1.0728, + "step": 1201 + }, + { + "epoch": 0.10740054057676414, + "grad_norm": 0.44660601019859314, + "learning_rate": 9.84377833034653e-05, + "loss": 1.1211, + "step": 1202 + }, + { + "epoch": 0.107489892108026, + "grad_norm": 0.4287334382534027, + "learning_rate": 9.843419229032238e-05, + "loss": 1.0144, + "step": 1203 + }, + { + "epoch": 0.10757924363928786, + "grad_norm": 0.4286378026008606, + "learning_rate": 9.843059722029959e-05, + "loss": 1.1338, + "step": 1204 + }, + { + "epoch": 0.10766859517054973, + "grad_norm": 0.3777160048484802, + "learning_rate": 9.842699809369806e-05, + "loss": 1.0877, + "step": 1205 + }, + { + "epoch": 0.1077579467018116, + "grad_norm": 0.40929606556892395, + "learning_rate": 9.842339491081924e-05, + "loss": 1.0229, + "step": 1206 + }, + { + "epoch": 0.10784729823307347, + "grad_norm": 0.40165361762046814, + "learning_rate": 9.841978767196495e-05, + "loss": 1.042, + "step": 1207 + }, + { + "epoch": 0.10793664976433534, + "grad_norm": 0.4323919713497162, + "learning_rate": 9.841617637743731e-05, + "loss": 1.0781, + "step": 1208 + }, + { + "epoch": 0.10802600129559721, + "grad_norm": 0.5180752873420715, + "learning_rate": 9.841256102753882e-05, + "loss": 1.1395, + "step": 1209 + }, + { + "epoch": 0.10811535282685907, + "grad_norm": 0.40795522928237915, + "learning_rate": 9.84089416225723e-05, + "loss": 1.0676, + "step": 1210 + }, + { + "epoch": 0.10820470435812093, + "grad_norm": 0.4201282560825348, + "learning_rate": 9.840531816284093e-05, + "loss": 1.0493, + "step": 1211 + }, + { + "epoch": 0.1082940558893828, + "grad_norm": 0.4624415636062622, + "learning_rate": 9.84016906486482e-05, + "loss": 1.0656, + "step": 1212 + }, + { + "epoch": 0.10838340742064467, + "grad_norm": 0.415123850107193, + "learning_rate": 9.839805908029795e-05, + "loss": 1.0556, + "step": 1213 + }, + { + "epoch": 0.10847275895190654, + "grad_norm": 0.41990217566490173, + "learning_rate": 9.839442345809435e-05, + "loss": 1.0851, + "step": 1214 + }, + { + "epoch": 0.10856211048316841, + "grad_norm": 0.43788111209869385, + "learning_rate": 9.839078378234196e-05, + "loss": 1.106, + "step": 1215 + }, + { + "epoch": 0.10865146201443027, + "grad_norm": 0.4749810993671417, + "learning_rate": 9.838714005334562e-05, + "loss": 1.0649, + "step": 1216 + }, + { + "epoch": 0.10874081354569214, + "grad_norm": 0.4938737750053406, + "learning_rate": 9.838349227141051e-05, + "loss": 1.0409, + "step": 1217 + }, + { + "epoch": 0.108830165076954, + "grad_norm": 0.48466798663139343, + "learning_rate": 9.83798404368422e-05, + "loss": 1.1215, + "step": 1218 + }, + { + "epoch": 0.10891951660821587, + "grad_norm": 0.3890851140022278, + "learning_rate": 9.837618454994657e-05, + "loss": 1.0944, + "step": 1219 + }, + { + "epoch": 0.10900886813947774, + "grad_norm": 0.4145745635032654, + "learning_rate": 9.837252461102981e-05, + "loss": 1.1748, + "step": 1220 + }, + { + "epoch": 0.10909821967073961, + "grad_norm": 0.4121316969394684, + "learning_rate": 9.836886062039853e-05, + "loss": 1.0474, + "step": 1221 + }, + { + "epoch": 0.10918757120200147, + "grad_norm": 0.3703002333641052, + "learning_rate": 9.836519257835957e-05, + "loss": 1.1139, + "step": 1222 + }, + { + "epoch": 0.10927692273326334, + "grad_norm": 0.44502654671669006, + "learning_rate": 9.836152048522022e-05, + "loss": 1.0527, + "step": 1223 + }, + { + "epoch": 0.1093662742645252, + "grad_norm": 0.41378626227378845, + "learning_rate": 9.835784434128802e-05, + "loss": 1.1426, + "step": 1224 + }, + { + "epoch": 0.10945562579578708, + "grad_norm": 0.5465456247329712, + "learning_rate": 9.83541641468709e-05, + "loss": 1.057, + "step": 1225 + }, + { + "epoch": 0.10954497732704895, + "grad_norm": 0.49641984701156616, + "learning_rate": 9.835047990227712e-05, + "loss": 1.0913, + "step": 1226 + }, + { + "epoch": 0.10963432885831081, + "grad_norm": 0.5275992155075073, + "learning_rate": 9.834679160781526e-05, + "loss": 0.987, + "step": 1227 + }, + { + "epoch": 0.10972368038957267, + "grad_norm": 0.43388912081718445, + "learning_rate": 9.834309926379426e-05, + "loss": 1.0185, + "step": 1228 + }, + { + "epoch": 0.10981303192083454, + "grad_norm": 0.5272001028060913, + "learning_rate": 9.833940287052341e-05, + "loss": 1.0355, + "step": 1229 + }, + { + "epoch": 0.10990238345209641, + "grad_norm": 0.39785364270210266, + "learning_rate": 9.833570242831229e-05, + "loss": 1.0603, + "step": 1230 + }, + { + "epoch": 0.10999173498335828, + "grad_norm": 0.5039787292480469, + "learning_rate": 9.833199793747089e-05, + "loss": 1.0754, + "step": 1231 + }, + { + "epoch": 0.11008108651462015, + "grad_norm": 0.5028790831565857, + "learning_rate": 9.832828939830947e-05, + "loss": 1.0872, + "step": 1232 + }, + { + "epoch": 0.11017043804588202, + "grad_norm": 0.4891465902328491, + "learning_rate": 9.832457681113866e-05, + "loss": 0.9673, + "step": 1233 + }, + { + "epoch": 0.11025978957714389, + "grad_norm": 0.5017561316490173, + "learning_rate": 9.832086017626947e-05, + "loss": 1.0323, + "step": 1234 + }, + { + "epoch": 0.11034914110840574, + "grad_norm": 0.41430050134658813, + "learning_rate": 9.831713949401316e-05, + "loss": 1.0685, + "step": 1235 + }, + { + "epoch": 0.11043849263966761, + "grad_norm": 0.4018838703632355, + "learning_rate": 9.831341476468139e-05, + "loss": 1.0245, + "step": 1236 + }, + { + "epoch": 0.11052784417092948, + "grad_norm": 0.48818472027778625, + "learning_rate": 9.830968598858614e-05, + "loss": 1.0407, + "step": 1237 + }, + { + "epoch": 0.11061719570219135, + "grad_norm": 0.45621582865715027, + "learning_rate": 9.830595316603976e-05, + "loss": 1.0304, + "step": 1238 + }, + { + "epoch": 0.11070654723345322, + "grad_norm": 0.5320197343826294, + "learning_rate": 9.83022162973549e-05, + "loss": 1.0693, + "step": 1239 + }, + { + "epoch": 0.11079589876471509, + "grad_norm": 0.4279773533344269, + "learning_rate": 9.829847538284455e-05, + "loss": 1.1345, + "step": 1240 + }, + { + "epoch": 0.11088525029597694, + "grad_norm": 0.4108751118183136, + "learning_rate": 9.829473042282207e-05, + "loss": 1.1014, + "step": 1241 + }, + { + "epoch": 0.11097460182723881, + "grad_norm": 0.5319353938102722, + "learning_rate": 9.829098141760111e-05, + "loss": 0.9735, + "step": 1242 + }, + { + "epoch": 0.11106395335850068, + "grad_norm": 0.42923703789711, + "learning_rate": 9.828722836749575e-05, + "loss": 1.0158, + "step": 1243 + }, + { + "epoch": 0.11115330488976255, + "grad_norm": 0.4357326924800873, + "learning_rate": 9.828347127282027e-05, + "loss": 1.0393, + "step": 1244 + }, + { + "epoch": 0.11124265642102442, + "grad_norm": 0.3578936755657196, + "learning_rate": 9.827971013388944e-05, + "loss": 1.0764, + "step": 1245 + }, + { + "epoch": 0.11133200795228629, + "grad_norm": 0.35362938046455383, + "learning_rate": 9.827594495101823e-05, + "loss": 1.1115, + "step": 1246 + }, + { + "epoch": 0.11142135948354814, + "grad_norm": 0.41567638516426086, + "learning_rate": 9.827217572452208e-05, + "loss": 1.1021, + "step": 1247 + }, + { + "epoch": 0.11151071101481001, + "grad_norm": 0.40736740827560425, + "learning_rate": 9.826840245471665e-05, + "loss": 1.03, + "step": 1248 + }, + { + "epoch": 0.11160006254607188, + "grad_norm": 0.4363911747932434, + "learning_rate": 9.826462514191801e-05, + "loss": 1.0462, + "step": 1249 + }, + { + "epoch": 0.11168941407733375, + "grad_norm": 0.4702802896499634, + "learning_rate": 9.826084378644254e-05, + "loss": 1.029, + "step": 1250 + }, + { + "epoch": 0.11177876560859562, + "grad_norm": 0.45278483629226685, + "learning_rate": 9.8257058388607e-05, + "loss": 1.1249, + "step": 1251 + }, + { + "epoch": 0.11186811713985749, + "grad_norm": 0.47644123435020447, + "learning_rate": 9.825326894872842e-05, + "loss": 1.0096, + "step": 1252 + }, + { + "epoch": 0.11195746867111935, + "grad_norm": 0.48887526988983154, + "learning_rate": 9.824947546712424e-05, + "loss": 1.0465, + "step": 1253 + }, + { + "epoch": 0.11204682020238121, + "grad_norm": 0.4377509355545044, + "learning_rate": 9.824567794411216e-05, + "loss": 1.049, + "step": 1254 + }, + { + "epoch": 0.11213617173364308, + "grad_norm": 0.5497344732284546, + "learning_rate": 9.824187638001032e-05, + "loss": 0.9813, + "step": 1255 + }, + { + "epoch": 0.11222552326490495, + "grad_norm": 0.39412158727645874, + "learning_rate": 9.82380707751371e-05, + "loss": 1.0328, + "step": 1256 + }, + { + "epoch": 0.11231487479616682, + "grad_norm": 0.5611281394958496, + "learning_rate": 9.823426112981126e-05, + "loss": 0.9994, + "step": 1257 + }, + { + "epoch": 0.11240422632742869, + "grad_norm": 0.4828655421733856, + "learning_rate": 9.823044744435193e-05, + "loss": 1.062, + "step": 1258 + }, + { + "epoch": 0.11249357785869055, + "grad_norm": 0.3836156129837036, + "learning_rate": 9.822662971907852e-05, + "loss": 1.1001, + "step": 1259 + }, + { + "epoch": 0.11258292938995242, + "grad_norm": 0.42271852493286133, + "learning_rate": 9.822280795431082e-05, + "loss": 1.0411, + "step": 1260 + }, + { + "epoch": 0.11267228092121429, + "grad_norm": 0.4254275858402252, + "learning_rate": 9.821898215036891e-05, + "loss": 1.0963, + "step": 1261 + }, + { + "epoch": 0.11276163245247615, + "grad_norm": 0.46899744868278503, + "learning_rate": 9.821515230757329e-05, + "loss": 1.054, + "step": 1262 + }, + { + "epoch": 0.11285098398373802, + "grad_norm": 0.4628446400165558, + "learning_rate": 9.821131842624471e-05, + "loss": 1.0433, + "step": 1263 + }, + { + "epoch": 0.11294033551499989, + "grad_norm": 0.4287131726741791, + "learning_rate": 9.820748050670433e-05, + "loss": 1.0086, + "step": 1264 + }, + { + "epoch": 0.11302968704626176, + "grad_norm": 0.4528293013572693, + "learning_rate": 9.820363854927362e-05, + "loss": 1.0296, + "step": 1265 + }, + { + "epoch": 0.11311903857752362, + "grad_norm": 0.5807567834854126, + "learning_rate": 9.819979255427434e-05, + "loss": 1.0242, + "step": 1266 + }, + { + "epoch": 0.11320839010878549, + "grad_norm": 0.4787788391113281, + "learning_rate": 9.819594252202866e-05, + "loss": 1.0764, + "step": 1267 + }, + { + "epoch": 0.11329774164004736, + "grad_norm": 0.4397558271884918, + "learning_rate": 9.819208845285908e-05, + "loss": 1.0668, + "step": 1268 + }, + { + "epoch": 0.11338709317130923, + "grad_norm": 0.5197393894195557, + "learning_rate": 9.81882303470884e-05, + "loss": 1.0614, + "step": 1269 + }, + { + "epoch": 0.1134764447025711, + "grad_norm": 0.4312942326068878, + "learning_rate": 9.818436820503976e-05, + "loss": 1.1124, + "step": 1270 + }, + { + "epoch": 0.11356579623383296, + "grad_norm": 0.36695683002471924, + "learning_rate": 9.818050202703668e-05, + "loss": 1.0542, + "step": 1271 + }, + { + "epoch": 0.11365514776509482, + "grad_norm": 0.47956258058547974, + "learning_rate": 9.817663181340299e-05, + "loss": 1.0063, + "step": 1272 + }, + { + "epoch": 0.11374449929635669, + "grad_norm": 0.40785545110702515, + "learning_rate": 9.817275756446287e-05, + "loss": 1.0145, + "step": 1273 + }, + { + "epoch": 0.11383385082761856, + "grad_norm": 0.5532562732696533, + "learning_rate": 9.816887928054082e-05, + "loss": 1.0415, + "step": 1274 + }, + { + "epoch": 0.11392320235888043, + "grad_norm": 0.4029194414615631, + "learning_rate": 9.816499696196167e-05, + "loss": 1.0276, + "step": 1275 + }, + { + "epoch": 0.1140125538901423, + "grad_norm": 0.38297927379608154, + "learning_rate": 9.816111060905062e-05, + "loss": 1.0914, + "step": 1276 + }, + { + "epoch": 0.11410190542140417, + "grad_norm": 0.44156894087791443, + "learning_rate": 9.815722022213322e-05, + "loss": 1.0469, + "step": 1277 + }, + { + "epoch": 0.11419125695266602, + "grad_norm": 0.44452789425849915, + "learning_rate": 9.81533258015353e-05, + "loss": 0.9441, + "step": 1278 + }, + { + "epoch": 0.11428060848392789, + "grad_norm": 0.4572380483150482, + "learning_rate": 9.814942734758306e-05, + "loss": 1.0353, + "step": 1279 + }, + { + "epoch": 0.11436996001518976, + "grad_norm": 0.39455848932266235, + "learning_rate": 9.814552486060305e-05, + "loss": 1.0873, + "step": 1280 + }, + { + "epoch": 0.11445931154645163, + "grad_norm": 0.3743246793746948, + "learning_rate": 9.814161834092212e-05, + "loss": 1.0684, + "step": 1281 + }, + { + "epoch": 0.1145486630777135, + "grad_norm": 0.4070795774459839, + "learning_rate": 9.81377077888675e-05, + "loss": 1.143, + "step": 1282 + }, + { + "epoch": 0.11463801460897537, + "grad_norm": 0.4575822353363037, + "learning_rate": 9.813379320476677e-05, + "loss": 1.0032, + "step": 1283 + }, + { + "epoch": 0.11472736614023722, + "grad_norm": 0.4624904692173004, + "learning_rate": 9.812987458894778e-05, + "loss": 1.1421, + "step": 1284 + }, + { + "epoch": 0.11481671767149909, + "grad_norm": 0.45701712369918823, + "learning_rate": 9.812595194173875e-05, + "loss": 1.0939, + "step": 1285 + }, + { + "epoch": 0.11490606920276096, + "grad_norm": 0.4664194583892822, + "learning_rate": 9.812202526346827e-05, + "loss": 1.0381, + "step": 1286 + }, + { + "epoch": 0.11499542073402283, + "grad_norm": 0.42624375224113464, + "learning_rate": 9.811809455446523e-05, + "loss": 1.0903, + "step": 1287 + }, + { + "epoch": 0.1150847722652847, + "grad_norm": 0.40351542830467224, + "learning_rate": 9.811415981505887e-05, + "loss": 1.0591, + "step": 1288 + }, + { + "epoch": 0.11517412379654657, + "grad_norm": 0.46651527285575867, + "learning_rate": 9.811022104557877e-05, + "loss": 1.0647, + "step": 1289 + }, + { + "epoch": 0.11526347532780842, + "grad_norm": 0.4483588635921478, + "learning_rate": 9.810627824635483e-05, + "loss": 1.0734, + "step": 1290 + }, + { + "epoch": 0.11535282685907029, + "grad_norm": 0.45187729597091675, + "learning_rate": 9.810233141771732e-05, + "loss": 1.0024, + "step": 1291 + }, + { + "epoch": 0.11544217839033216, + "grad_norm": 0.43645814061164856, + "learning_rate": 9.809838055999681e-05, + "loss": 1.1, + "step": 1292 + }, + { + "epoch": 0.11553152992159403, + "grad_norm": 0.42564859986305237, + "learning_rate": 9.809442567352425e-05, + "loss": 1.0762, + "step": 1293 + }, + { + "epoch": 0.1156208814528559, + "grad_norm": 0.419881135225296, + "learning_rate": 9.809046675863087e-05, + "loss": 1.1093, + "step": 1294 + }, + { + "epoch": 0.11571023298411777, + "grad_norm": 0.46777603030204773, + "learning_rate": 9.808650381564831e-05, + "loss": 1.0415, + "step": 1295 + }, + { + "epoch": 0.11579958451537964, + "grad_norm": 0.4529683589935303, + "learning_rate": 9.80825368449085e-05, + "loss": 1.0256, + "step": 1296 + }, + { + "epoch": 0.1158889360466415, + "grad_norm": 0.3957824110984802, + "learning_rate": 9.807856584674368e-05, + "loss": 1.1027, + "step": 1297 + }, + { + "epoch": 0.11597828757790336, + "grad_norm": 0.3844730854034424, + "learning_rate": 9.807459082148648e-05, + "loss": 1.0713, + "step": 1298 + }, + { + "epoch": 0.11606763910916523, + "grad_norm": 0.3445946276187897, + "learning_rate": 9.80706117694699e-05, + "loss": 1.1058, + "step": 1299 + }, + { + "epoch": 0.1161569906404271, + "grad_norm": 0.39715394377708435, + "learning_rate": 9.806662869102717e-05, + "loss": 1.0711, + "step": 1300 + }, + { + "epoch": 0.11624634217168897, + "grad_norm": 0.5070072412490845, + "learning_rate": 9.806264158649193e-05, + "loss": 1.0142, + "step": 1301 + }, + { + "epoch": 0.11633569370295084, + "grad_norm": 0.4780667722225189, + "learning_rate": 9.805865045619813e-05, + "loss": 1.0362, + "step": 1302 + }, + { + "epoch": 0.1164250452342127, + "grad_norm": 0.45890098810195923, + "learning_rate": 9.80546553004801e-05, + "loss": 0.9997, + "step": 1303 + }, + { + "epoch": 0.11651439676547456, + "grad_norm": 0.46433109045028687, + "learning_rate": 9.805065611967248e-05, + "loss": 0.9268, + "step": 1304 + }, + { + "epoch": 0.11660374829673643, + "grad_norm": 0.43207046389579773, + "learning_rate": 9.804665291411022e-05, + "loss": 1.0814, + "step": 1305 + }, + { + "epoch": 0.1166930998279983, + "grad_norm": 0.3997035622596741, + "learning_rate": 9.804264568412862e-05, + "loss": 1.0454, + "step": 1306 + }, + { + "epoch": 0.11678245135926017, + "grad_norm": 0.4062361717224121, + "learning_rate": 9.803863443006336e-05, + "loss": 1.1057, + "step": 1307 + }, + { + "epoch": 0.11687180289052204, + "grad_norm": 0.3779062032699585, + "learning_rate": 9.80346191522504e-05, + "loss": 1.1132, + "step": 1308 + }, + { + "epoch": 0.1169611544217839, + "grad_norm": 0.4400861859321594, + "learning_rate": 9.803059985102609e-05, + "loss": 1.0412, + "step": 1309 + }, + { + "epoch": 0.11705050595304577, + "grad_norm": 0.48002249002456665, + "learning_rate": 9.802657652672706e-05, + "loss": 0.9414, + "step": 1310 + }, + { + "epoch": 0.11713985748430764, + "grad_norm": 0.5008269548416138, + "learning_rate": 9.802254917969032e-05, + "loss": 1.0162, + "step": 1311 + }, + { + "epoch": 0.1172292090155695, + "grad_norm": 0.4323752224445343, + "learning_rate": 9.801851781025322e-05, + "loss": 1.1209, + "step": 1312 + }, + { + "epoch": 0.11731856054683137, + "grad_norm": 0.5104453563690186, + "learning_rate": 9.80144824187534e-05, + "loss": 0.9974, + "step": 1313 + }, + { + "epoch": 0.11740791207809324, + "grad_norm": 0.4443637430667877, + "learning_rate": 9.801044300552887e-05, + "loss": 1.0159, + "step": 1314 + }, + { + "epoch": 0.1174972636093551, + "grad_norm": 0.44141674041748047, + "learning_rate": 9.800639957091799e-05, + "loss": 1.0203, + "step": 1315 + }, + { + "epoch": 0.11758661514061697, + "grad_norm": 0.44127362966537476, + "learning_rate": 9.800235211525945e-05, + "loss": 1.1218, + "step": 1316 + }, + { + "epoch": 0.11767596667187884, + "grad_norm": 0.401253342628479, + "learning_rate": 9.799830063889223e-05, + "loss": 1.1097, + "step": 1317 + }, + { + "epoch": 0.1177653182031407, + "grad_norm": 0.5445295572280884, + "learning_rate": 9.799424514215572e-05, + "loss": 1.0315, + "step": 1318 + }, + { + "epoch": 0.11785466973440258, + "grad_norm": 0.3961075246334076, + "learning_rate": 9.79901856253896e-05, + "loss": 1.0794, + "step": 1319 + }, + { + "epoch": 0.11794402126566444, + "grad_norm": 0.4312506318092346, + "learning_rate": 9.798612208893389e-05, + "loss": 1.0732, + "step": 1320 + }, + { + "epoch": 0.11803337279692631, + "grad_norm": 0.5154565572738647, + "learning_rate": 9.798205453312895e-05, + "loss": 1.1045, + "step": 1321 + }, + { + "epoch": 0.11812272432818817, + "grad_norm": 0.4929812550544739, + "learning_rate": 9.79779829583155e-05, + "loss": 1.009, + "step": 1322 + }, + { + "epoch": 0.11821207585945004, + "grad_norm": 0.39952999353408813, + "learning_rate": 9.797390736483459e-05, + "loss": 1.0558, + "step": 1323 + }, + { + "epoch": 0.11830142739071191, + "grad_norm": 0.47065678238868713, + "learning_rate": 9.796982775302755e-05, + "loss": 1.0119, + "step": 1324 + }, + { + "epoch": 0.11839077892197378, + "grad_norm": 0.39249664545059204, + "learning_rate": 9.796574412323611e-05, + "loss": 1.0805, + "step": 1325 + }, + { + "epoch": 0.11848013045323565, + "grad_norm": 0.38709941506385803, + "learning_rate": 9.796165647580233e-05, + "loss": 1.0919, + "step": 1326 + }, + { + "epoch": 0.11856948198449752, + "grad_norm": 0.4576520323753357, + "learning_rate": 9.795756481106857e-05, + "loss": 1.082, + "step": 1327 + }, + { + "epoch": 0.11865883351575937, + "grad_norm": 0.42923304438591003, + "learning_rate": 9.795346912937757e-05, + "loss": 0.9918, + "step": 1328 + }, + { + "epoch": 0.11874818504702124, + "grad_norm": 0.38031134009361267, + "learning_rate": 9.79493694310724e-05, + "loss": 1.0198, + "step": 1329 + }, + { + "epoch": 0.11883753657828311, + "grad_norm": 0.39930686354637146, + "learning_rate": 9.794526571649643e-05, + "loss": 1.0082, + "step": 1330 + }, + { + "epoch": 0.11892688810954498, + "grad_norm": 0.463544636964798, + "learning_rate": 9.794115798599339e-05, + "loss": 1.1003, + "step": 1331 + }, + { + "epoch": 0.11901623964080685, + "grad_norm": 0.42002353072166443, + "learning_rate": 9.793704623990736e-05, + "loss": 1.0518, + "step": 1332 + }, + { + "epoch": 0.11910559117206872, + "grad_norm": 0.39078399538993835, + "learning_rate": 9.793293047858274e-05, + "loss": 1.0775, + "step": 1333 + }, + { + "epoch": 0.11919494270333057, + "grad_norm": 0.44137701392173767, + "learning_rate": 9.792881070236426e-05, + "loss": 1.0588, + "step": 1334 + }, + { + "epoch": 0.11928429423459244, + "grad_norm": 0.40128040313720703, + "learning_rate": 9.7924686911597e-05, + "loss": 1.0676, + "step": 1335 + }, + { + "epoch": 0.11937364576585431, + "grad_norm": 0.46440789103507996, + "learning_rate": 9.792055910662636e-05, + "loss": 1.1613, + "step": 1336 + }, + { + "epoch": 0.11946299729711618, + "grad_norm": 0.36990535259246826, + "learning_rate": 9.791642728779811e-05, + "loss": 1.0511, + "step": 1337 + }, + { + "epoch": 0.11955234882837805, + "grad_norm": 0.47823598980903625, + "learning_rate": 9.791229145545831e-05, + "loss": 0.9977, + "step": 1338 + }, + { + "epoch": 0.11964170035963992, + "grad_norm": 0.3946368992328644, + "learning_rate": 9.790815160995342e-05, + "loss": 1.0871, + "step": 1339 + }, + { + "epoch": 0.11973105189090177, + "grad_norm": 0.5002843141555786, + "learning_rate": 9.790400775163014e-05, + "loss": 0.9361, + "step": 1340 + }, + { + "epoch": 0.11982040342216364, + "grad_norm": 0.4414635896682739, + "learning_rate": 9.789985988083558e-05, + "loss": 1.0857, + "step": 1341 + }, + { + "epoch": 0.11990975495342551, + "grad_norm": 0.4567340314388275, + "learning_rate": 9.789570799791721e-05, + "loss": 0.9954, + "step": 1342 + }, + { + "epoch": 0.11999910648468738, + "grad_norm": 0.46271204948425293, + "learning_rate": 9.789155210322276e-05, + "loss": 0.9978, + "step": 1343 + }, + { + "epoch": 0.12008845801594925, + "grad_norm": 0.45726093649864197, + "learning_rate": 9.788739219710032e-05, + "loss": 1.0101, + "step": 1344 + }, + { + "epoch": 0.12017780954721112, + "grad_norm": 0.4213901162147522, + "learning_rate": 9.788322827989836e-05, + "loss": 1.061, + "step": 1345 + }, + { + "epoch": 0.12026716107847298, + "grad_norm": 0.5426849126815796, + "learning_rate": 9.787906035196562e-05, + "loss": 1.0625, + "step": 1346 + }, + { + "epoch": 0.12035651260973484, + "grad_norm": 0.4493926167488098, + "learning_rate": 9.787488841365122e-05, + "loss": 1.021, + "step": 1347 + }, + { + "epoch": 0.12044586414099671, + "grad_norm": 0.47702398896217346, + "learning_rate": 9.787071246530459e-05, + "loss": 1.0161, + "step": 1348 + }, + { + "epoch": 0.12053521567225858, + "grad_norm": 0.5239232778549194, + "learning_rate": 9.786653250727555e-05, + "loss": 1.0321, + "step": 1349 + }, + { + "epoch": 0.12062456720352045, + "grad_norm": 0.4591730237007141, + "learning_rate": 9.786234853991418e-05, + "loss": 0.9884, + "step": 1350 + }, + { + "epoch": 0.12071391873478232, + "grad_norm": 0.39837637543678284, + "learning_rate": 9.785816056357095e-05, + "loss": 1.0764, + "step": 1351 + }, + { + "epoch": 0.12080327026604419, + "grad_norm": 0.42193183302879333, + "learning_rate": 9.785396857859664e-05, + "loss": 1.0636, + "step": 1352 + }, + { + "epoch": 0.12089262179730605, + "grad_norm": 0.48460525274276733, + "learning_rate": 9.784977258534239e-05, + "loss": 1.0683, + "step": 1353 + }, + { + "epoch": 0.12098197332856792, + "grad_norm": 0.45954951643943787, + "learning_rate": 9.784557258415963e-05, + "loss": 1.0613, + "step": 1354 + }, + { + "epoch": 0.12107132485982978, + "grad_norm": 0.4340897500514984, + "learning_rate": 9.784136857540015e-05, + "loss": 0.9937, + "step": 1355 + }, + { + "epoch": 0.12116067639109165, + "grad_norm": 0.39842215180397034, + "learning_rate": 9.783716055941612e-05, + "loss": 1.0619, + "step": 1356 + }, + { + "epoch": 0.12125002792235352, + "grad_norm": 0.38093817234039307, + "learning_rate": 9.783294853655999e-05, + "loss": 1.0357, + "step": 1357 + }, + { + "epoch": 0.12133937945361539, + "grad_norm": 0.5032150149345398, + "learning_rate": 9.782873250718455e-05, + "loss": 0.914, + "step": 1358 + }, + { + "epoch": 0.12142873098487725, + "grad_norm": 0.4284474551677704, + "learning_rate": 9.782451247164295e-05, + "loss": 1.0058, + "step": 1359 + }, + { + "epoch": 0.12151808251613912, + "grad_norm": 0.40385428071022034, + "learning_rate": 9.782028843028865e-05, + "loss": 1.0313, + "step": 1360 + }, + { + "epoch": 0.12160743404740099, + "grad_norm": 0.48033663630485535, + "learning_rate": 9.781606038347547e-05, + "loss": 1.005, + "step": 1361 + }, + { + "epoch": 0.12169678557866286, + "grad_norm": 0.3936779201030731, + "learning_rate": 9.781182833155755e-05, + "loss": 1.0181, + "step": 1362 + }, + { + "epoch": 0.12178613710992472, + "grad_norm": 0.46711423993110657, + "learning_rate": 9.780759227488936e-05, + "loss": 1.0816, + "step": 1363 + }, + { + "epoch": 0.1218754886411866, + "grad_norm": 0.43602418899536133, + "learning_rate": 9.780335221382574e-05, + "loss": 1.1094, + "step": 1364 + }, + { + "epoch": 0.12196484017244845, + "grad_norm": 0.3990572690963745, + "learning_rate": 9.779910814872182e-05, + "loss": 0.9995, + "step": 1365 + }, + { + "epoch": 0.12205419170371032, + "grad_norm": 0.5048510432243347, + "learning_rate": 9.77948600799331e-05, + "loss": 1.0082, + "step": 1366 + }, + { + "epoch": 0.12214354323497219, + "grad_norm": 0.3975992202758789, + "learning_rate": 9.779060800781537e-05, + "loss": 1.0578, + "step": 1367 + }, + { + "epoch": 0.12223289476623406, + "grad_norm": 0.4395469129085541, + "learning_rate": 9.778635193272483e-05, + "loss": 1.0318, + "step": 1368 + }, + { + "epoch": 0.12232224629749593, + "grad_norm": 0.4118629992008209, + "learning_rate": 9.778209185501794e-05, + "loss": 1.0436, + "step": 1369 + }, + { + "epoch": 0.1224115978287578, + "grad_norm": 0.4724205732345581, + "learning_rate": 9.777782777505153e-05, + "loss": 0.9656, + "step": 1370 + }, + { + "epoch": 0.12250094936001965, + "grad_norm": 0.49588900804519653, + "learning_rate": 9.777355969318278e-05, + "loss": 0.9988, + "step": 1371 + }, + { + "epoch": 0.12259030089128152, + "grad_norm": 0.4462149441242218, + "learning_rate": 9.776928760976918e-05, + "loss": 1.0914, + "step": 1372 + }, + { + "epoch": 0.12267965242254339, + "grad_norm": 0.3685830533504486, + "learning_rate": 9.776501152516855e-05, + "loss": 1.0973, + "step": 1373 + }, + { + "epoch": 0.12276900395380526, + "grad_norm": 0.5166415572166443, + "learning_rate": 9.776073143973904e-05, + "loss": 0.9822, + "step": 1374 + }, + { + "epoch": 0.12285835548506713, + "grad_norm": 0.4535563588142395, + "learning_rate": 9.775644735383922e-05, + "loss": 1.0124, + "step": 1375 + }, + { + "epoch": 0.122947707016329, + "grad_norm": 0.40610021352767944, + "learning_rate": 9.775215926782788e-05, + "loss": 1.0909, + "step": 1376 + }, + { + "epoch": 0.12303705854759087, + "grad_norm": 0.4109049141407013, + "learning_rate": 9.774786718206419e-05, + "loss": 1.0908, + "step": 1377 + }, + { + "epoch": 0.12312641007885272, + "grad_norm": 0.4295797348022461, + "learning_rate": 9.774357109690767e-05, + "loss": 1.0351, + "step": 1378 + }, + { + "epoch": 0.12321576161011459, + "grad_norm": 0.49584078788757324, + "learning_rate": 9.773927101271816e-05, + "loss": 1.0216, + "step": 1379 + }, + { + "epoch": 0.12330511314137646, + "grad_norm": 0.44206714630126953, + "learning_rate": 9.773496692985584e-05, + "loss": 0.9854, + "step": 1380 + }, + { + "epoch": 0.12339446467263833, + "grad_norm": 0.39388856291770935, + "learning_rate": 9.773065884868122e-05, + "loss": 1.0904, + "step": 1381 + }, + { + "epoch": 0.1234838162039002, + "grad_norm": 0.4499436914920807, + "learning_rate": 9.772634676955515e-05, + "loss": 1.0563, + "step": 1382 + }, + { + "epoch": 0.12357316773516207, + "grad_norm": 0.48682132363319397, + "learning_rate": 9.772203069283881e-05, + "loss": 1.0727, + "step": 1383 + }, + { + "epoch": 0.12366251926642392, + "grad_norm": 0.38047704100608826, + "learning_rate": 9.771771061889373e-05, + "loss": 1.0871, + "step": 1384 + }, + { + "epoch": 0.12375187079768579, + "grad_norm": 0.46172595024108887, + "learning_rate": 9.771338654808173e-05, + "loss": 1.0829, + "step": 1385 + }, + { + "epoch": 0.12384122232894766, + "grad_norm": 0.44145411252975464, + "learning_rate": 9.770905848076504e-05, + "loss": 1.0631, + "step": 1386 + }, + { + "epoch": 0.12393057386020953, + "grad_norm": 0.4803364872932434, + "learning_rate": 9.770472641730615e-05, + "loss": 0.9805, + "step": 1387 + }, + { + "epoch": 0.1240199253914714, + "grad_norm": 0.4117061197757721, + "learning_rate": 9.770039035806792e-05, + "loss": 1.023, + "step": 1388 + }, + { + "epoch": 0.12410927692273327, + "grad_norm": 0.4862026572227478, + "learning_rate": 9.769605030341357e-05, + "loss": 1.0725, + "step": 1389 + }, + { + "epoch": 0.12419862845399512, + "grad_norm": 0.4829283654689789, + "learning_rate": 9.769170625370658e-05, + "loss": 0.993, + "step": 1390 + }, + { + "epoch": 0.124287979985257, + "grad_norm": 0.4735516309738159, + "learning_rate": 9.768735820931085e-05, + "loss": 1.059, + "step": 1391 + }, + { + "epoch": 0.12437733151651886, + "grad_norm": 0.4585529565811157, + "learning_rate": 9.768300617059055e-05, + "loss": 1.0455, + "step": 1392 + }, + { + "epoch": 0.12446668304778073, + "grad_norm": 0.40741658210754395, + "learning_rate": 9.767865013791022e-05, + "loss": 0.9718, + "step": 1393 + }, + { + "epoch": 0.1245560345790426, + "grad_norm": 0.4379764497280121, + "learning_rate": 9.767429011163473e-05, + "loss": 1.0147, + "step": 1394 + }, + { + "epoch": 0.12464538611030447, + "grad_norm": 0.46133920550346375, + "learning_rate": 9.766992609212926e-05, + "loss": 1.037, + "step": 1395 + }, + { + "epoch": 0.12473473764156633, + "grad_norm": 0.4180471897125244, + "learning_rate": 9.766555807975936e-05, + "loss": 1.0846, + "step": 1396 + }, + { + "epoch": 0.1248240891728282, + "grad_norm": 0.5262375473976135, + "learning_rate": 9.76611860748909e-05, + "loss": 1.0615, + "step": 1397 + }, + { + "epoch": 0.12491344070409006, + "grad_norm": 0.4973163604736328, + "learning_rate": 9.765681007789008e-05, + "loss": 1.0329, + "step": 1398 + }, + { + "epoch": 0.12500279223535193, + "grad_norm": 0.48663395643234253, + "learning_rate": 9.765243008912342e-05, + "loss": 1.011, + "step": 1399 + }, + { + "epoch": 0.1250921437666138, + "grad_norm": 0.52272629737854, + "learning_rate": 9.76480461089578e-05, + "loss": 1.0271, + "step": 1400 + }, + { + "epoch": 0.12518149529787567, + "grad_norm": 0.5239830613136292, + "learning_rate": 9.764365813776042e-05, + "loss": 1.061, + "step": 1401 + }, + { + "epoch": 0.12527084682913753, + "grad_norm": 0.44980114698410034, + "learning_rate": 9.763926617589883e-05, + "loss": 1.0671, + "step": 1402 + }, + { + "epoch": 0.1253601983603994, + "grad_norm": 0.4098733961582184, + "learning_rate": 9.763487022374092e-05, + "loss": 0.9897, + "step": 1403 + }, + { + "epoch": 0.12544954989166127, + "grad_norm": 0.360730916261673, + "learning_rate": 9.763047028165484e-05, + "loss": 1.0978, + "step": 1404 + }, + { + "epoch": 0.12553890142292312, + "grad_norm": 0.518429160118103, + "learning_rate": 9.762606635000919e-05, + "loss": 0.9659, + "step": 1405 + }, + { + "epoch": 0.125628252954185, + "grad_norm": 0.4658845365047455, + "learning_rate": 9.762165842917283e-05, + "loss": 1.0005, + "step": 1406 + }, + { + "epoch": 0.12571760448544686, + "grad_norm": 0.3965921700000763, + "learning_rate": 9.761724651951498e-05, + "loss": 1.029, + "step": 1407 + }, + { + "epoch": 0.12580695601670874, + "grad_norm": 0.4184513986110687, + "learning_rate": 9.761283062140514e-05, + "loss": 0.9864, + "step": 1408 + }, + { + "epoch": 0.1258963075479706, + "grad_norm": 0.37843790650367737, + "learning_rate": 9.760841073521323e-05, + "loss": 1.1095, + "step": 1409 + }, + { + "epoch": 0.12598565907923248, + "grad_norm": 0.4006551504135132, + "learning_rate": 9.760398686130946e-05, + "loss": 1.0732, + "step": 1410 + }, + { + "epoch": 0.12607501061049434, + "grad_norm": 0.407956600189209, + "learning_rate": 9.759955900006436e-05, + "loss": 1.064, + "step": 1411 + }, + { + "epoch": 0.1261643621417562, + "grad_norm": 0.42004308104515076, + "learning_rate": 9.759512715184881e-05, + "loss": 1.0558, + "step": 1412 + }, + { + "epoch": 0.12625371367301808, + "grad_norm": 0.411745548248291, + "learning_rate": 9.759069131703406e-05, + "loss": 1.0318, + "step": 1413 + }, + { + "epoch": 0.12634306520427993, + "grad_norm": 0.40602388978004456, + "learning_rate": 9.75862514959916e-05, + "loss": 1.0533, + "step": 1414 + }, + { + "epoch": 0.1264324167355418, + "grad_norm": 0.4616459906101227, + "learning_rate": 9.758180768909337e-05, + "loss": 1.0691, + "step": 1415 + }, + { + "epoch": 0.12652176826680367, + "grad_norm": 0.5043428540229797, + "learning_rate": 9.757735989671156e-05, + "loss": 1.0047, + "step": 1416 + }, + { + "epoch": 0.12661111979806555, + "grad_norm": 0.4009447693824768, + "learning_rate": 9.75729081192187e-05, + "loss": 1.0422, + "step": 1417 + }, + { + "epoch": 0.1267004713293274, + "grad_norm": 0.3939031660556793, + "learning_rate": 9.756845235698772e-05, + "loss": 1.0726, + "step": 1418 + }, + { + "epoch": 0.12678982286058926, + "grad_norm": 0.42009732127189636, + "learning_rate": 9.756399261039179e-05, + "loss": 1.0992, + "step": 1419 + }, + { + "epoch": 0.12687917439185115, + "grad_norm": 0.475175142288208, + "learning_rate": 9.75595288798045e-05, + "loss": 1.079, + "step": 1420 + }, + { + "epoch": 0.126968525923113, + "grad_norm": 0.42921051383018494, + "learning_rate": 9.755506116559971e-05, + "loss": 0.999, + "step": 1421 + }, + { + "epoch": 0.12705787745437488, + "grad_norm": 0.4029146730899811, + "learning_rate": 9.755058946815164e-05, + "loss": 1.0903, + "step": 1422 + }, + { + "epoch": 0.12714722898563674, + "grad_norm": 0.4076593518257141, + "learning_rate": 9.754611378783486e-05, + "loss": 1.1189, + "step": 1423 + }, + { + "epoch": 0.1272365805168986, + "grad_norm": 0.41739514470100403, + "learning_rate": 9.754163412502424e-05, + "loss": 1.0114, + "step": 1424 + }, + { + "epoch": 0.12732593204816048, + "grad_norm": 0.4175979197025299, + "learning_rate": 9.7537150480095e-05, + "loss": 1.0852, + "step": 1425 + }, + { + "epoch": 0.12741528357942233, + "grad_norm": 0.5235275030136108, + "learning_rate": 9.75326628534227e-05, + "loss": 1.0744, + "step": 1426 + }, + { + "epoch": 0.12750463511068422, + "grad_norm": 0.41696855425834656, + "learning_rate": 9.752817124538324e-05, + "loss": 1.0345, + "step": 1427 + }, + { + "epoch": 0.12759398664194607, + "grad_norm": 0.4305160641670227, + "learning_rate": 9.752367565635281e-05, + "loss": 1.0633, + "step": 1428 + }, + { + "epoch": 0.12768333817320796, + "grad_norm": 0.4274474084377289, + "learning_rate": 9.751917608670797e-05, + "loss": 1.0398, + "step": 1429 + }, + { + "epoch": 0.1277726897044698, + "grad_norm": 0.41223591566085815, + "learning_rate": 9.751467253682563e-05, + "loss": 1.0496, + "step": 1430 + }, + { + "epoch": 0.12786204123573167, + "grad_norm": 0.4205789566040039, + "learning_rate": 9.751016500708298e-05, + "loss": 0.9926, + "step": 1431 + }, + { + "epoch": 0.12795139276699355, + "grad_norm": 0.3847178816795349, + "learning_rate": 9.75056534978576e-05, + "loss": 1.0912, + "step": 1432 + }, + { + "epoch": 0.1280407442982554, + "grad_norm": 0.3665478527545929, + "learning_rate": 9.750113800952738e-05, + "loss": 1.0538, + "step": 1433 + }, + { + "epoch": 0.1281300958295173, + "grad_norm": 0.4411013126373291, + "learning_rate": 9.74966185424705e-05, + "loss": 1.0351, + "step": 1434 + }, + { + "epoch": 0.12821944736077914, + "grad_norm": 0.5092450380325317, + "learning_rate": 9.749209509706555e-05, + "loss": 1.0013, + "step": 1435 + }, + { + "epoch": 0.12830879889204103, + "grad_norm": 0.39857053756713867, + "learning_rate": 9.74875676736914e-05, + "loss": 1.0148, + "step": 1436 + }, + { + "epoch": 0.12839815042330288, + "grad_norm": 0.4596467912197113, + "learning_rate": 9.74830362727273e-05, + "loss": 1.0708, + "step": 1437 + }, + { + "epoch": 0.12848750195456474, + "grad_norm": 0.4768296182155609, + "learning_rate": 9.747850089455275e-05, + "loss": 1.0696, + "step": 1438 + }, + { + "epoch": 0.12857685348582662, + "grad_norm": 0.39800745248794556, + "learning_rate": 9.747396153954767e-05, + "loss": 1.0498, + "step": 1439 + }, + { + "epoch": 0.12866620501708848, + "grad_norm": 0.36220765113830566, + "learning_rate": 9.746941820809229e-05, + "loss": 1.0571, + "step": 1440 + }, + { + "epoch": 0.12875555654835036, + "grad_norm": 0.4107576012611389, + "learning_rate": 9.746487090056713e-05, + "loss": 1.0934, + "step": 1441 + }, + { + "epoch": 0.1288449080796122, + "grad_norm": 0.5009557604789734, + "learning_rate": 9.746031961735311e-05, + "loss": 1.021, + "step": 1442 + }, + { + "epoch": 0.12893425961087407, + "grad_norm": 0.43069615960121155, + "learning_rate": 9.745576435883142e-05, + "loss": 1.0339, + "step": 1443 + }, + { + "epoch": 0.12902361114213595, + "grad_norm": 0.515500009059906, + "learning_rate": 9.745120512538362e-05, + "loss": 0.9844, + "step": 1444 + }, + { + "epoch": 0.1291129626733978, + "grad_norm": 0.49051618576049805, + "learning_rate": 9.744664191739161e-05, + "loss": 1.0346, + "step": 1445 + }, + { + "epoch": 0.1292023142046597, + "grad_norm": 0.4764910042285919, + "learning_rate": 9.744207473523759e-05, + "loss": 1.0409, + "step": 1446 + }, + { + "epoch": 0.12929166573592155, + "grad_norm": 0.37674903869628906, + "learning_rate": 9.74375035793041e-05, + "loss": 1.0546, + "step": 1447 + }, + { + "epoch": 0.12938101726718343, + "grad_norm": 0.524577796459198, + "learning_rate": 9.743292844997407e-05, + "loss": 1.0141, + "step": 1448 + }, + { + "epoch": 0.12947036879844528, + "grad_norm": 0.43543311953544617, + "learning_rate": 9.742834934763066e-05, + "loss": 0.9927, + "step": 1449 + }, + { + "epoch": 0.12955972032970714, + "grad_norm": 0.4244259297847748, + "learning_rate": 9.742376627265745e-05, + "loss": 1.0312, + "step": 1450 + }, + { + "epoch": 0.12964907186096902, + "grad_norm": 0.41922134160995483, + "learning_rate": 9.74191792254383e-05, + "loss": 1.0319, + "step": 1451 + }, + { + "epoch": 0.12973842339223088, + "grad_norm": 0.38788026571273804, + "learning_rate": 9.741458820635745e-05, + "loss": 1.1334, + "step": 1452 + }, + { + "epoch": 0.12982777492349276, + "grad_norm": 0.42001378536224365, + "learning_rate": 9.740999321579943e-05, + "loss": 1.0905, + "step": 1453 + }, + { + "epoch": 0.12991712645475462, + "grad_norm": 0.41198793053627014, + "learning_rate": 9.740539425414912e-05, + "loss": 1.0529, + "step": 1454 + }, + { + "epoch": 0.13000647798601647, + "grad_norm": 0.36228737235069275, + "learning_rate": 9.740079132179175e-05, + "loss": 1.0103, + "step": 1455 + }, + { + "epoch": 0.13009582951727836, + "grad_norm": 0.48605382442474365, + "learning_rate": 9.739618441911285e-05, + "loss": 1.0234, + "step": 1456 + }, + { + "epoch": 0.1301851810485402, + "grad_norm": 0.49182093143463135, + "learning_rate": 9.739157354649829e-05, + "loss": 0.9932, + "step": 1457 + }, + { + "epoch": 0.1302745325798021, + "grad_norm": 0.3926076591014862, + "learning_rate": 9.738695870433428e-05, + "loss": 1.0766, + "step": 1458 + }, + { + "epoch": 0.13036388411106395, + "grad_norm": 0.5217881202697754, + "learning_rate": 9.738233989300739e-05, + "loss": 1.0239, + "step": 1459 + }, + { + "epoch": 0.13045323564232583, + "grad_norm": 0.4107765555381775, + "learning_rate": 9.737771711290447e-05, + "loss": 1.0517, + "step": 1460 + }, + { + "epoch": 0.1305425871735877, + "grad_norm": 0.48638463020324707, + "learning_rate": 9.737309036441271e-05, + "loss": 0.9408, + "step": 1461 + }, + { + "epoch": 0.13063193870484954, + "grad_norm": 0.4553143084049225, + "learning_rate": 9.736845964791968e-05, + "loss": 1.0842, + "step": 1462 + }, + { + "epoch": 0.13072129023611143, + "grad_norm": 0.42393583059310913, + "learning_rate": 9.736382496381325e-05, + "loss": 1.0693, + "step": 1463 + }, + { + "epoch": 0.13081064176737328, + "grad_norm": 0.43372994661331177, + "learning_rate": 9.735918631248162e-05, + "loss": 1.0923, + "step": 1464 + }, + { + "epoch": 0.13089999329863516, + "grad_norm": 0.42476969957351685, + "learning_rate": 9.735454369431332e-05, + "loss": 1.1099, + "step": 1465 + }, + { + "epoch": 0.13098934482989702, + "grad_norm": 0.4615803360939026, + "learning_rate": 9.734989710969722e-05, + "loss": 1.0546, + "step": 1466 + }, + { + "epoch": 0.1310786963611589, + "grad_norm": 0.3829108774662018, + "learning_rate": 9.734524655902253e-05, + "loss": 1.0674, + "step": 1467 + }, + { + "epoch": 0.13116804789242076, + "grad_norm": 0.4632830321788788, + "learning_rate": 9.734059204267878e-05, + "loss": 1.0149, + "step": 1468 + }, + { + "epoch": 0.1312573994236826, + "grad_norm": 0.4512673020362854, + "learning_rate": 9.733593356105581e-05, + "loss": 1.018, + "step": 1469 + }, + { + "epoch": 0.1313467509549445, + "grad_norm": 0.5254853963851929, + "learning_rate": 9.733127111454385e-05, + "loss": 0.9973, + "step": 1470 + }, + { + "epoch": 0.13143610248620635, + "grad_norm": 0.4638168215751648, + "learning_rate": 9.732660470353343e-05, + "loss": 1.0137, + "step": 1471 + }, + { + "epoch": 0.13152545401746824, + "grad_norm": 0.40837833285331726, + "learning_rate": 9.732193432841539e-05, + "loss": 1.1323, + "step": 1472 + }, + { + "epoch": 0.1316148055487301, + "grad_norm": 0.39538437128067017, + "learning_rate": 9.731725998958095e-05, + "loss": 1.0323, + "step": 1473 + }, + { + "epoch": 0.13170415707999195, + "grad_norm": 0.429143488407135, + "learning_rate": 9.73125816874216e-05, + "loss": 1.0897, + "step": 1474 + }, + { + "epoch": 0.13179350861125383, + "grad_norm": 0.48212477564811707, + "learning_rate": 9.730789942232923e-05, + "loss": 0.9875, + "step": 1475 + }, + { + "epoch": 0.13188286014251568, + "grad_norm": 0.37595105171203613, + "learning_rate": 9.730321319469601e-05, + "loss": 1.0909, + "step": 1476 + }, + { + "epoch": 0.13197221167377757, + "grad_norm": 0.4069534242153168, + "learning_rate": 9.729852300491447e-05, + "loss": 1.0292, + "step": 1477 + }, + { + "epoch": 0.13206156320503942, + "grad_norm": 0.4816598892211914, + "learning_rate": 9.729382885337747e-05, + "loss": 1.0329, + "step": 1478 + }, + { + "epoch": 0.1321509147363013, + "grad_norm": 0.4182417392730713, + "learning_rate": 9.728913074047819e-05, + "loss": 1.0831, + "step": 1479 + }, + { + "epoch": 0.13224026626756316, + "grad_norm": 0.4294276535511017, + "learning_rate": 9.728442866661013e-05, + "loss": 1.0979, + "step": 1480 + }, + { + "epoch": 0.13232961779882502, + "grad_norm": 0.4855419099330902, + "learning_rate": 9.727972263216716e-05, + "loss": 0.994, + "step": 1481 + }, + { + "epoch": 0.1324189693300869, + "grad_norm": 0.3743133544921875, + "learning_rate": 9.727501263754346e-05, + "loss": 1.1104, + "step": 1482 + }, + { + "epoch": 0.13250832086134876, + "grad_norm": 0.47284814715385437, + "learning_rate": 9.727029868313352e-05, + "loss": 1.0346, + "step": 1483 + }, + { + "epoch": 0.13259767239261064, + "grad_norm": 0.4814388155937195, + "learning_rate": 9.726558076933221e-05, + "loss": 1.0394, + "step": 1484 + }, + { + "epoch": 0.1326870239238725, + "grad_norm": 0.48564615845680237, + "learning_rate": 9.726085889653469e-05, + "loss": 1.0232, + "step": 1485 + }, + { + "epoch": 0.13277637545513435, + "grad_norm": 0.4509584605693817, + "learning_rate": 9.725613306513648e-05, + "loss": 1.0218, + "step": 1486 + }, + { + "epoch": 0.13286572698639623, + "grad_norm": 0.45788347721099854, + "learning_rate": 9.725140327553342e-05, + "loss": 1.0633, + "step": 1487 + }, + { + "epoch": 0.1329550785176581, + "grad_norm": 0.5591961741447449, + "learning_rate": 9.724666952812166e-05, + "loss": 1.0504, + "step": 1488 + }, + { + "epoch": 0.13304443004891997, + "grad_norm": 0.49788787961006165, + "learning_rate": 9.724193182329772e-05, + "loss": 0.9796, + "step": 1489 + }, + { + "epoch": 0.13313378158018183, + "grad_norm": 0.5562415719032288, + "learning_rate": 9.723719016145843e-05, + "loss": 0.9633, + "step": 1490 + }, + { + "epoch": 0.1332231331114437, + "grad_norm": 0.49074721336364746, + "learning_rate": 9.723244454300093e-05, + "loss": 1.0881, + "step": 1491 + }, + { + "epoch": 0.13331248464270556, + "grad_norm": 0.42609742283821106, + "learning_rate": 9.722769496832275e-05, + "loss": 1.0958, + "step": 1492 + }, + { + "epoch": 0.13340183617396742, + "grad_norm": 0.5812770128250122, + "learning_rate": 9.722294143782171e-05, + "loss": 1.1263, + "step": 1493 + }, + { + "epoch": 0.1334911877052293, + "grad_norm": 0.4071921408176422, + "learning_rate": 9.721818395189597e-05, + "loss": 1.0965, + "step": 1494 + }, + { + "epoch": 0.13358053923649116, + "grad_norm": 0.4325934648513794, + "learning_rate": 9.7213422510944e-05, + "loss": 1.0716, + "step": 1495 + }, + { + "epoch": 0.13366989076775304, + "grad_norm": 0.4646300673484802, + "learning_rate": 9.720865711536464e-05, + "loss": 1.0785, + "step": 1496 + }, + { + "epoch": 0.1337592422990149, + "grad_norm": 0.4847221076488495, + "learning_rate": 9.720388776555704e-05, + "loss": 1.0767, + "step": 1497 + }, + { + "epoch": 0.13384859383027678, + "grad_norm": 0.3830418884754181, + "learning_rate": 9.71991144619207e-05, + "loss": 1.0585, + "step": 1498 + }, + { + "epoch": 0.13393794536153864, + "grad_norm": 0.4215502142906189, + "learning_rate": 9.719433720485539e-05, + "loss": 1.0535, + "step": 1499 + }, + { + "epoch": 0.1340272968928005, + "grad_norm": 0.4030102789402008, + "learning_rate": 9.718955599476129e-05, + "loss": 1.0467, + "step": 1500 + }, + { + "epoch": 0.13411664842406237, + "grad_norm": 0.4001905024051666, + "learning_rate": 9.718477083203887e-05, + "loss": 1.0354, + "step": 1501 + }, + { + "epoch": 0.13420599995532423, + "grad_norm": 0.6067075133323669, + "learning_rate": 9.717998171708895e-05, + "loss": 0.9813, + "step": 1502 + }, + { + "epoch": 0.1342953514865861, + "grad_norm": 0.4722452163696289, + "learning_rate": 9.717518865031266e-05, + "loss": 1.0531, + "step": 1503 + }, + { + "epoch": 0.13438470301784797, + "grad_norm": 0.45017173886299133, + "learning_rate": 9.717039163211146e-05, + "loss": 0.9219, + "step": 1504 + }, + { + "epoch": 0.13447405454910982, + "grad_norm": 0.5280361771583557, + "learning_rate": 9.716559066288715e-05, + "loss": 0.9697, + "step": 1505 + }, + { + "epoch": 0.1345634060803717, + "grad_norm": 0.4430614709854126, + "learning_rate": 9.716078574304189e-05, + "loss": 1.0229, + "step": 1506 + }, + { + "epoch": 0.13465275761163356, + "grad_norm": 0.4364091157913208, + "learning_rate": 9.715597687297813e-05, + "loss": 1.0449, + "step": 1507 + }, + { + "epoch": 0.13474210914289544, + "grad_norm": 0.43157270550727844, + "learning_rate": 9.715116405309865e-05, + "loss": 1.0936, + "step": 1508 + }, + { + "epoch": 0.1348314606741573, + "grad_norm": 0.439143568277359, + "learning_rate": 9.714634728380658e-05, + "loss": 1.0145, + "step": 1509 + }, + { + "epoch": 0.13492081220541918, + "grad_norm": 0.3858429789543152, + "learning_rate": 9.714152656550539e-05, + "loss": 1.1219, + "step": 1510 + }, + { + "epoch": 0.13501016373668104, + "grad_norm": 0.4329967200756073, + "learning_rate": 9.713670189859887e-05, + "loss": 0.9739, + "step": 1511 + }, + { + "epoch": 0.1350995152679429, + "grad_norm": 0.4508189260959625, + "learning_rate": 9.713187328349111e-05, + "loss": 1.0605, + "step": 1512 + }, + { + "epoch": 0.13518886679920478, + "grad_norm": 0.4645385444164276, + "learning_rate": 9.712704072058656e-05, + "loss": 0.9873, + "step": 1513 + }, + { + "epoch": 0.13527821833046663, + "grad_norm": 0.5293644666671753, + "learning_rate": 9.712220421029003e-05, + "loss": 1.0814, + "step": 1514 + }, + { + "epoch": 0.13536756986172852, + "grad_norm": 0.4389038681983948, + "learning_rate": 9.711736375300661e-05, + "loss": 1.0325, + "step": 1515 + }, + { + "epoch": 0.13545692139299037, + "grad_norm": 0.349090039730072, + "learning_rate": 9.711251934914174e-05, + "loss": 1.1654, + "step": 1516 + }, + { + "epoch": 0.13554627292425223, + "grad_norm": 0.5453407168388367, + "learning_rate": 9.710767099910119e-05, + "loss": 1.0776, + "step": 1517 + }, + { + "epoch": 0.1356356244555141, + "grad_norm": 0.4334562420845032, + "learning_rate": 9.710281870329105e-05, + "loss": 1.0226, + "step": 1518 + }, + { + "epoch": 0.13572497598677596, + "grad_norm": 0.3693782687187195, + "learning_rate": 9.709796246211777e-05, + "loss": 1.0437, + "step": 1519 + }, + { + "epoch": 0.13581432751803785, + "grad_norm": 0.49413368105888367, + "learning_rate": 9.709310227598811e-05, + "loss": 1.0567, + "step": 1520 + }, + { + "epoch": 0.1359036790492997, + "grad_norm": 0.4487200081348419, + "learning_rate": 9.708823814530917e-05, + "loss": 0.981, + "step": 1521 + }, + { + "epoch": 0.13599303058056159, + "grad_norm": 0.38888999819755554, + "learning_rate": 9.708337007048834e-05, + "loss": 1.0563, + "step": 1522 + }, + { + "epoch": 0.13608238211182344, + "grad_norm": 0.3906208276748657, + "learning_rate": 9.70784980519334e-05, + "loss": 1.0467, + "step": 1523 + }, + { + "epoch": 0.1361717336430853, + "grad_norm": 0.3793841302394867, + "learning_rate": 9.707362209005244e-05, + "loss": 1.0433, + "step": 1524 + }, + { + "epoch": 0.13626108517434718, + "grad_norm": 0.38040420413017273, + "learning_rate": 9.706874218525385e-05, + "loss": 1.0493, + "step": 1525 + }, + { + "epoch": 0.13635043670560903, + "grad_norm": 0.5357340574264526, + "learning_rate": 9.706385833794638e-05, + "loss": 0.9978, + "step": 1526 + }, + { + "epoch": 0.13643978823687092, + "grad_norm": 0.45937198400497437, + "learning_rate": 9.705897054853912e-05, + "loss": 1.0814, + "step": 1527 + }, + { + "epoch": 0.13652913976813277, + "grad_norm": 0.41274699568748474, + "learning_rate": 9.705407881744146e-05, + "loss": 1.0937, + "step": 1528 + }, + { + "epoch": 0.13661849129939466, + "grad_norm": 0.4878060519695282, + "learning_rate": 9.704918314506313e-05, + "loss": 1.0107, + "step": 1529 + }, + { + "epoch": 0.1367078428306565, + "grad_norm": 0.4097362756729126, + "learning_rate": 9.704428353181421e-05, + "loss": 1.1267, + "step": 1530 + }, + { + "epoch": 0.13679719436191837, + "grad_norm": 0.40500837564468384, + "learning_rate": 9.703937997810511e-05, + "loss": 1.0243, + "step": 1531 + }, + { + "epoch": 0.13688654589318025, + "grad_norm": 0.4004482626914978, + "learning_rate": 9.70344724843465e-05, + "loss": 1.0469, + "step": 1532 + }, + { + "epoch": 0.1369758974244421, + "grad_norm": 0.5236450433731079, + "learning_rate": 9.702956105094948e-05, + "loss": 0.9977, + "step": 1533 + }, + { + "epoch": 0.137065248955704, + "grad_norm": 0.4368230998516083, + "learning_rate": 9.702464567832543e-05, + "loss": 1.0354, + "step": 1534 + }, + { + "epoch": 0.13715460048696584, + "grad_norm": 0.4011850357055664, + "learning_rate": 9.701972636688606e-05, + "loss": 1.0555, + "step": 1535 + }, + { + "epoch": 0.1372439520182277, + "grad_norm": 0.4206826984882355, + "learning_rate": 9.701480311704339e-05, + "loss": 1.007, + "step": 1536 + }, + { + "epoch": 0.13733330354948958, + "grad_norm": 0.45997175574302673, + "learning_rate": 9.700987592920983e-05, + "loss": 1.0538, + "step": 1537 + }, + { + "epoch": 0.13742265508075144, + "grad_norm": 0.4319978356361389, + "learning_rate": 9.700494480379807e-05, + "loss": 0.9962, + "step": 1538 + }, + { + "epoch": 0.13751200661201332, + "grad_norm": 0.4265231490135193, + "learning_rate": 9.700000974122115e-05, + "loss": 0.9786, + "step": 1539 + }, + { + "epoch": 0.13760135814327518, + "grad_norm": 0.4962487518787384, + "learning_rate": 9.699507074189242e-05, + "loss": 1.0389, + "step": 1540 + }, + { + "epoch": 0.13769070967453706, + "grad_norm": 0.40575921535491943, + "learning_rate": 9.699012780622561e-05, + "loss": 1.0018, + "step": 1541 + }, + { + "epoch": 0.13778006120579891, + "grad_norm": 0.4042010009288788, + "learning_rate": 9.698518093463469e-05, + "loss": 1.0876, + "step": 1542 + }, + { + "epoch": 0.13786941273706077, + "grad_norm": 0.37715786695480347, + "learning_rate": 9.698023012753405e-05, + "loss": 1.0111, + "step": 1543 + }, + { + "epoch": 0.13795876426832265, + "grad_norm": 0.4954730272293091, + "learning_rate": 9.697527538533837e-05, + "loss": 0.9983, + "step": 1544 + }, + { + "epoch": 0.1380481157995845, + "grad_norm": 0.41885560750961304, + "learning_rate": 9.697031670846265e-05, + "loss": 1.042, + "step": 1545 + }, + { + "epoch": 0.1381374673308464, + "grad_norm": 0.3857840895652771, + "learning_rate": 9.696535409732224e-05, + "loss": 1.0516, + "step": 1546 + }, + { + "epoch": 0.13822681886210825, + "grad_norm": 0.3953034281730652, + "learning_rate": 9.696038755233282e-05, + "loss": 1.1183, + "step": 1547 + }, + { + "epoch": 0.1383161703933701, + "grad_norm": 0.4239901006221771, + "learning_rate": 9.695541707391036e-05, + "loss": 1.0304, + "step": 1548 + }, + { + "epoch": 0.13840552192463199, + "grad_norm": 0.4670591652393341, + "learning_rate": 9.695044266247122e-05, + "loss": 1.0292, + "step": 1549 + }, + { + "epoch": 0.13849487345589384, + "grad_norm": 0.40943849086761475, + "learning_rate": 9.694546431843205e-05, + "loss": 1.0806, + "step": 1550 + }, + { + "epoch": 0.13858422498715572, + "grad_norm": 0.4126596748828888, + "learning_rate": 9.694048204220985e-05, + "loss": 1.0379, + "step": 1551 + }, + { + "epoch": 0.13867357651841758, + "grad_norm": 0.4415280222892761, + "learning_rate": 9.693549583422191e-05, + "loss": 1.0358, + "step": 1552 + }, + { + "epoch": 0.13876292804967946, + "grad_norm": 0.4215603768825531, + "learning_rate": 9.69305056948859e-05, + "loss": 1.0874, + "step": 1553 + }, + { + "epoch": 0.13885227958094132, + "grad_norm": 0.42005017399787903, + "learning_rate": 9.692551162461981e-05, + "loss": 1.0552, + "step": 1554 + }, + { + "epoch": 0.13894163111220317, + "grad_norm": 0.4029698669910431, + "learning_rate": 9.692051362384193e-05, + "loss": 1.0081, + "step": 1555 + }, + { + "epoch": 0.13903098264346506, + "grad_norm": 0.5039706826210022, + "learning_rate": 9.691551169297089e-05, + "loss": 1.0135, + "step": 1556 + }, + { + "epoch": 0.1391203341747269, + "grad_norm": 0.48726481199264526, + "learning_rate": 9.691050583242567e-05, + "loss": 1.011, + "step": 1557 + }, + { + "epoch": 0.1392096857059888, + "grad_norm": 0.4722976088523865, + "learning_rate": 9.690549604262555e-05, + "loss": 0.9555, + "step": 1558 + }, + { + "epoch": 0.13929903723725065, + "grad_norm": 0.44578781723976135, + "learning_rate": 9.690048232399017e-05, + "loss": 1.0367, + "step": 1559 + }, + { + "epoch": 0.13938838876851253, + "grad_norm": 0.47405532002449036, + "learning_rate": 9.689546467693946e-05, + "loss": 1.0977, + "step": 1560 + }, + { + "epoch": 0.1394777402997744, + "grad_norm": 0.4186593294143677, + "learning_rate": 9.689044310189371e-05, + "loss": 1.0139, + "step": 1561 + }, + { + "epoch": 0.13956709183103624, + "grad_norm": 0.3761426508426666, + "learning_rate": 9.688541759927354e-05, + "loss": 1.0328, + "step": 1562 + }, + { + "epoch": 0.13965644336229813, + "grad_norm": 0.4144037365913391, + "learning_rate": 9.688038816949989e-05, + "loss": 1.0543, + "step": 1563 + }, + { + "epoch": 0.13974579489355998, + "grad_norm": 0.5622879862785339, + "learning_rate": 9.687535481299402e-05, + "loss": 0.9816, + "step": 1564 + }, + { + "epoch": 0.13983514642482187, + "grad_norm": 0.39679154753685, + "learning_rate": 9.687031753017753e-05, + "loss": 1.075, + "step": 1565 + }, + { + "epoch": 0.13992449795608372, + "grad_norm": 0.48569121956825256, + "learning_rate": 9.686527632147234e-05, + "loss": 0.9407, + "step": 1566 + }, + { + "epoch": 0.14001384948734558, + "grad_norm": 0.43438124656677246, + "learning_rate": 9.68602311873007e-05, + "loss": 1.0061, + "step": 1567 + }, + { + "epoch": 0.14010320101860746, + "grad_norm": 0.4626932144165039, + "learning_rate": 9.685518212808522e-05, + "loss": 0.9583, + "step": 1568 + }, + { + "epoch": 0.14019255254986931, + "grad_norm": 0.5100006461143494, + "learning_rate": 9.685012914424878e-05, + "loss": 0.9871, + "step": 1569 + }, + { + "epoch": 0.1402819040811312, + "grad_norm": 0.4893263280391693, + "learning_rate": 9.684507223621465e-05, + "loss": 0.9678, + "step": 1570 + }, + { + "epoch": 0.14037125561239305, + "grad_norm": 0.40761256217956543, + "learning_rate": 9.684001140440639e-05, + "loss": 1.0929, + "step": 1571 + }, + { + "epoch": 0.14046060714365494, + "grad_norm": 0.4098682403564453, + "learning_rate": 9.68349466492479e-05, + "loss": 1.1117, + "step": 1572 + }, + { + "epoch": 0.1405499586749168, + "grad_norm": 0.36809906363487244, + "learning_rate": 9.682987797116339e-05, + "loss": 1.0983, + "step": 1573 + }, + { + "epoch": 0.14063931020617865, + "grad_norm": 0.4545081853866577, + "learning_rate": 9.682480537057743e-05, + "loss": 1.0238, + "step": 1574 + }, + { + "epoch": 0.14072866173744053, + "grad_norm": 0.46472567319869995, + "learning_rate": 9.681972884791492e-05, + "loss": 1.0105, + "step": 1575 + }, + { + "epoch": 0.14081801326870239, + "grad_norm": 0.4712905287742615, + "learning_rate": 9.681464840360103e-05, + "loss": 0.9822, + "step": 1576 + }, + { + "epoch": 0.14090736479996427, + "grad_norm": 0.4782291352748871, + "learning_rate": 9.680956403806135e-05, + "loss": 1.0426, + "step": 1577 + }, + { + "epoch": 0.14099671633122612, + "grad_norm": 0.4774461090564728, + "learning_rate": 9.680447575172173e-05, + "loss": 1.0958, + "step": 1578 + }, + { + "epoch": 0.141086067862488, + "grad_norm": 0.41189044713974, + "learning_rate": 9.679938354500835e-05, + "loss": 1.0193, + "step": 1579 + }, + { + "epoch": 0.14117541939374986, + "grad_norm": 0.45908036828041077, + "learning_rate": 9.679428741834776e-05, + "loss": 1.0789, + "step": 1580 + }, + { + "epoch": 0.14126477092501172, + "grad_norm": 0.439585417509079, + "learning_rate": 9.67891873721668e-05, + "loss": 1.047, + "step": 1581 + }, + { + "epoch": 0.1413541224562736, + "grad_norm": 0.3884674906730652, + "learning_rate": 9.678408340689267e-05, + "loss": 1.0571, + "step": 1582 + }, + { + "epoch": 0.14144347398753546, + "grad_norm": 0.46724075078964233, + "learning_rate": 9.677897552295288e-05, + "loss": 1.0396, + "step": 1583 + }, + { + "epoch": 0.14153282551879734, + "grad_norm": 0.4285435378551483, + "learning_rate": 9.677386372077524e-05, + "loss": 0.9729, + "step": 1584 + }, + { + "epoch": 0.1416221770500592, + "grad_norm": 0.41435936093330383, + "learning_rate": 9.676874800078796e-05, + "loss": 1.0059, + "step": 1585 + }, + { + "epoch": 0.14171152858132105, + "grad_norm": 0.4586467146873474, + "learning_rate": 9.67636283634195e-05, + "loss": 1.0339, + "step": 1586 + }, + { + "epoch": 0.14180088011258293, + "grad_norm": 0.5122067928314209, + "learning_rate": 9.675850480909872e-05, + "loss": 1.001, + "step": 1587 + }, + { + "epoch": 0.1418902316438448, + "grad_norm": 0.42933905124664307, + "learning_rate": 9.675337733825473e-05, + "loss": 1.0386, + "step": 1588 + }, + { + "epoch": 0.14197958317510667, + "grad_norm": 0.4264965355396271, + "learning_rate": 9.674824595131704e-05, + "loss": 1.0794, + "step": 1589 + }, + { + "epoch": 0.14206893470636853, + "grad_norm": 0.4125423729419708, + "learning_rate": 9.674311064871547e-05, + "loss": 1.1203, + "step": 1590 + }, + { + "epoch": 0.1421582862376304, + "grad_norm": 0.3605044484138489, + "learning_rate": 9.67379714308801e-05, + "loss": 1.0368, + "step": 1591 + }, + { + "epoch": 0.14224763776889227, + "grad_norm": 0.45730555057525635, + "learning_rate": 9.673282829824146e-05, + "loss": 1.0428, + "step": 1592 + }, + { + "epoch": 0.14233698930015412, + "grad_norm": 0.45879489183425903, + "learning_rate": 9.672768125123031e-05, + "loss": 1.055, + "step": 1593 + }, + { + "epoch": 0.142426340831416, + "grad_norm": 0.4261813461780548, + "learning_rate": 9.672253029027777e-05, + "loss": 1.0486, + "step": 1594 + }, + { + "epoch": 0.14251569236267786, + "grad_norm": 0.3854468762874603, + "learning_rate": 9.671737541581529e-05, + "loss": 1.0092, + "step": 1595 + }, + { + "epoch": 0.14260504389393974, + "grad_norm": 0.4169517457485199, + "learning_rate": 9.671221662827465e-05, + "loss": 1.063, + "step": 1596 + }, + { + "epoch": 0.1426943954252016, + "grad_norm": 0.41241368651390076, + "learning_rate": 9.670705392808796e-05, + "loss": 1.0649, + "step": 1597 + }, + { + "epoch": 0.14278374695646345, + "grad_norm": 0.3985016644001007, + "learning_rate": 9.670188731568764e-05, + "loss": 1.0126, + "step": 1598 + }, + { + "epoch": 0.14287309848772534, + "grad_norm": 0.395868182182312, + "learning_rate": 9.669671679150642e-05, + "loss": 1.0801, + "step": 1599 + }, + { + "epoch": 0.1429624500189872, + "grad_norm": 0.3894529938697815, + "learning_rate": 9.669154235597746e-05, + "loss": 1.0315, + "step": 1600 + }, + { + "epoch": 0.14305180155024907, + "grad_norm": 0.4293975234031677, + "learning_rate": 9.66863640095341e-05, + "loss": 1.0163, + "step": 1601 + }, + { + "epoch": 0.14314115308151093, + "grad_norm": 0.3886623978614807, + "learning_rate": 9.668118175261015e-05, + "loss": 1.0487, + "step": 1602 + }, + { + "epoch": 0.1432305046127728, + "grad_norm": 0.46701672673225403, + "learning_rate": 9.66759955856396e-05, + "loss": 1.0463, + "step": 1603 + }, + { + "epoch": 0.14331985614403467, + "grad_norm": 0.4126945436000824, + "learning_rate": 9.667080550905691e-05, + "loss": 1.0268, + "step": 1604 + }, + { + "epoch": 0.14340920767529652, + "grad_norm": 0.4913039207458496, + "learning_rate": 9.666561152329679e-05, + "loss": 0.9639, + "step": 1605 + }, + { + "epoch": 0.1434985592065584, + "grad_norm": 0.3920893967151642, + "learning_rate": 9.666041362879427e-05, + "loss": 1.0301, + "step": 1606 + }, + { + "epoch": 0.14358791073782026, + "grad_norm": 0.38557082414627075, + "learning_rate": 9.665521182598476e-05, + "loss": 1.0137, + "step": 1607 + }, + { + "epoch": 0.14367726226908215, + "grad_norm": 0.532264769077301, + "learning_rate": 9.665000611530392e-05, + "loss": 0.9705, + "step": 1608 + }, + { + "epoch": 0.143766613800344, + "grad_norm": 0.42515647411346436, + "learning_rate": 9.664479649718784e-05, + "loss": 1.0133, + "step": 1609 + }, + { + "epoch": 0.14385596533160588, + "grad_norm": 0.42825859785079956, + "learning_rate": 9.663958297207286e-05, + "loss": 1.0272, + "step": 1610 + }, + { + "epoch": 0.14394531686286774, + "grad_norm": 0.3822021782398224, + "learning_rate": 9.663436554039567e-05, + "loss": 1.0168, + "step": 1611 + }, + { + "epoch": 0.1440346683941296, + "grad_norm": 0.43384209275245667, + "learning_rate": 9.662914420259325e-05, + "loss": 1.031, + "step": 1612 + }, + { + "epoch": 0.14412401992539148, + "grad_norm": 0.4819833040237427, + "learning_rate": 9.662391895910299e-05, + "loss": 1.011, + "step": 1613 + }, + { + "epoch": 0.14421337145665333, + "grad_norm": 0.38827648758888245, + "learning_rate": 9.661868981036255e-05, + "loss": 1.0561, + "step": 1614 + }, + { + "epoch": 0.14430272298791522, + "grad_norm": 0.5991657376289368, + "learning_rate": 9.661345675680991e-05, + "loss": 0.9748, + "step": 1615 + }, + { + "epoch": 0.14439207451917707, + "grad_norm": 0.568602442741394, + "learning_rate": 9.660821979888339e-05, + "loss": 1.0853, + "step": 1616 + }, + { + "epoch": 0.14448142605043893, + "grad_norm": 0.4651549458503723, + "learning_rate": 9.660297893702168e-05, + "loss": 1.0078, + "step": 1617 + }, + { + "epoch": 0.1445707775817008, + "grad_norm": 0.426548570394516, + "learning_rate": 9.659773417166371e-05, + "loss": 0.9844, + "step": 1618 + }, + { + "epoch": 0.14466012911296267, + "grad_norm": 0.4951413571834564, + "learning_rate": 9.659248550324882e-05, + "loss": 0.8695, + "step": 1619 + }, + { + "epoch": 0.14474948064422455, + "grad_norm": 0.4711915850639343, + "learning_rate": 9.658723293221661e-05, + "loss": 1.03, + "step": 1620 + }, + { + "epoch": 0.1448388321754864, + "grad_norm": 0.38477903604507446, + "learning_rate": 9.658197645900708e-05, + "loss": 1.0262, + "step": 1621 + }, + { + "epoch": 0.1449281837067483, + "grad_norm": 0.5240241885185242, + "learning_rate": 9.657671608406047e-05, + "loss": 0.9587, + "step": 1622 + }, + { + "epoch": 0.14501753523801014, + "grad_norm": 0.4152171015739441, + "learning_rate": 9.657145180781743e-05, + "loss": 1.0534, + "step": 1623 + }, + { + "epoch": 0.145106886769272, + "grad_norm": 0.43236833810806274, + "learning_rate": 9.65661836307189e-05, + "loss": 1.0896, + "step": 1624 + }, + { + "epoch": 0.14519623830053388, + "grad_norm": 0.5443985462188721, + "learning_rate": 9.656091155320611e-05, + "loss": 1.0644, + "step": 1625 + }, + { + "epoch": 0.14528558983179574, + "grad_norm": 0.4063294529914856, + "learning_rate": 9.655563557572069e-05, + "loss": 1.0385, + "step": 1626 + }, + { + "epoch": 0.14537494136305762, + "grad_norm": 0.5186159610748291, + "learning_rate": 9.655035569870452e-05, + "loss": 1.0842, + "step": 1627 + }, + { + "epoch": 0.14546429289431947, + "grad_norm": 0.4194406569004059, + "learning_rate": 9.654507192259989e-05, + "loss": 1.0013, + "step": 1628 + }, + { + "epoch": 0.14555364442558133, + "grad_norm": 0.4991320073604584, + "learning_rate": 9.653978424784934e-05, + "loss": 1.096, + "step": 1629 + }, + { + "epoch": 0.1456429959568432, + "grad_norm": 0.4811127185821533, + "learning_rate": 9.653449267489579e-05, + "loss": 0.9865, + "step": 1630 + }, + { + "epoch": 0.14573234748810507, + "grad_norm": 0.4587719440460205, + "learning_rate": 9.652919720418245e-05, + "loss": 1.0453, + "step": 1631 + }, + { + "epoch": 0.14582169901936695, + "grad_norm": 0.40233114361763, + "learning_rate": 9.652389783615287e-05, + "loss": 1.1234, + "step": 1632 + }, + { + "epoch": 0.1459110505506288, + "grad_norm": 0.4691876471042633, + "learning_rate": 9.651859457125096e-05, + "loss": 0.9824, + "step": 1633 + }, + { + "epoch": 0.1460004020818907, + "grad_norm": 0.41825005412101746, + "learning_rate": 9.651328740992088e-05, + "loss": 1.0688, + "step": 1634 + }, + { + "epoch": 0.14608975361315255, + "grad_norm": 0.4713682532310486, + "learning_rate": 9.65079763526072e-05, + "loss": 1.0345, + "step": 1635 + }, + { + "epoch": 0.1461791051444144, + "grad_norm": 0.41411858797073364, + "learning_rate": 9.650266139975474e-05, + "loss": 1.0163, + "step": 1636 + }, + { + "epoch": 0.14626845667567628, + "grad_norm": 0.3815430700778961, + "learning_rate": 9.649734255180873e-05, + "loss": 1.0626, + "step": 1637 + }, + { + "epoch": 0.14635780820693814, + "grad_norm": 0.41789570450782776, + "learning_rate": 9.649201980921465e-05, + "loss": 0.9784, + "step": 1638 + }, + { + "epoch": 0.14644715973820002, + "grad_norm": 0.43766146898269653, + "learning_rate": 9.648669317241833e-05, + "loss": 1.0253, + "step": 1639 + }, + { + "epoch": 0.14653651126946188, + "grad_norm": 0.43024829030036926, + "learning_rate": 9.648136264186595e-05, + "loss": 1.0291, + "step": 1640 + }, + { + "epoch": 0.14662586280072376, + "grad_norm": 0.39453184604644775, + "learning_rate": 9.647602821800399e-05, + "loss": 1.0444, + "step": 1641 + }, + { + "epoch": 0.14671521433198562, + "grad_norm": 0.44370511174201965, + "learning_rate": 9.647068990127928e-05, + "loss": 1.0723, + "step": 1642 + }, + { + "epoch": 0.14680456586324747, + "grad_norm": 0.46351152658462524, + "learning_rate": 9.646534769213893e-05, + "loss": 1.0513, + "step": 1643 + }, + { + "epoch": 0.14689391739450935, + "grad_norm": 0.4535273313522339, + "learning_rate": 9.646000159103043e-05, + "loss": 1.0497, + "step": 1644 + }, + { + "epoch": 0.1469832689257712, + "grad_norm": 0.43159547448158264, + "learning_rate": 9.645465159840157e-05, + "loss": 1.0305, + "step": 1645 + }, + { + "epoch": 0.1470726204570331, + "grad_norm": 0.4147747755050659, + "learning_rate": 9.644929771470048e-05, + "loss": 1.0223, + "step": 1646 + }, + { + "epoch": 0.14716197198829495, + "grad_norm": 0.45313653349876404, + "learning_rate": 9.64439399403756e-05, + "loss": 0.9908, + "step": 1647 + }, + { + "epoch": 0.1472513235195568, + "grad_norm": 0.3770694434642792, + "learning_rate": 9.643857827587566e-05, + "loss": 1.0579, + "step": 1648 + }, + { + "epoch": 0.1473406750508187, + "grad_norm": 0.3648872971534729, + "learning_rate": 9.643321272164981e-05, + "loss": 1.0841, + "step": 1649 + }, + { + "epoch": 0.14743002658208054, + "grad_norm": 0.48402151465415955, + "learning_rate": 9.642784327814744e-05, + "loss": 1.0109, + "step": 1650 + }, + { + "epoch": 0.14751937811334243, + "grad_norm": 0.42030149698257446, + "learning_rate": 9.642246994581832e-05, + "loss": 1.0677, + "step": 1651 + }, + { + "epoch": 0.14760872964460428, + "grad_norm": 0.46909406781196594, + "learning_rate": 9.641709272511252e-05, + "loss": 0.9945, + "step": 1652 + }, + { + "epoch": 0.14769808117586616, + "grad_norm": 0.4196245074272156, + "learning_rate": 9.641171161648043e-05, + "loss": 1.1265, + "step": 1653 + }, + { + "epoch": 0.14778743270712802, + "grad_norm": 0.3893241286277771, + "learning_rate": 9.640632662037278e-05, + "loss": 1.0552, + "step": 1654 + }, + { + "epoch": 0.14787678423838987, + "grad_norm": 0.44918498396873474, + "learning_rate": 9.640093773724061e-05, + "loss": 1.0326, + "step": 1655 + }, + { + "epoch": 0.14796613576965176, + "grad_norm": 0.5244982242584229, + "learning_rate": 9.63955449675353e-05, + "loss": 1.0296, + "step": 1656 + }, + { + "epoch": 0.1480554873009136, + "grad_norm": 0.4323536157608032, + "learning_rate": 9.639014831170858e-05, + "loss": 1.0664, + "step": 1657 + }, + { + "epoch": 0.1481448388321755, + "grad_norm": 0.437716007232666, + "learning_rate": 9.638474777021244e-05, + "loss": 1.0745, + "step": 1658 + }, + { + "epoch": 0.14823419036343735, + "grad_norm": 0.39507436752319336, + "learning_rate": 9.637934334349927e-05, + "loss": 1.0435, + "step": 1659 + }, + { + "epoch": 0.1483235418946992, + "grad_norm": 0.422969251871109, + "learning_rate": 9.63739350320217e-05, + "loss": 1.0611, + "step": 1660 + }, + { + "epoch": 0.1484128934259611, + "grad_norm": 0.3984132409095764, + "learning_rate": 9.636852283623276e-05, + "loss": 1.0717, + "step": 1661 + }, + { + "epoch": 0.14850224495722295, + "grad_norm": 0.4262081980705261, + "learning_rate": 9.63631067565858e-05, + "loss": 1.0278, + "step": 1662 + }, + { + "epoch": 0.14859159648848483, + "grad_norm": 0.43201524019241333, + "learning_rate": 9.635768679353445e-05, + "loss": 0.9729, + "step": 1663 + }, + { + "epoch": 0.14868094801974668, + "grad_norm": 0.4402812719345093, + "learning_rate": 9.635226294753267e-05, + "loss": 1.0292, + "step": 1664 + }, + { + "epoch": 0.14877029955100857, + "grad_norm": 0.41984882950782776, + "learning_rate": 9.634683521903483e-05, + "loss": 1.036, + "step": 1665 + }, + { + "epoch": 0.14885965108227042, + "grad_norm": 0.38695448637008667, + "learning_rate": 9.634140360849548e-05, + "loss": 1.0454, + "step": 1666 + }, + { + "epoch": 0.14894900261353228, + "grad_norm": 0.44035449624061584, + "learning_rate": 9.633596811636964e-05, + "loss": 1.0825, + "step": 1667 + }, + { + "epoch": 0.14903835414479416, + "grad_norm": 0.4655371904373169, + "learning_rate": 9.633052874311255e-05, + "loss": 1.0182, + "step": 1668 + }, + { + "epoch": 0.14912770567605602, + "grad_norm": 0.42064300179481506, + "learning_rate": 9.632508548917983e-05, + "loss": 1.0113, + "step": 1669 + }, + { + "epoch": 0.1492170572073179, + "grad_norm": 0.4009134769439697, + "learning_rate": 9.631963835502742e-05, + "loss": 1.0587, + "step": 1670 + }, + { + "epoch": 0.14930640873857975, + "grad_norm": 0.3674651086330414, + "learning_rate": 9.631418734111155e-05, + "loss": 1.0356, + "step": 1671 + }, + { + "epoch": 0.14939576026984164, + "grad_norm": 0.416415810585022, + "learning_rate": 9.630873244788883e-05, + "loss": 1.0074, + "step": 1672 + }, + { + "epoch": 0.1494851118011035, + "grad_norm": 0.5505238771438599, + "learning_rate": 9.630327367581616e-05, + "loss": 1.0533, + "step": 1673 + }, + { + "epoch": 0.14957446333236535, + "grad_norm": 0.42817139625549316, + "learning_rate": 9.629781102535074e-05, + "loss": 0.9811, + "step": 1674 + }, + { + "epoch": 0.14966381486362723, + "grad_norm": 0.4695313274860382, + "learning_rate": 9.629234449695015e-05, + "loss": 1.1321, + "step": 1675 + }, + { + "epoch": 0.1497531663948891, + "grad_norm": 0.5143523216247559, + "learning_rate": 9.628687409107229e-05, + "loss": 0.9853, + "step": 1676 + }, + { + "epoch": 0.14984251792615097, + "grad_norm": 0.4100930392742157, + "learning_rate": 9.628139980817532e-05, + "loss": 1.1377, + "step": 1677 + }, + { + "epoch": 0.14993186945741283, + "grad_norm": 0.41417646408081055, + "learning_rate": 9.627592164871779e-05, + "loss": 0.9765, + "step": 1678 + }, + { + "epoch": 0.15002122098867468, + "grad_norm": 0.3846787214279175, + "learning_rate": 9.627043961315856e-05, + "loss": 1.0585, + "step": 1679 + }, + { + "epoch": 0.15011057251993656, + "grad_norm": 0.4256773889064789, + "learning_rate": 9.626495370195683e-05, + "loss": 1.0301, + "step": 1680 + }, + { + "epoch": 0.15019992405119842, + "grad_norm": 0.5811467170715332, + "learning_rate": 9.625946391557204e-05, + "loss": 1.0538, + "step": 1681 + }, + { + "epoch": 0.1502892755824603, + "grad_norm": 0.4346529543399811, + "learning_rate": 9.625397025446408e-05, + "loss": 1.0657, + "step": 1682 + }, + { + "epoch": 0.15037862711372216, + "grad_norm": 0.3570214509963989, + "learning_rate": 9.624847271909308e-05, + "loss": 1.0752, + "step": 1683 + }, + { + "epoch": 0.15046797864498404, + "grad_norm": 0.4259663224220276, + "learning_rate": 9.62429713099195e-05, + "loss": 1.1056, + "step": 1684 + }, + { + "epoch": 0.1505573301762459, + "grad_norm": 0.3912825882434845, + "learning_rate": 9.623746602740417e-05, + "loss": 1.0072, + "step": 1685 + }, + { + "epoch": 0.15064668170750775, + "grad_norm": 0.41152167320251465, + "learning_rate": 9.623195687200822e-05, + "loss": 1.0224, + "step": 1686 + }, + { + "epoch": 0.15073603323876963, + "grad_norm": 0.397203266620636, + "learning_rate": 9.622644384419306e-05, + "loss": 1.0702, + "step": 1687 + }, + { + "epoch": 0.1508253847700315, + "grad_norm": 0.4435591399669647, + "learning_rate": 9.62209269444205e-05, + "loss": 0.9521, + "step": 1688 + }, + { + "epoch": 0.15091473630129337, + "grad_norm": 0.36750173568725586, + "learning_rate": 9.621540617315262e-05, + "loss": 0.9843, + "step": 1689 + }, + { + "epoch": 0.15100408783255523, + "grad_norm": 0.4787936210632324, + "learning_rate": 9.620988153085187e-05, + "loss": 1.0932, + "step": 1690 + }, + { + "epoch": 0.15109343936381708, + "grad_norm": 0.44318270683288574, + "learning_rate": 9.620435301798097e-05, + "loss": 1.0223, + "step": 1691 + }, + { + "epoch": 0.15118279089507897, + "grad_norm": 0.47096431255340576, + "learning_rate": 9.619882063500299e-05, + "loss": 1.0623, + "step": 1692 + }, + { + "epoch": 0.15127214242634082, + "grad_norm": 0.44428104162216187, + "learning_rate": 9.619328438238135e-05, + "loss": 1.0413, + "step": 1693 + }, + { + "epoch": 0.1513614939576027, + "grad_norm": 0.4424484670162201, + "learning_rate": 9.618774426057975e-05, + "loss": 1.0759, + "step": 1694 + }, + { + "epoch": 0.15145084548886456, + "grad_norm": 0.3779531717300415, + "learning_rate": 9.618220027006225e-05, + "loss": 1.0476, + "step": 1695 + }, + { + "epoch": 0.15154019702012644, + "grad_norm": 0.40432384610176086, + "learning_rate": 9.617665241129321e-05, + "loss": 1.1215, + "step": 1696 + }, + { + "epoch": 0.1516295485513883, + "grad_norm": 0.48599207401275635, + "learning_rate": 9.617110068473732e-05, + "loss": 1.0024, + "step": 1697 + }, + { + "epoch": 0.15171890008265015, + "grad_norm": 0.49614617228507996, + "learning_rate": 9.61655450908596e-05, + "loss": 0.9352, + "step": 1698 + }, + { + "epoch": 0.15180825161391204, + "grad_norm": 0.5036540627479553, + "learning_rate": 9.615998563012538e-05, + "loss": 0.9844, + "step": 1699 + }, + { + "epoch": 0.1518976031451739, + "grad_norm": 0.3981391191482544, + "learning_rate": 9.615442230300036e-05, + "loss": 1.0222, + "step": 1700 + }, + { + "epoch": 0.15198695467643578, + "grad_norm": 0.4211319386959076, + "learning_rate": 9.614885510995047e-05, + "loss": 1.0769, + "step": 1701 + }, + { + "epoch": 0.15207630620769763, + "grad_norm": 0.4759863317012787, + "learning_rate": 9.614328405144207e-05, + "loss": 0.999, + "step": 1702 + }, + { + "epoch": 0.15216565773895951, + "grad_norm": 0.48188042640686035, + "learning_rate": 9.613770912794178e-05, + "loss": 0.9254, + "step": 1703 + }, + { + "epoch": 0.15225500927022137, + "grad_norm": 0.38263580203056335, + "learning_rate": 9.613213033991655e-05, + "loss": 1.0517, + "step": 1704 + }, + { + "epoch": 0.15234436080148323, + "grad_norm": 0.44071927666664124, + "learning_rate": 9.612654768783368e-05, + "loss": 1.0019, + "step": 1705 + }, + { + "epoch": 0.1524337123327451, + "grad_norm": 0.44804859161376953, + "learning_rate": 9.612096117216076e-05, + "loss": 1.062, + "step": 1706 + }, + { + "epoch": 0.15252306386400696, + "grad_norm": 0.4194650948047638, + "learning_rate": 9.611537079336574e-05, + "loss": 1.0101, + "step": 1707 + }, + { + "epoch": 0.15261241539526885, + "grad_norm": 0.4241783618927002, + "learning_rate": 9.610977655191684e-05, + "loss": 1.0197, + "step": 1708 + }, + { + "epoch": 0.1527017669265307, + "grad_norm": 0.48272955417633057, + "learning_rate": 9.610417844828268e-05, + "loss": 1.014, + "step": 1709 + }, + { + "epoch": 0.15279111845779256, + "grad_norm": 0.4096718728542328, + "learning_rate": 9.609857648293212e-05, + "loss": 1.0106, + "step": 1710 + }, + { + "epoch": 0.15288046998905444, + "grad_norm": 0.3911011815071106, + "learning_rate": 9.609297065633443e-05, + "loss": 1.0542, + "step": 1711 + }, + { + "epoch": 0.1529698215203163, + "grad_norm": 0.43495795130729675, + "learning_rate": 9.60873609689591e-05, + "loss": 1.0039, + "step": 1712 + }, + { + "epoch": 0.15305917305157818, + "grad_norm": 0.4525046646595001, + "learning_rate": 9.608174742127605e-05, + "loss": 0.9883, + "step": 1713 + }, + { + "epoch": 0.15314852458284003, + "grad_norm": 0.3769172430038452, + "learning_rate": 9.607613001375546e-05, + "loss": 1.1064, + "step": 1714 + }, + { + "epoch": 0.15323787611410192, + "grad_norm": 0.435320645570755, + "learning_rate": 9.607050874686784e-05, + "loss": 1.0457, + "step": 1715 + }, + { + "epoch": 0.15332722764536377, + "grad_norm": 0.38679543137550354, + "learning_rate": 9.606488362108404e-05, + "loss": 1.0426, + "step": 1716 + }, + { + "epoch": 0.15341657917662563, + "grad_norm": 0.446005254983902, + "learning_rate": 9.605925463687522e-05, + "loss": 1.041, + "step": 1717 + }, + { + "epoch": 0.1535059307078875, + "grad_norm": 0.4369790554046631, + "learning_rate": 9.605362179471287e-05, + "loss": 0.9566, + "step": 1718 + }, + { + "epoch": 0.15359528223914937, + "grad_norm": 0.3627423942089081, + "learning_rate": 9.604798509506879e-05, + "loss": 1.0247, + "step": 1719 + }, + { + "epoch": 0.15368463377041125, + "grad_norm": 0.40787839889526367, + "learning_rate": 9.604234453841512e-05, + "loss": 1.1035, + "step": 1720 + }, + { + "epoch": 0.1537739853016731, + "grad_norm": 0.4434973895549774, + "learning_rate": 9.603670012522432e-05, + "loss": 1.0767, + "step": 1721 + }, + { + "epoch": 0.153863336832935, + "grad_norm": 0.4459492564201355, + "learning_rate": 9.603105185596917e-05, + "loss": 0.9853, + "step": 1722 + }, + { + "epoch": 0.15395268836419684, + "grad_norm": 0.45601245760917664, + "learning_rate": 9.602539973112278e-05, + "loss": 1.0228, + "step": 1723 + }, + { + "epoch": 0.1540420398954587, + "grad_norm": 0.44149455428123474, + "learning_rate": 9.601974375115856e-05, + "loss": 1.0443, + "step": 1724 + }, + { + "epoch": 0.15413139142672058, + "grad_norm": 0.5166652798652649, + "learning_rate": 9.601408391655027e-05, + "loss": 0.9864, + "step": 1725 + }, + { + "epoch": 0.15422074295798244, + "grad_norm": 0.5009913444519043, + "learning_rate": 9.600842022777198e-05, + "loss": 1.0516, + "step": 1726 + }, + { + "epoch": 0.15431009448924432, + "grad_norm": 0.4483872652053833, + "learning_rate": 9.600275268529807e-05, + "loss": 0.9622, + "step": 1727 + }, + { + "epoch": 0.15439944602050618, + "grad_norm": 0.4618902802467346, + "learning_rate": 9.599708128960327e-05, + "loss": 0.9304, + "step": 1728 + }, + { + "epoch": 0.15448879755176803, + "grad_norm": 0.39204496145248413, + "learning_rate": 9.599140604116263e-05, + "loss": 1.0674, + "step": 1729 + }, + { + "epoch": 0.15457814908302991, + "grad_norm": 0.44558286666870117, + "learning_rate": 9.598572694045149e-05, + "loss": 1.0133, + "step": 1730 + }, + { + "epoch": 0.15466750061429177, + "grad_norm": 0.39136767387390137, + "learning_rate": 9.598004398794556e-05, + "loss": 0.9855, + "step": 1731 + }, + { + "epoch": 0.15475685214555365, + "grad_norm": 0.47076520323753357, + "learning_rate": 9.597435718412082e-05, + "loss": 1.0118, + "step": 1732 + }, + { + "epoch": 0.1548462036768155, + "grad_norm": 0.373670369386673, + "learning_rate": 9.596866652945362e-05, + "loss": 1.0864, + "step": 1733 + }, + { + "epoch": 0.1549355552080774, + "grad_norm": 0.3965423107147217, + "learning_rate": 9.59629720244206e-05, + "loss": 0.9902, + "step": 1734 + }, + { + "epoch": 0.15502490673933925, + "grad_norm": 0.4660866856575012, + "learning_rate": 9.595727366949875e-05, + "loss": 1.0477, + "step": 1735 + }, + { + "epoch": 0.1551142582706011, + "grad_norm": 0.4512030780315399, + "learning_rate": 9.595157146516535e-05, + "loss": 1.058, + "step": 1736 + }, + { + "epoch": 0.15520360980186299, + "grad_norm": 0.4368734359741211, + "learning_rate": 9.594586541189804e-05, + "loss": 1.0307, + "step": 1737 + }, + { + "epoch": 0.15529296133312484, + "grad_norm": 0.5563750863075256, + "learning_rate": 9.594015551017477e-05, + "loss": 0.9361, + "step": 1738 + }, + { + "epoch": 0.15538231286438672, + "grad_norm": 0.43998295068740845, + "learning_rate": 9.593444176047378e-05, + "loss": 1.041, + "step": 1739 + }, + { + "epoch": 0.15547166439564858, + "grad_norm": 0.6449422836303711, + "learning_rate": 9.592872416327365e-05, + "loss": 0.9818, + "step": 1740 + }, + { + "epoch": 0.15556101592691043, + "grad_norm": 0.40925419330596924, + "learning_rate": 9.592300271905332e-05, + "loss": 1.0854, + "step": 1741 + }, + { + "epoch": 0.15565036745817232, + "grad_norm": 0.3952295780181885, + "learning_rate": 9.591727742829199e-05, + "loss": 1.1511, + "step": 1742 + }, + { + "epoch": 0.15573971898943417, + "grad_norm": 0.49510762095451355, + "learning_rate": 9.591154829146927e-05, + "loss": 0.9822, + "step": 1743 + }, + { + "epoch": 0.15582907052069606, + "grad_norm": 0.43776780366897583, + "learning_rate": 9.590581530906497e-05, + "loss": 1.0781, + "step": 1744 + }, + { + "epoch": 0.1559184220519579, + "grad_norm": 0.40279847383499146, + "learning_rate": 9.590007848155932e-05, + "loss": 1.0741, + "step": 1745 + }, + { + "epoch": 0.1560077735832198, + "grad_norm": 0.36434587836265564, + "learning_rate": 9.589433780943284e-05, + "loss": 1.0873, + "step": 1746 + }, + { + "epoch": 0.15609712511448165, + "grad_norm": 0.3859378695487976, + "learning_rate": 9.588859329316637e-05, + "loss": 1.0719, + "step": 1747 + }, + { + "epoch": 0.1561864766457435, + "grad_norm": 0.34931498765945435, + "learning_rate": 9.588284493324106e-05, + "loss": 1.0645, + "step": 1748 + }, + { + "epoch": 0.1562758281770054, + "grad_norm": 0.3696591556072235, + "learning_rate": 9.587709273013845e-05, + "loss": 1.0927, + "step": 1749 + }, + { + "epoch": 0.15636517970826724, + "grad_norm": 0.4064505100250244, + "learning_rate": 9.587133668434027e-05, + "loss": 1.0693, + "step": 1750 + }, + { + "epoch": 0.15645453123952913, + "grad_norm": 0.5394856333732605, + "learning_rate": 9.586557679632871e-05, + "loss": 1.0407, + "step": 1751 + }, + { + "epoch": 0.15654388277079098, + "grad_norm": 0.47162312269210815, + "learning_rate": 9.58598130665862e-05, + "loss": 1.0118, + "step": 1752 + }, + { + "epoch": 0.15663323430205287, + "grad_norm": 0.46519139409065247, + "learning_rate": 9.585404549559551e-05, + "loss": 1.0815, + "step": 1753 + }, + { + "epoch": 0.15672258583331472, + "grad_norm": 0.4289345443248749, + "learning_rate": 9.584827408383974e-05, + "loss": 1.039, + "step": 1754 + }, + { + "epoch": 0.15681193736457658, + "grad_norm": 0.4443482756614685, + "learning_rate": 9.58424988318023e-05, + "loss": 1.0424, + "step": 1755 + }, + { + "epoch": 0.15690128889583846, + "grad_norm": 0.4442220628261566, + "learning_rate": 9.583671973996695e-05, + "loss": 1.0615, + "step": 1756 + }, + { + "epoch": 0.15699064042710031, + "grad_norm": 0.4155905544757843, + "learning_rate": 9.583093680881774e-05, + "loss": 1.0328, + "step": 1757 + }, + { + "epoch": 0.1570799919583622, + "grad_norm": 0.40333032608032227, + "learning_rate": 9.582515003883904e-05, + "loss": 1.0237, + "step": 1758 + }, + { + "epoch": 0.15716934348962405, + "grad_norm": 0.4274482727050781, + "learning_rate": 9.581935943051557e-05, + "loss": 1.0246, + "step": 1759 + }, + { + "epoch": 0.1572586950208859, + "grad_norm": 0.47601258754730225, + "learning_rate": 9.581356498433233e-05, + "loss": 1.0917, + "step": 1760 + }, + { + "epoch": 0.1573480465521478, + "grad_norm": 0.37168294191360474, + "learning_rate": 9.580776670077471e-05, + "loss": 1.0442, + "step": 1761 + }, + { + "epoch": 0.15743739808340965, + "grad_norm": 0.48139792680740356, + "learning_rate": 9.580196458032836e-05, + "loss": 1.1097, + "step": 1762 + }, + { + "epoch": 0.15752674961467153, + "grad_norm": 0.49613094329833984, + "learning_rate": 9.579615862347924e-05, + "loss": 1.0116, + "step": 1763 + }, + { + "epoch": 0.15761610114593338, + "grad_norm": 0.3607613742351532, + "learning_rate": 9.579034883071368e-05, + "loss": 1.0563, + "step": 1764 + }, + { + "epoch": 0.15770545267719527, + "grad_norm": 0.5845707654953003, + "learning_rate": 9.578453520251833e-05, + "loss": 1.0129, + "step": 1765 + }, + { + "epoch": 0.15779480420845712, + "grad_norm": 0.4461520314216614, + "learning_rate": 9.577871773938011e-05, + "loss": 0.99, + "step": 1766 + }, + { + "epoch": 0.15788415573971898, + "grad_norm": 0.4064408242702484, + "learning_rate": 9.577289644178634e-05, + "loss": 1.0397, + "step": 1767 + }, + { + "epoch": 0.15797350727098086, + "grad_norm": 0.475175678730011, + "learning_rate": 9.576707131022458e-05, + "loss": 0.9823, + "step": 1768 + }, + { + "epoch": 0.15806285880224272, + "grad_norm": 0.5856608152389526, + "learning_rate": 9.576124234518275e-05, + "loss": 0.9467, + "step": 1769 + }, + { + "epoch": 0.1581522103335046, + "grad_norm": 0.44236186146736145, + "learning_rate": 9.57554095471491e-05, + "loss": 0.9745, + "step": 1770 + }, + { + "epoch": 0.15824156186476646, + "grad_norm": 0.38273531198501587, + "learning_rate": 9.574957291661218e-05, + "loss": 1.0545, + "step": 1771 + }, + { + "epoch": 0.1583309133960283, + "grad_norm": 0.37142929434776306, + "learning_rate": 9.574373245406086e-05, + "loss": 0.9557, + "step": 1772 + }, + { + "epoch": 0.1584202649272902, + "grad_norm": 0.4939393997192383, + "learning_rate": 9.573788815998437e-05, + "loss": 1.027, + "step": 1773 + }, + { + "epoch": 0.15850961645855205, + "grad_norm": 0.5472790598869324, + "learning_rate": 9.573204003487221e-05, + "loss": 0.987, + "step": 1774 + }, + { + "epoch": 0.15859896798981393, + "grad_norm": 0.44142383337020874, + "learning_rate": 9.572618807921423e-05, + "loss": 1.0675, + "step": 1775 + }, + { + "epoch": 0.1586883195210758, + "grad_norm": 0.4367615878582001, + "learning_rate": 9.572033229350059e-05, + "loss": 1.0437, + "step": 1776 + }, + { + "epoch": 0.15877767105233767, + "grad_norm": 0.38631755113601685, + "learning_rate": 9.571447267822178e-05, + "loss": 1.0157, + "step": 1777 + }, + { + "epoch": 0.15886702258359953, + "grad_norm": 0.5570868849754333, + "learning_rate": 9.57086092338686e-05, + "loss": 0.9977, + "step": 1778 + }, + { + "epoch": 0.15895637411486138, + "grad_norm": 0.4448052942752838, + "learning_rate": 9.570274196093217e-05, + "loss": 1.0383, + "step": 1779 + }, + { + "epoch": 0.15904572564612326, + "grad_norm": 0.46690064668655396, + "learning_rate": 9.569687085990395e-05, + "loss": 1.0864, + "step": 1780 + }, + { + "epoch": 0.15913507717738512, + "grad_norm": 0.411737322807312, + "learning_rate": 9.569099593127571e-05, + "loss": 0.9839, + "step": 1781 + }, + { + "epoch": 0.159224428708647, + "grad_norm": 0.4243115484714508, + "learning_rate": 9.568511717553952e-05, + "loss": 0.9728, + "step": 1782 + }, + { + "epoch": 0.15931378023990886, + "grad_norm": 0.36506015062332153, + "learning_rate": 9.567923459318782e-05, + "loss": 1.0646, + "step": 1783 + }, + { + "epoch": 0.15940313177117074, + "grad_norm": 0.47241315245628357, + "learning_rate": 9.56733481847133e-05, + "loss": 1.0306, + "step": 1784 + }, + { + "epoch": 0.1594924833024326, + "grad_norm": 0.4867679178714752, + "learning_rate": 9.566745795060901e-05, + "loss": 0.9727, + "step": 1785 + }, + { + "epoch": 0.15958183483369445, + "grad_norm": 0.47150614857673645, + "learning_rate": 9.566156389136835e-05, + "loss": 1.0212, + "step": 1786 + }, + { + "epoch": 0.15967118636495634, + "grad_norm": 0.4018579423427582, + "learning_rate": 9.5655666007485e-05, + "loss": 1.0142, + "step": 1787 + }, + { + "epoch": 0.1597605378962182, + "grad_norm": 0.4174135625362396, + "learning_rate": 9.564976429945298e-05, + "loss": 1.0768, + "step": 1788 + }, + { + "epoch": 0.15984988942748007, + "grad_norm": 0.5170420408248901, + "learning_rate": 9.56438587677666e-05, + "loss": 0.9997, + "step": 1789 + }, + { + "epoch": 0.15993924095874193, + "grad_norm": 0.4216405153274536, + "learning_rate": 9.563794941292051e-05, + "loss": 1.0355, + "step": 1790 + }, + { + "epoch": 0.16002859249000378, + "grad_norm": 0.431983083486557, + "learning_rate": 9.563203623540969e-05, + "loss": 1.0086, + "step": 1791 + }, + { + "epoch": 0.16011794402126567, + "grad_norm": 0.45916739106178284, + "learning_rate": 9.562611923572944e-05, + "loss": 1.0688, + "step": 1792 + }, + { + "epoch": 0.16020729555252752, + "grad_norm": 0.45291900634765625, + "learning_rate": 9.562019841437537e-05, + "loss": 1.057, + "step": 1793 + }, + { + "epoch": 0.1602966470837894, + "grad_norm": 0.4525638520717621, + "learning_rate": 9.56142737718434e-05, + "loss": 1.1029, + "step": 1794 + }, + { + "epoch": 0.16038599861505126, + "grad_norm": 0.4891865849494934, + "learning_rate": 9.560834530862979e-05, + "loss": 1.0798, + "step": 1795 + }, + { + "epoch": 0.16047535014631314, + "grad_norm": 0.3669840097427368, + "learning_rate": 9.560241302523109e-05, + "loss": 1.0062, + "step": 1796 + }, + { + "epoch": 0.160564701677575, + "grad_norm": 0.4095466732978821, + "learning_rate": 9.559647692214424e-05, + "loss": 0.9993, + "step": 1797 + }, + { + "epoch": 0.16065405320883686, + "grad_norm": 0.5019549131393433, + "learning_rate": 9.559053699986642e-05, + "loss": 1.0768, + "step": 1798 + }, + { + "epoch": 0.16074340474009874, + "grad_norm": 0.40123990178108215, + "learning_rate": 9.558459325889515e-05, + "loss": 1.0714, + "step": 1799 + }, + { + "epoch": 0.1608327562713606, + "grad_norm": 0.4176787734031677, + "learning_rate": 9.557864569972832e-05, + "loss": 1.0862, + "step": 1800 + }, + { + "epoch": 0.16092210780262248, + "grad_norm": 0.4730657935142517, + "learning_rate": 9.557269432286407e-05, + "loss": 1.0686, + "step": 1801 + }, + { + "epoch": 0.16101145933388433, + "grad_norm": 0.4306131899356842, + "learning_rate": 9.55667391288009e-05, + "loss": 0.9852, + "step": 1802 + }, + { + "epoch": 0.1611008108651462, + "grad_norm": 0.40398168563842773, + "learning_rate": 9.556078011803762e-05, + "loss": 1.003, + "step": 1803 + }, + { + "epoch": 0.16119016239640807, + "grad_norm": 0.3656451106071472, + "learning_rate": 9.555481729107336e-05, + "loss": 1.0516, + "step": 1804 + }, + { + "epoch": 0.16127951392766993, + "grad_norm": 0.5433897376060486, + "learning_rate": 9.554885064840758e-05, + "loss": 0.9463, + "step": 1805 + }, + { + "epoch": 0.1613688654589318, + "grad_norm": 0.3712410628795624, + "learning_rate": 9.554288019054003e-05, + "loss": 1.0896, + "step": 1806 + }, + { + "epoch": 0.16145821699019366, + "grad_norm": 0.4026222825050354, + "learning_rate": 9.553690591797082e-05, + "loss": 1.1181, + "step": 1807 + }, + { + "epoch": 0.16154756852145555, + "grad_norm": 0.45911315083503723, + "learning_rate": 9.553092783120034e-05, + "loss": 0.9981, + "step": 1808 + }, + { + "epoch": 0.1616369200527174, + "grad_norm": 0.44033879041671753, + "learning_rate": 9.552494593072935e-05, + "loss": 1.0534, + "step": 1809 + }, + { + "epoch": 0.16172627158397926, + "grad_norm": 0.42678460478782654, + "learning_rate": 9.551896021705886e-05, + "loss": 1.0123, + "step": 1810 + }, + { + "epoch": 0.16181562311524114, + "grad_norm": 0.48469212651252747, + "learning_rate": 9.551297069069027e-05, + "loss": 0.9846, + "step": 1811 + }, + { + "epoch": 0.161904974646503, + "grad_norm": 0.4134121537208557, + "learning_rate": 9.550697735212523e-05, + "loss": 1.1542, + "step": 1812 + }, + { + "epoch": 0.16199432617776488, + "grad_norm": 0.44429683685302734, + "learning_rate": 9.550098020186579e-05, + "loss": 1.0126, + "step": 1813 + }, + { + "epoch": 0.16208367770902674, + "grad_norm": 0.4219866096973419, + "learning_rate": 9.549497924041424e-05, + "loss": 1.0309, + "step": 1814 + }, + { + "epoch": 0.16217302924028862, + "grad_norm": 0.38047802448272705, + "learning_rate": 9.548897446827322e-05, + "loss": 1.0875, + "step": 1815 + }, + { + "epoch": 0.16226238077155047, + "grad_norm": 0.37720173597335815, + "learning_rate": 9.548296588594575e-05, + "loss": 1.0008, + "step": 1816 + }, + { + "epoch": 0.16235173230281233, + "grad_norm": 0.4290127754211426, + "learning_rate": 9.547695349393504e-05, + "loss": 1.0353, + "step": 1817 + }, + { + "epoch": 0.1624410838340742, + "grad_norm": 0.4377609193325043, + "learning_rate": 9.547093729274474e-05, + "loss": 1.0352, + "step": 1818 + }, + { + "epoch": 0.16253043536533607, + "grad_norm": 0.3721230924129486, + "learning_rate": 9.546491728287876e-05, + "loss": 1.0833, + "step": 1819 + }, + { + "epoch": 0.16261978689659795, + "grad_norm": 0.43184563517570496, + "learning_rate": 9.545889346484134e-05, + "loss": 0.9966, + "step": 1820 + }, + { + "epoch": 0.1627091384278598, + "grad_norm": 0.41010618209838867, + "learning_rate": 9.545286583913702e-05, + "loss": 0.9989, + "step": 1821 + }, + { + "epoch": 0.16279848995912166, + "grad_norm": 0.5316981077194214, + "learning_rate": 9.54468344062707e-05, + "loss": 0.9844, + "step": 1822 + }, + { + "epoch": 0.16288784149038354, + "grad_norm": 0.35896286368370056, + "learning_rate": 9.544079916674757e-05, + "loss": 1.0765, + "step": 1823 + }, + { + "epoch": 0.1629771930216454, + "grad_norm": 0.45125991106033325, + "learning_rate": 9.543476012107313e-05, + "loss": 1.1044, + "step": 1824 + }, + { + "epoch": 0.16306654455290728, + "grad_norm": 0.4255354404449463, + "learning_rate": 9.542871726975327e-05, + "loss": 1.0465, + "step": 1825 + }, + { + "epoch": 0.16315589608416914, + "grad_norm": 0.4363675117492676, + "learning_rate": 9.542267061329406e-05, + "loss": 1.0555, + "step": 1826 + }, + { + "epoch": 0.16324524761543102, + "grad_norm": 0.3892308175563812, + "learning_rate": 9.541662015220205e-05, + "loss": 1.055, + "step": 1827 + }, + { + "epoch": 0.16333459914669288, + "grad_norm": 0.43629783391952515, + "learning_rate": 9.541056588698397e-05, + "loss": 1.0117, + "step": 1828 + }, + { + "epoch": 0.16342395067795473, + "grad_norm": 0.454475998878479, + "learning_rate": 9.540450781814696e-05, + "loss": 1.0849, + "step": 1829 + }, + { + "epoch": 0.16351330220921662, + "grad_norm": 0.41698095202445984, + "learning_rate": 9.539844594619845e-05, + "loss": 1.0137, + "step": 1830 + }, + { + "epoch": 0.16360265374047847, + "grad_norm": 0.41076821088790894, + "learning_rate": 9.539238027164619e-05, + "loss": 0.9891, + "step": 1831 + }, + { + "epoch": 0.16369200527174035, + "grad_norm": 0.4605240821838379, + "learning_rate": 9.538631079499823e-05, + "loss": 0.9447, + "step": 1832 + }, + { + "epoch": 0.1637813568030022, + "grad_norm": 0.36705470085144043, + "learning_rate": 9.538023751676294e-05, + "loss": 1.023, + "step": 1833 + }, + { + "epoch": 0.16387070833426406, + "grad_norm": 0.44232022762298584, + "learning_rate": 9.537416043744905e-05, + "loss": 0.9757, + "step": 1834 + }, + { + "epoch": 0.16396005986552595, + "grad_norm": 0.41922441124916077, + "learning_rate": 9.536807955756557e-05, + "loss": 1.0834, + "step": 1835 + }, + { + "epoch": 0.1640494113967878, + "grad_norm": 0.44340333342552185, + "learning_rate": 9.536199487762182e-05, + "loss": 0.9799, + "step": 1836 + }, + { + "epoch": 0.1641387629280497, + "grad_norm": 0.5596421360969543, + "learning_rate": 9.53559063981275e-05, + "loss": 0.921, + "step": 1837 + }, + { + "epoch": 0.16422811445931154, + "grad_norm": 0.3959425091743469, + "learning_rate": 9.534981411959255e-05, + "loss": 1.0426, + "step": 1838 + }, + { + "epoch": 0.16431746599057342, + "grad_norm": 0.431082546710968, + "learning_rate": 9.534371804252728e-05, + "loss": 1.1178, + "step": 1839 + }, + { + "epoch": 0.16440681752183528, + "grad_norm": 0.42235735058784485, + "learning_rate": 9.533761816744228e-05, + "loss": 1.0315, + "step": 1840 + }, + { + "epoch": 0.16449616905309714, + "grad_norm": 0.4385235905647278, + "learning_rate": 9.533151449484851e-05, + "loss": 1.0135, + "step": 1841 + }, + { + "epoch": 0.16458552058435902, + "grad_norm": 0.41497084498405457, + "learning_rate": 9.53254070252572e-05, + "loss": 1.0006, + "step": 1842 + }, + { + "epoch": 0.16467487211562087, + "grad_norm": 0.4509193003177643, + "learning_rate": 9.531929575917991e-05, + "loss": 0.9943, + "step": 1843 + }, + { + "epoch": 0.16476422364688276, + "grad_norm": 0.45146405696868896, + "learning_rate": 9.531318069712854e-05, + "loss": 0.9926, + "step": 1844 + }, + { + "epoch": 0.1648535751781446, + "grad_norm": 0.43658965826034546, + "learning_rate": 9.530706183961526e-05, + "loss": 0.9999, + "step": 1845 + }, + { + "epoch": 0.1649429267094065, + "grad_norm": 0.4209139347076416, + "learning_rate": 9.530093918715264e-05, + "loss": 1.0795, + "step": 1846 + }, + { + "epoch": 0.16503227824066835, + "grad_norm": 0.39283883571624756, + "learning_rate": 9.529481274025347e-05, + "loss": 1.101, + "step": 1847 + }, + { + "epoch": 0.1651216297719302, + "grad_norm": 0.5160419344902039, + "learning_rate": 9.528868249943095e-05, + "loss": 0.9985, + "step": 1848 + }, + { + "epoch": 0.1652109813031921, + "grad_norm": 0.5293583869934082, + "learning_rate": 9.528254846519851e-05, + "loss": 0.9466, + "step": 1849 + }, + { + "epoch": 0.16530033283445394, + "grad_norm": 0.4382696747779846, + "learning_rate": 9.527641063806996e-05, + "loss": 1.0173, + "step": 1850 + }, + { + "epoch": 0.16538968436571583, + "grad_norm": 0.4551234841346741, + "learning_rate": 9.52702690185594e-05, + "loss": 0.9932, + "step": 1851 + }, + { + "epoch": 0.16547903589697768, + "grad_norm": 0.42946553230285645, + "learning_rate": 9.526412360718127e-05, + "loss": 1.0688, + "step": 1852 + }, + { + "epoch": 0.16556838742823954, + "grad_norm": 0.5565560460090637, + "learning_rate": 9.525797440445031e-05, + "loss": 1.024, + "step": 1853 + }, + { + "epoch": 0.16565773895950142, + "grad_norm": 0.483884334564209, + "learning_rate": 9.525182141088159e-05, + "loss": 0.9546, + "step": 1854 + }, + { + "epoch": 0.16574709049076328, + "grad_norm": 0.4862859547138214, + "learning_rate": 9.524566462699045e-05, + "loss": 0.9695, + "step": 1855 + }, + { + "epoch": 0.16583644202202516, + "grad_norm": 0.5029091238975525, + "learning_rate": 9.523950405329262e-05, + "loss": 0.9478, + "step": 1856 + }, + { + "epoch": 0.16592579355328702, + "grad_norm": 0.6389296650886536, + "learning_rate": 9.523333969030413e-05, + "loss": 1.0239, + "step": 1857 + }, + { + "epoch": 0.1660151450845489, + "grad_norm": 0.39252007007598877, + "learning_rate": 9.522717153854125e-05, + "loss": 1.1161, + "step": 1858 + }, + { + "epoch": 0.16610449661581075, + "grad_norm": 0.45961377024650574, + "learning_rate": 9.522099959852071e-05, + "loss": 1.0234, + "step": 1859 + }, + { + "epoch": 0.1661938481470726, + "grad_norm": 0.3859967291355133, + "learning_rate": 9.52148238707594e-05, + "loss": 1.0622, + "step": 1860 + }, + { + "epoch": 0.1662831996783345, + "grad_norm": 0.48423507809638977, + "learning_rate": 9.520864435577466e-05, + "loss": 1.0117, + "step": 1861 + }, + { + "epoch": 0.16637255120959635, + "grad_norm": 0.4309901297092438, + "learning_rate": 9.520246105408403e-05, + "loss": 1.0523, + "step": 1862 + }, + { + "epoch": 0.16646190274085823, + "grad_norm": 0.487351655960083, + "learning_rate": 9.519627396620549e-05, + "loss": 0.9941, + "step": 1863 + }, + { + "epoch": 0.1665512542721201, + "grad_norm": 0.44148099422454834, + "learning_rate": 9.519008309265724e-05, + "loss": 1.0301, + "step": 1864 + }, + { + "epoch": 0.16664060580338197, + "grad_norm": 0.4186917245388031, + "learning_rate": 9.518388843395786e-05, + "loss": 0.996, + "step": 1865 + }, + { + "epoch": 0.16672995733464382, + "grad_norm": 0.4065192937850952, + "learning_rate": 9.517768999062617e-05, + "loss": 1.0351, + "step": 1866 + }, + { + "epoch": 0.16681930886590568, + "grad_norm": 0.402235209941864, + "learning_rate": 9.51714877631814e-05, + "loss": 1.0227, + "step": 1867 + }, + { + "epoch": 0.16690866039716756, + "grad_norm": 0.4134058952331543, + "learning_rate": 9.516528175214303e-05, + "loss": 1.0518, + "step": 1868 + }, + { + "epoch": 0.16699801192842942, + "grad_norm": 0.39137229323387146, + "learning_rate": 9.515907195803088e-05, + "loss": 1.0648, + "step": 1869 + }, + { + "epoch": 0.1670873634596913, + "grad_norm": 0.43055975437164307, + "learning_rate": 9.51528583813651e-05, + "loss": 0.9951, + "step": 1870 + }, + { + "epoch": 0.16717671499095316, + "grad_norm": 0.45151713490486145, + "learning_rate": 9.514664102266615e-05, + "loss": 1.0613, + "step": 1871 + }, + { + "epoch": 0.167266066522215, + "grad_norm": 0.45818963646888733, + "learning_rate": 9.514041988245477e-05, + "loss": 1.0874, + "step": 1872 + }, + { + "epoch": 0.1673554180534769, + "grad_norm": 0.38174670934677124, + "learning_rate": 9.513419496125206e-05, + "loss": 1.0888, + "step": 1873 + }, + { + "epoch": 0.16744476958473875, + "grad_norm": 0.4613979160785675, + "learning_rate": 9.512796625957943e-05, + "loss": 0.9875, + "step": 1874 + }, + { + "epoch": 0.16753412111600063, + "grad_norm": 0.49428126215934753, + "learning_rate": 9.512173377795859e-05, + "loss": 0.9716, + "step": 1875 + }, + { + "epoch": 0.1676234726472625, + "grad_norm": 0.5154160261154175, + "learning_rate": 9.511549751691159e-05, + "loss": 0.9448, + "step": 1876 + }, + { + "epoch": 0.16771282417852437, + "grad_norm": 0.4421161711215973, + "learning_rate": 9.510925747696077e-05, + "loss": 1.009, + "step": 1877 + }, + { + "epoch": 0.16780217570978623, + "grad_norm": 0.5333375930786133, + "learning_rate": 9.510301365862882e-05, + "loss": 1.0682, + "step": 1878 + }, + { + "epoch": 0.16789152724104808, + "grad_norm": 0.35133177042007446, + "learning_rate": 9.509676606243869e-05, + "loss": 1.0646, + "step": 1879 + }, + { + "epoch": 0.16798087877230997, + "grad_norm": 0.39709219336509705, + "learning_rate": 9.509051468891372e-05, + "loss": 1.0559, + "step": 1880 + }, + { + "epoch": 0.16807023030357182, + "grad_norm": 0.45971524715423584, + "learning_rate": 9.508425953857752e-05, + "loss": 1.0145, + "step": 1881 + }, + { + "epoch": 0.1681595818348337, + "grad_norm": 0.44881150126457214, + "learning_rate": 9.507800061195401e-05, + "loss": 1.0404, + "step": 1882 + }, + { + "epoch": 0.16824893336609556, + "grad_norm": 0.5401568412780762, + "learning_rate": 9.507173790956746e-05, + "loss": 0.9762, + "step": 1883 + }, + { + "epoch": 0.16833828489735742, + "grad_norm": 0.48548710346221924, + "learning_rate": 9.506547143194242e-05, + "loss": 1.0197, + "step": 1884 + }, + { + "epoch": 0.1684276364286193, + "grad_norm": 0.4435642659664154, + "learning_rate": 9.505920117960379e-05, + "loss": 1.0289, + "step": 1885 + }, + { + "epoch": 0.16851698795988115, + "grad_norm": 0.4363146722316742, + "learning_rate": 9.505292715307676e-05, + "loss": 1.0346, + "step": 1886 + }, + { + "epoch": 0.16860633949114304, + "grad_norm": 0.45172473788261414, + "learning_rate": 9.504664935288685e-05, + "loss": 1.0222, + "step": 1887 + }, + { + "epoch": 0.1686956910224049, + "grad_norm": 0.4839716851711273, + "learning_rate": 9.504036777955991e-05, + "loss": 1.0304, + "step": 1888 + }, + { + "epoch": 0.16878504255366678, + "grad_norm": 0.4805682599544525, + "learning_rate": 9.503408243362206e-05, + "loss": 1.0433, + "step": 1889 + }, + { + "epoch": 0.16887439408492863, + "grad_norm": 0.4286407232284546, + "learning_rate": 9.502779331559977e-05, + "loss": 1.059, + "step": 1890 + }, + { + "epoch": 0.16896374561619049, + "grad_norm": 0.39787065982818604, + "learning_rate": 9.502150042601985e-05, + "loss": 1.1139, + "step": 1891 + }, + { + "epoch": 0.16905309714745237, + "grad_norm": 0.42047420144081116, + "learning_rate": 9.501520376540936e-05, + "loss": 1.0147, + "step": 1892 + }, + { + "epoch": 0.16914244867871422, + "grad_norm": 0.4411332309246063, + "learning_rate": 9.500890333429573e-05, + "loss": 1.0576, + "step": 1893 + }, + { + "epoch": 0.1692318002099761, + "grad_norm": 0.43769532442092896, + "learning_rate": 9.500259913320668e-05, + "loss": 1.0455, + "step": 1894 + }, + { + "epoch": 0.16932115174123796, + "grad_norm": 0.4141598343849182, + "learning_rate": 9.499629116267026e-05, + "loss": 1.0782, + "step": 1895 + }, + { + "epoch": 0.16941050327249985, + "grad_norm": 0.4736109972000122, + "learning_rate": 9.498997942321483e-05, + "loss": 0.9884, + "step": 1896 + }, + { + "epoch": 0.1694998548037617, + "grad_norm": 0.38666075468063354, + "learning_rate": 9.498366391536907e-05, + "loss": 1.0701, + "step": 1897 + }, + { + "epoch": 0.16958920633502356, + "grad_norm": 0.4399631917476654, + "learning_rate": 9.497734463966196e-05, + "loss": 0.996, + "step": 1898 + }, + { + "epoch": 0.16967855786628544, + "grad_norm": 0.3586561977863312, + "learning_rate": 9.497102159662281e-05, + "loss": 1.0366, + "step": 1899 + }, + { + "epoch": 0.1697679093975473, + "grad_norm": 0.3949701189994812, + "learning_rate": 9.496469478678126e-05, + "loss": 1.0616, + "step": 1900 + }, + { + "epoch": 0.16985726092880918, + "grad_norm": 0.38532134890556335, + "learning_rate": 9.495836421066722e-05, + "loss": 1.0576, + "step": 1901 + }, + { + "epoch": 0.16994661246007103, + "grad_norm": 0.4000793993473053, + "learning_rate": 9.495202986881095e-05, + "loss": 1.0262, + "step": 1902 + }, + { + "epoch": 0.1700359639913329, + "grad_norm": 0.3810192048549652, + "learning_rate": 9.494569176174304e-05, + "loss": 1.0097, + "step": 1903 + }, + { + "epoch": 0.17012531552259477, + "grad_norm": 0.4256387948989868, + "learning_rate": 9.493934988999436e-05, + "loss": 1.0186, + "step": 1904 + }, + { + "epoch": 0.17021466705385663, + "grad_norm": 0.4718528389930725, + "learning_rate": 9.493300425409609e-05, + "loss": 0.9866, + "step": 1905 + }, + { + "epoch": 0.1703040185851185, + "grad_norm": 0.3986504077911377, + "learning_rate": 9.492665485457976e-05, + "loss": 1.0735, + "step": 1906 + }, + { + "epoch": 0.17039337011638037, + "grad_norm": 0.4253017008304596, + "learning_rate": 9.492030169197722e-05, + "loss": 1.0168, + "step": 1907 + }, + { + "epoch": 0.17048272164764225, + "grad_norm": 0.34923726320266724, + "learning_rate": 9.49139447668206e-05, + "loss": 1.1098, + "step": 1908 + }, + { + "epoch": 0.1705720731789041, + "grad_norm": 0.46308934688568115, + "learning_rate": 9.490758407964234e-05, + "loss": 0.9807, + "step": 1909 + }, + { + "epoch": 0.17066142471016596, + "grad_norm": 0.5132005214691162, + "learning_rate": 9.490121963097525e-05, + "loss": 1.0316, + "step": 1910 + }, + { + "epoch": 0.17075077624142784, + "grad_norm": 0.4535066485404968, + "learning_rate": 9.489485142135238e-05, + "loss": 0.9653, + "step": 1911 + }, + { + "epoch": 0.1708401277726897, + "grad_norm": 0.41993486881256104, + "learning_rate": 9.488847945130718e-05, + "loss": 1.0554, + "step": 1912 + }, + { + "epoch": 0.17092947930395158, + "grad_norm": 0.46821948885917664, + "learning_rate": 9.488210372137335e-05, + "loss": 0.9511, + "step": 1913 + }, + { + "epoch": 0.17101883083521344, + "grad_norm": 0.5106322765350342, + "learning_rate": 9.487572423208491e-05, + "loss": 0.9633, + "step": 1914 + }, + { + "epoch": 0.1711081823664753, + "grad_norm": 0.4303719997406006, + "learning_rate": 9.486934098397622e-05, + "loss": 1.0664, + "step": 1915 + }, + { + "epoch": 0.17119753389773718, + "grad_norm": 0.3766723573207855, + "learning_rate": 9.486295397758196e-05, + "loss": 1.0226, + "step": 1916 + }, + { + "epoch": 0.17128688542899903, + "grad_norm": 0.4431716799736023, + "learning_rate": 9.48565632134371e-05, + "loss": 1.0226, + "step": 1917 + }, + { + "epoch": 0.1713762369602609, + "grad_norm": 0.41526105999946594, + "learning_rate": 9.485016869207695e-05, + "loss": 1.0072, + "step": 1918 + }, + { + "epoch": 0.17146558849152277, + "grad_norm": 0.4168999195098877, + "learning_rate": 9.484377041403706e-05, + "loss": 1.0846, + "step": 1919 + }, + { + "epoch": 0.17155494002278465, + "grad_norm": 0.4972972571849823, + "learning_rate": 9.483736837985344e-05, + "loss": 1.0172, + "step": 1920 + }, + { + "epoch": 0.1716442915540465, + "grad_norm": 0.4009465277194977, + "learning_rate": 9.483096259006228e-05, + "loss": 1.04, + "step": 1921 + }, + { + "epoch": 0.17173364308530836, + "grad_norm": 0.37553495168685913, + "learning_rate": 9.482455304520013e-05, + "loss": 1.0406, + "step": 1922 + }, + { + "epoch": 0.17182299461657025, + "grad_norm": 0.37648624181747437, + "learning_rate": 9.481813974580386e-05, + "loss": 1.0371, + "step": 1923 + }, + { + "epoch": 0.1719123461478321, + "grad_norm": 0.3838346302509308, + "learning_rate": 9.481172269241067e-05, + "loss": 0.9912, + "step": 1924 + }, + { + "epoch": 0.17200169767909398, + "grad_norm": 0.4350638687610626, + "learning_rate": 9.480530188555805e-05, + "loss": 1.0651, + "step": 1925 + }, + { + "epoch": 0.17209104921035584, + "grad_norm": 0.46230390667915344, + "learning_rate": 9.47988773257838e-05, + "loss": 0.9876, + "step": 1926 + }, + { + "epoch": 0.17218040074161772, + "grad_norm": 0.41337013244628906, + "learning_rate": 9.479244901362605e-05, + "loss": 1.09, + "step": 1927 + }, + { + "epoch": 0.17226975227287958, + "grad_norm": 0.4298297166824341, + "learning_rate": 9.478601694962323e-05, + "loss": 1.1069, + "step": 1928 + }, + { + "epoch": 0.17235910380414143, + "grad_norm": 0.4469965398311615, + "learning_rate": 9.477958113431413e-05, + "loss": 1.0354, + "step": 1929 + }, + { + "epoch": 0.17244845533540332, + "grad_norm": 0.4125996530056, + "learning_rate": 9.477314156823779e-05, + "loss": 1.0402, + "step": 1930 + }, + { + "epoch": 0.17253780686666517, + "grad_norm": 0.44828668236732483, + "learning_rate": 9.476669825193359e-05, + "loss": 1.0471, + "step": 1931 + }, + { + "epoch": 0.17262715839792706, + "grad_norm": 0.4745740592479706, + "learning_rate": 9.476025118594124e-05, + "loss": 1.01, + "step": 1932 + }, + { + "epoch": 0.1727165099291889, + "grad_norm": 0.46582111716270447, + "learning_rate": 9.475380037080073e-05, + "loss": 1.0378, + "step": 1933 + }, + { + "epoch": 0.17280586146045077, + "grad_norm": 0.45007333159446716, + "learning_rate": 9.47473458070524e-05, + "loss": 1.0021, + "step": 1934 + }, + { + "epoch": 0.17289521299171265, + "grad_norm": 0.3929324448108673, + "learning_rate": 9.474088749523689e-05, + "loss": 0.9972, + "step": 1935 + }, + { + "epoch": 0.1729845645229745, + "grad_norm": 0.42300376296043396, + "learning_rate": 9.473442543589515e-05, + "loss": 1.0603, + "step": 1936 + }, + { + "epoch": 0.1730739160542364, + "grad_norm": 0.5826932191848755, + "learning_rate": 9.472795962956844e-05, + "loss": 0.9595, + "step": 1937 + }, + { + "epoch": 0.17316326758549824, + "grad_norm": 0.5283807516098022, + "learning_rate": 9.472149007679836e-05, + "loss": 0.9726, + "step": 1938 + }, + { + "epoch": 0.17325261911676013, + "grad_norm": 0.4322817921638489, + "learning_rate": 9.471501677812677e-05, + "loss": 0.9882, + "step": 1939 + }, + { + "epoch": 0.17334197064802198, + "grad_norm": 0.42819416522979736, + "learning_rate": 9.47085397340959e-05, + "loss": 1.0776, + "step": 1940 + }, + { + "epoch": 0.17343132217928384, + "grad_norm": 0.379272997379303, + "learning_rate": 9.470205894524829e-05, + "loss": 1.0924, + "step": 1941 + }, + { + "epoch": 0.17352067371054572, + "grad_norm": 0.4515071213245392, + "learning_rate": 9.469557441212674e-05, + "loss": 1.0024, + "step": 1942 + }, + { + "epoch": 0.17361002524180758, + "grad_norm": 0.4782482981681824, + "learning_rate": 9.468908613527441e-05, + "loss": 0.9794, + "step": 1943 + }, + { + "epoch": 0.17369937677306946, + "grad_norm": 0.3846750855445862, + "learning_rate": 9.468259411523476e-05, + "loss": 0.9786, + "step": 1944 + }, + { + "epoch": 0.1737887283043313, + "grad_norm": 0.4089057743549347, + "learning_rate": 9.467609835255158e-05, + "loss": 1.1039, + "step": 1945 + }, + { + "epoch": 0.17387807983559317, + "grad_norm": 0.5269849896430969, + "learning_rate": 9.466959884776894e-05, + "loss": 1.0726, + "step": 1946 + }, + { + "epoch": 0.17396743136685505, + "grad_norm": 0.43416985869407654, + "learning_rate": 9.466309560143126e-05, + "loss": 1.0341, + "step": 1947 + }, + { + "epoch": 0.1740567828981169, + "grad_norm": 0.41887369751930237, + "learning_rate": 9.465658861408324e-05, + "loss": 1.0519, + "step": 1948 + }, + { + "epoch": 0.1741461344293788, + "grad_norm": 0.47824710607528687, + "learning_rate": 9.465007788626993e-05, + "loss": 1.0775, + "step": 1949 + }, + { + "epoch": 0.17423548596064065, + "grad_norm": 0.3974583148956299, + "learning_rate": 9.464356341853666e-05, + "loss": 1.026, + "step": 1950 + }, + { + "epoch": 0.17432483749190253, + "grad_norm": 0.4442320764064789, + "learning_rate": 9.463704521142909e-05, + "loss": 1.0781, + "step": 1951 + }, + { + "epoch": 0.17441418902316438, + "grad_norm": 0.4253045618534088, + "learning_rate": 9.463052326549317e-05, + "loss": 1.0449, + "step": 1952 + }, + { + "epoch": 0.17450354055442624, + "grad_norm": 0.40720415115356445, + "learning_rate": 9.462399758127521e-05, + "loss": 1.0281, + "step": 1953 + }, + { + "epoch": 0.17459289208568812, + "grad_norm": 0.46845048666000366, + "learning_rate": 9.46174681593218e-05, + "loss": 1.0313, + "step": 1954 + }, + { + "epoch": 0.17468224361694998, + "grad_norm": 0.4253866374492645, + "learning_rate": 9.461093500017984e-05, + "loss": 1.0191, + "step": 1955 + }, + { + "epoch": 0.17477159514821186, + "grad_norm": 0.41982749104499817, + "learning_rate": 9.460439810439655e-05, + "loss": 1.0332, + "step": 1956 + }, + { + "epoch": 0.17486094667947372, + "grad_norm": 0.45301470160484314, + "learning_rate": 9.459785747251948e-05, + "loss": 0.9582, + "step": 1957 + }, + { + "epoch": 0.1749502982107356, + "grad_norm": 0.44690030813217163, + "learning_rate": 9.459131310509646e-05, + "loss": 1.0285, + "step": 1958 + }, + { + "epoch": 0.17503964974199746, + "grad_norm": 0.5572661757469177, + "learning_rate": 9.458476500267566e-05, + "loss": 0.9837, + "step": 1959 + }, + { + "epoch": 0.1751290012732593, + "grad_norm": 0.3876509666442871, + "learning_rate": 9.457821316580555e-05, + "loss": 1.058, + "step": 1960 + }, + { + "epoch": 0.1752183528045212, + "grad_norm": 0.4823840856552124, + "learning_rate": 9.457165759503493e-05, + "loss": 0.9689, + "step": 1961 + }, + { + "epoch": 0.17530770433578305, + "grad_norm": 0.4866883456707001, + "learning_rate": 9.456509829091287e-05, + "loss": 1.0991, + "step": 1962 + }, + { + "epoch": 0.17539705586704493, + "grad_norm": 0.47182294726371765, + "learning_rate": 9.455853525398881e-05, + "loss": 0.898, + "step": 1963 + }, + { + "epoch": 0.1754864073983068, + "grad_norm": 0.4766868054866791, + "learning_rate": 9.455196848481244e-05, + "loss": 1.017, + "step": 1964 + }, + { + "epoch": 0.17557575892956864, + "grad_norm": 0.47620826959609985, + "learning_rate": 9.454539798393385e-05, + "loss": 1.117, + "step": 1965 + }, + { + "epoch": 0.17566511046083053, + "grad_norm": 0.391517698764801, + "learning_rate": 9.453882375190335e-05, + "loss": 1.0444, + "step": 1966 + }, + { + "epoch": 0.17575446199209238, + "grad_norm": 0.4250124394893646, + "learning_rate": 9.45322457892716e-05, + "loss": 1.1342, + "step": 1967 + }, + { + "epoch": 0.17584381352335426, + "grad_norm": 0.4004350006580353, + "learning_rate": 9.45256640965896e-05, + "loss": 1.0022, + "step": 1968 + }, + { + "epoch": 0.17593316505461612, + "grad_norm": 0.4620151221752167, + "learning_rate": 9.451907867440862e-05, + "loss": 1.0139, + "step": 1969 + }, + { + "epoch": 0.176022516585878, + "grad_norm": 0.43265095353126526, + "learning_rate": 9.451248952328025e-05, + "loss": 1.1093, + "step": 1970 + }, + { + "epoch": 0.17611186811713986, + "grad_norm": 0.5694372653961182, + "learning_rate": 9.450589664375643e-05, + "loss": 0.9312, + "step": 1971 + }, + { + "epoch": 0.1762012196484017, + "grad_norm": 0.4211016297340393, + "learning_rate": 9.449930003638935e-05, + "loss": 1.0137, + "step": 1972 + }, + { + "epoch": 0.1762905711796636, + "grad_norm": 0.44613519310951233, + "learning_rate": 9.449269970173158e-05, + "loss": 0.969, + "step": 1973 + }, + { + "epoch": 0.17637992271092545, + "grad_norm": 0.3986136019229889, + "learning_rate": 9.448609564033593e-05, + "loss": 1.1211, + "step": 1974 + }, + { + "epoch": 0.17646927424218734, + "grad_norm": 0.5278311967849731, + "learning_rate": 9.44794878527556e-05, + "loss": 0.9798, + "step": 1975 + }, + { + "epoch": 0.1765586257734492, + "grad_norm": 0.5130774974822998, + "learning_rate": 9.447287633954405e-05, + "loss": 1.0396, + "step": 1976 + }, + { + "epoch": 0.17664797730471105, + "grad_norm": 0.483941912651062, + "learning_rate": 9.446626110125505e-05, + "loss": 0.9771, + "step": 1977 + }, + { + "epoch": 0.17673732883597293, + "grad_norm": 0.46051788330078125, + "learning_rate": 9.445964213844269e-05, + "loss": 1.0256, + "step": 1978 + }, + { + "epoch": 0.17682668036723478, + "grad_norm": 0.5272265076637268, + "learning_rate": 9.445301945166143e-05, + "loss": 0.9884, + "step": 1979 + }, + { + "epoch": 0.17691603189849667, + "grad_norm": 0.4559291899204254, + "learning_rate": 9.444639304146593e-05, + "loss": 0.9899, + "step": 1980 + }, + { + "epoch": 0.17700538342975852, + "grad_norm": 0.42420053482055664, + "learning_rate": 9.443976290841126e-05, + "loss": 0.9969, + "step": 1981 + }, + { + "epoch": 0.1770947349610204, + "grad_norm": 0.5223949551582336, + "learning_rate": 9.443312905305274e-05, + "loss": 1.0163, + "step": 1982 + }, + { + "epoch": 0.17718408649228226, + "grad_norm": 0.4252959191799164, + "learning_rate": 9.442649147594606e-05, + "loss": 1.0653, + "step": 1983 + }, + { + "epoch": 0.17727343802354412, + "grad_norm": 0.40141063928604126, + "learning_rate": 9.441985017764715e-05, + "loss": 1.0184, + "step": 1984 + }, + { + "epoch": 0.177362789554806, + "grad_norm": 0.4690384566783905, + "learning_rate": 9.44132051587123e-05, + "loss": 0.984, + "step": 1985 + }, + { + "epoch": 0.17745214108606785, + "grad_norm": 0.412387490272522, + "learning_rate": 9.440655641969814e-05, + "loss": 1.0184, + "step": 1986 + }, + { + "epoch": 0.17754149261732974, + "grad_norm": 0.37352073192596436, + "learning_rate": 9.43999039611615e-05, + "loss": 1.0471, + "step": 1987 + }, + { + "epoch": 0.1776308441485916, + "grad_norm": 0.4083310663700104, + "learning_rate": 9.439324778365965e-05, + "loss": 1.043, + "step": 1988 + }, + { + "epoch": 0.17772019567985348, + "grad_norm": 0.41002601385116577, + "learning_rate": 9.43865878877501e-05, + "loss": 0.9932, + "step": 1989 + }, + { + "epoch": 0.17780954721111533, + "grad_norm": 0.5643458962440491, + "learning_rate": 9.437992427399069e-05, + "loss": 0.9207, + "step": 1990 + }, + { + "epoch": 0.1778988987423772, + "grad_norm": 0.43547579646110535, + "learning_rate": 9.437325694293957e-05, + "loss": 1.0288, + "step": 1991 + }, + { + "epoch": 0.17798825027363907, + "grad_norm": 0.41671237349510193, + "learning_rate": 9.43665858951552e-05, + "loss": 1.0792, + "step": 1992 + }, + { + "epoch": 0.17807760180490093, + "grad_norm": 0.3886220455169678, + "learning_rate": 9.435991113119634e-05, + "loss": 1.0739, + "step": 1993 + }, + { + "epoch": 0.1781669533361628, + "grad_norm": 0.42051705718040466, + "learning_rate": 9.435323265162207e-05, + "loss": 0.9779, + "step": 1994 + }, + { + "epoch": 0.17825630486742466, + "grad_norm": 0.4543735086917877, + "learning_rate": 9.43465504569918e-05, + "loss": 0.9808, + "step": 1995 + }, + { + "epoch": 0.17834565639868652, + "grad_norm": 0.43816715478897095, + "learning_rate": 9.433986454786523e-05, + "loss": 1.028, + "step": 1996 + }, + { + "epoch": 0.1784350079299484, + "grad_norm": 0.3944636881351471, + "learning_rate": 9.433317492480238e-05, + "loss": 1.0616, + "step": 1997 + }, + { + "epoch": 0.17852435946121026, + "grad_norm": 0.39649829268455505, + "learning_rate": 9.432648158836357e-05, + "loss": 0.9368, + "step": 1998 + }, + { + "epoch": 0.17861371099247214, + "grad_norm": 0.422755628824234, + "learning_rate": 9.431978453910943e-05, + "loss": 1.065, + "step": 1999 + }, + { + "epoch": 0.178703062523734, + "grad_norm": 0.40051138401031494, + "learning_rate": 9.431308377760094e-05, + "loss": 1.0022, + "step": 2000 + }, + { + "epoch": 0.17879241405499588, + "grad_norm": 0.514773428440094, + "learning_rate": 9.430637930439933e-05, + "loss": 0.9755, + "step": 2001 + }, + { + "epoch": 0.17888176558625773, + "grad_norm": 0.36524417996406555, + "learning_rate": 9.429967112006619e-05, + "loss": 1.0829, + "step": 2002 + }, + { + "epoch": 0.1789711171175196, + "grad_norm": 0.3744485378265381, + "learning_rate": 9.429295922516337e-05, + "loss": 1.0275, + "step": 2003 + }, + { + "epoch": 0.17906046864878147, + "grad_norm": 0.4923553168773651, + "learning_rate": 9.42862436202531e-05, + "loss": 0.9636, + "step": 2004 + }, + { + "epoch": 0.17914982018004333, + "grad_norm": 0.3743354082107544, + "learning_rate": 9.427952430589789e-05, + "loss": 1.0459, + "step": 2005 + }, + { + "epoch": 0.1792391717113052, + "grad_norm": 0.4594390392303467, + "learning_rate": 9.42728012826605e-05, + "loss": 1.0414, + "step": 2006 + }, + { + "epoch": 0.17932852324256707, + "grad_norm": 0.4210093915462494, + "learning_rate": 9.42660745511041e-05, + "loss": 1.088, + "step": 2007 + }, + { + "epoch": 0.17941787477382895, + "grad_norm": 0.4122314751148224, + "learning_rate": 9.425934411179211e-05, + "loss": 0.9287, + "step": 2008 + }, + { + "epoch": 0.1795072263050908, + "grad_norm": 0.41803067922592163, + "learning_rate": 9.425260996528829e-05, + "loss": 1.0501, + "step": 2009 + }, + { + "epoch": 0.17959657783635266, + "grad_norm": 0.4542330503463745, + "learning_rate": 9.424587211215669e-05, + "loss": 1.0908, + "step": 2010 + }, + { + "epoch": 0.17968592936761454, + "grad_norm": 0.4137341380119324, + "learning_rate": 9.423913055296165e-05, + "loss": 0.9796, + "step": 2011 + }, + { + "epoch": 0.1797752808988764, + "grad_norm": 0.3948180079460144, + "learning_rate": 9.42323852882679e-05, + "loss": 1.0888, + "step": 2012 + }, + { + "epoch": 0.17986463243013828, + "grad_norm": 0.398710161447525, + "learning_rate": 9.422563631864038e-05, + "loss": 1.0466, + "step": 2013 + }, + { + "epoch": 0.17995398396140014, + "grad_norm": 0.42401716113090515, + "learning_rate": 9.421888364464442e-05, + "loss": 1.0015, + "step": 2014 + }, + { + "epoch": 0.180043335492662, + "grad_norm": 0.45904120802879333, + "learning_rate": 9.42121272668456e-05, + "loss": 0.9718, + "step": 2015 + }, + { + "epoch": 0.18013268702392388, + "grad_norm": 0.45379403233528137, + "learning_rate": 9.420536718580986e-05, + "loss": 1.0008, + "step": 2016 + }, + { + "epoch": 0.18022203855518573, + "grad_norm": 0.4098423421382904, + "learning_rate": 9.419860340210342e-05, + "loss": 1.1183, + "step": 2017 + }, + { + "epoch": 0.18031139008644761, + "grad_norm": 0.3899945616722107, + "learning_rate": 9.419183591629284e-05, + "loss": 1.0653, + "step": 2018 + }, + { + "epoch": 0.18040074161770947, + "grad_norm": 0.44282397627830505, + "learning_rate": 9.418506472894492e-05, + "loss": 1.0232, + "step": 2019 + }, + { + "epoch": 0.18049009314897135, + "grad_norm": 0.3919029235839844, + "learning_rate": 9.417828984062687e-05, + "loss": 1.031, + "step": 2020 + }, + { + "epoch": 0.1805794446802332, + "grad_norm": 0.3580346703529358, + "learning_rate": 9.417151125190614e-05, + "loss": 1.1478, + "step": 2021 + }, + { + "epoch": 0.18066879621149506, + "grad_norm": 0.4946131706237793, + "learning_rate": 9.416472896335051e-05, + "loss": 0.9887, + "step": 2022 + }, + { + "epoch": 0.18075814774275695, + "grad_norm": 0.4798043966293335, + "learning_rate": 9.415794297552805e-05, + "loss": 0.9341, + "step": 2023 + }, + { + "epoch": 0.1808474992740188, + "grad_norm": 0.42261967062950134, + "learning_rate": 9.415115328900719e-05, + "loss": 1.063, + "step": 2024 + }, + { + "epoch": 0.18093685080528069, + "grad_norm": 0.4271705448627472, + "learning_rate": 9.414435990435663e-05, + "loss": 1.0327, + "step": 2025 + }, + { + "epoch": 0.18102620233654254, + "grad_norm": 0.394653856754303, + "learning_rate": 9.413756282214537e-05, + "loss": 1.1644, + "step": 2026 + }, + { + "epoch": 0.1811155538678044, + "grad_norm": 0.43346062302589417, + "learning_rate": 9.413076204294275e-05, + "loss": 1.0556, + "step": 2027 + }, + { + "epoch": 0.18120490539906628, + "grad_norm": 0.390903115272522, + "learning_rate": 9.412395756731843e-05, + "loss": 1.0185, + "step": 2028 + }, + { + "epoch": 0.18129425693032813, + "grad_norm": 0.3963910937309265, + "learning_rate": 9.41171493958423e-05, + "loss": 1.0377, + "step": 2029 + }, + { + "epoch": 0.18138360846159002, + "grad_norm": 0.4594999849796295, + "learning_rate": 9.41103375290847e-05, + "loss": 1.0494, + "step": 2030 + }, + { + "epoch": 0.18147295999285187, + "grad_norm": 0.42716097831726074, + "learning_rate": 9.410352196761612e-05, + "loss": 0.9836, + "step": 2031 + }, + { + "epoch": 0.18156231152411376, + "grad_norm": 0.43300801515579224, + "learning_rate": 9.409670271200745e-05, + "loss": 1.0358, + "step": 2032 + }, + { + "epoch": 0.1816516630553756, + "grad_norm": 0.3748253583908081, + "learning_rate": 9.408987976282993e-05, + "loss": 1.0185, + "step": 2033 + }, + { + "epoch": 0.18174101458663747, + "grad_norm": 0.4405686855316162, + "learning_rate": 9.408305312065498e-05, + "loss": 1.0303, + "step": 2034 + }, + { + "epoch": 0.18183036611789935, + "grad_norm": 0.4379626512527466, + "learning_rate": 9.407622278605445e-05, + "loss": 0.984, + "step": 2035 + }, + { + "epoch": 0.1819197176491612, + "grad_norm": 0.39666110277175903, + "learning_rate": 9.406938875960045e-05, + "loss": 1.1574, + "step": 2036 + }, + { + "epoch": 0.1820090691804231, + "grad_norm": 0.39308610558509827, + "learning_rate": 9.406255104186541e-05, + "loss": 1.0372, + "step": 2037 + }, + { + "epoch": 0.18209842071168494, + "grad_norm": 0.4402790069580078, + "learning_rate": 9.405570963342203e-05, + "loss": 1.102, + "step": 2038 + }, + { + "epoch": 0.18218777224294683, + "grad_norm": 0.3761218190193176, + "learning_rate": 9.404886453484336e-05, + "loss": 1.0392, + "step": 2039 + }, + { + "epoch": 0.18227712377420868, + "grad_norm": 0.62852942943573, + "learning_rate": 9.404201574670278e-05, + "loss": 0.9496, + "step": 2040 + }, + { + "epoch": 0.18236647530547054, + "grad_norm": 0.4357655644416809, + "learning_rate": 9.40351632695739e-05, + "loss": 1.0507, + "step": 2041 + }, + { + "epoch": 0.18245582683673242, + "grad_norm": 0.4296508729457855, + "learning_rate": 9.402830710403074e-05, + "loss": 1.0769, + "step": 2042 + }, + { + "epoch": 0.18254517836799428, + "grad_norm": 0.4572366774082184, + "learning_rate": 9.402144725064753e-05, + "loss": 0.9844, + "step": 2043 + }, + { + "epoch": 0.18263452989925616, + "grad_norm": 0.48715341091156006, + "learning_rate": 9.40145837099989e-05, + "loss": 0.9948, + "step": 2044 + }, + { + "epoch": 0.18272388143051801, + "grad_norm": 0.4071628749370575, + "learning_rate": 9.40077164826597e-05, + "loss": 1.0108, + "step": 2045 + }, + { + "epoch": 0.18281323296177987, + "grad_norm": 0.4973810017108917, + "learning_rate": 9.400084556920517e-05, + "loss": 1.0148, + "step": 2046 + }, + { + "epoch": 0.18290258449304175, + "grad_norm": 0.4013235867023468, + "learning_rate": 9.399397097021082e-05, + "loss": 0.9818, + "step": 2047 + }, + { + "epoch": 0.1829919360243036, + "grad_norm": 0.4264853596687317, + "learning_rate": 9.398709268625244e-05, + "loss": 1.0554, + "step": 2048 + }, + { + "epoch": 0.1830812875555655, + "grad_norm": 0.5234047174453735, + "learning_rate": 9.398021071790617e-05, + "loss": 1.0184, + "step": 2049 + }, + { + "epoch": 0.18317063908682735, + "grad_norm": 0.4011201560497284, + "learning_rate": 9.397332506574848e-05, + "loss": 1.0643, + "step": 2050 + }, + { + "epoch": 0.18325999061808923, + "grad_norm": 0.4270339906215668, + "learning_rate": 9.396643573035608e-05, + "loss": 1.0095, + "step": 2051 + }, + { + "epoch": 0.18334934214935109, + "grad_norm": 0.3894960880279541, + "learning_rate": 9.395954271230604e-05, + "loss": 1.0769, + "step": 2052 + }, + { + "epoch": 0.18343869368061294, + "grad_norm": 0.3954775035381317, + "learning_rate": 9.395264601217573e-05, + "loss": 1.0095, + "step": 2053 + }, + { + "epoch": 0.18352804521187482, + "grad_norm": 0.4338105618953705, + "learning_rate": 9.394574563054282e-05, + "loss": 0.9931, + "step": 2054 + }, + { + "epoch": 0.18361739674313668, + "grad_norm": 0.4074452519416809, + "learning_rate": 9.393884156798527e-05, + "loss": 1.0317, + "step": 2055 + }, + { + "epoch": 0.18370674827439856, + "grad_norm": 0.39676398038864136, + "learning_rate": 9.393193382508138e-05, + "loss": 0.9515, + "step": 2056 + }, + { + "epoch": 0.18379609980566042, + "grad_norm": 0.42293867468833923, + "learning_rate": 9.392502240240977e-05, + "loss": 1.1357, + "step": 2057 + }, + { + "epoch": 0.18388545133692227, + "grad_norm": 0.44433289766311646, + "learning_rate": 9.391810730054932e-05, + "loss": 1.0308, + "step": 2058 + }, + { + "epoch": 0.18397480286818416, + "grad_norm": 0.4930141270160675, + "learning_rate": 9.391118852007926e-05, + "loss": 1.0061, + "step": 2059 + }, + { + "epoch": 0.184064154399446, + "grad_norm": 0.3936423361301422, + "learning_rate": 9.390426606157907e-05, + "loss": 1.0441, + "step": 2060 + }, + { + "epoch": 0.1841535059307079, + "grad_norm": 0.5013602375984192, + "learning_rate": 9.389733992562863e-05, + "loss": 1.0648, + "step": 2061 + }, + { + "epoch": 0.18424285746196975, + "grad_norm": 0.4181208610534668, + "learning_rate": 9.389041011280808e-05, + "loss": 1.0172, + "step": 2062 + }, + { + "epoch": 0.18433220899323163, + "grad_norm": 0.3877794146537781, + "learning_rate": 9.388347662369782e-05, + "loss": 1.0403, + "step": 2063 + }, + { + "epoch": 0.1844215605244935, + "grad_norm": 0.4322667419910431, + "learning_rate": 9.387653945887864e-05, + "loss": 1.0691, + "step": 2064 + }, + { + "epoch": 0.18451091205575534, + "grad_norm": 0.4271637201309204, + "learning_rate": 9.386959861893158e-05, + "loss": 1.0122, + "step": 2065 + }, + { + "epoch": 0.18460026358701723, + "grad_norm": 0.3978729248046875, + "learning_rate": 9.386265410443802e-05, + "loss": 1.0317, + "step": 2066 + }, + { + "epoch": 0.18468961511827908, + "grad_norm": 0.40776076912879944, + "learning_rate": 9.385570591597966e-05, + "loss": 0.9875, + "step": 2067 + }, + { + "epoch": 0.18477896664954097, + "grad_norm": 0.3995235860347748, + "learning_rate": 9.384875405413843e-05, + "loss": 0.9726, + "step": 2068 + }, + { + "epoch": 0.18486831818080282, + "grad_norm": 0.3666049540042877, + "learning_rate": 9.384179851949666e-05, + "loss": 1.0619, + "step": 2069 + }, + { + "epoch": 0.1849576697120647, + "grad_norm": 0.37846672534942627, + "learning_rate": 9.383483931263695e-05, + "loss": 1.0445, + "step": 2070 + }, + { + "epoch": 0.18504702124332656, + "grad_norm": 0.44983261823654175, + "learning_rate": 9.382787643414221e-05, + "loss": 0.9651, + "step": 2071 + }, + { + "epoch": 0.18513637277458841, + "grad_norm": 0.46223461627960205, + "learning_rate": 9.382090988459564e-05, + "loss": 0.9688, + "step": 2072 + }, + { + "epoch": 0.1852257243058503, + "grad_norm": 0.3715665638446808, + "learning_rate": 9.381393966458077e-05, + "loss": 1.0355, + "step": 2073 + }, + { + "epoch": 0.18531507583711215, + "grad_norm": 0.45313575863838196, + "learning_rate": 9.380696577468142e-05, + "loss": 0.9863, + "step": 2074 + }, + { + "epoch": 0.18540442736837404, + "grad_norm": 0.45069244503974915, + "learning_rate": 9.379998821548175e-05, + "loss": 1.0141, + "step": 2075 + }, + { + "epoch": 0.1854937788996359, + "grad_norm": 0.3656700551509857, + "learning_rate": 9.37930069875662e-05, + "loss": 1.0549, + "step": 2076 + }, + { + "epoch": 0.18558313043089775, + "grad_norm": 0.44431185722351074, + "learning_rate": 9.37860220915195e-05, + "loss": 1.0339, + "step": 2077 + }, + { + "epoch": 0.18567248196215963, + "grad_norm": 0.5284201502799988, + "learning_rate": 9.377903352792672e-05, + "loss": 0.9624, + "step": 2078 + }, + { + "epoch": 0.18576183349342149, + "grad_norm": 0.3846912682056427, + "learning_rate": 9.377204129737325e-05, + "loss": 1.0481, + "step": 2079 + }, + { + "epoch": 0.18585118502468337, + "grad_norm": 0.41815292835235596, + "learning_rate": 9.376504540044472e-05, + "loss": 1.0371, + "step": 2080 + }, + { + "epoch": 0.18594053655594522, + "grad_norm": 0.38547441363334656, + "learning_rate": 9.375804583772716e-05, + "loss": 1.0483, + "step": 2081 + }, + { + "epoch": 0.1860298880872071, + "grad_norm": 0.4458550810813904, + "learning_rate": 9.375104260980683e-05, + "loss": 1.0046, + "step": 2082 + }, + { + "epoch": 0.18611923961846896, + "grad_norm": 0.5194621086120605, + "learning_rate": 9.374403571727032e-05, + "loss": 1.0876, + "step": 2083 + }, + { + "epoch": 0.18620859114973082, + "grad_norm": 0.41328129172325134, + "learning_rate": 9.373702516070453e-05, + "loss": 1.0493, + "step": 2084 + }, + { + "epoch": 0.1862979426809927, + "grad_norm": 0.4268978536128998, + "learning_rate": 9.373001094069671e-05, + "loss": 0.998, + "step": 2085 + }, + { + "epoch": 0.18638729421225456, + "grad_norm": 0.4924493134021759, + "learning_rate": 9.372299305783432e-05, + "loss": 1.0129, + "step": 2086 + }, + { + "epoch": 0.18647664574351644, + "grad_norm": 0.49052175879478455, + "learning_rate": 9.371597151270521e-05, + "loss": 0.9506, + "step": 2087 + }, + { + "epoch": 0.1865659972747783, + "grad_norm": 0.44871586561203003, + "learning_rate": 9.370894630589753e-05, + "loss": 0.9917, + "step": 2088 + }, + { + "epoch": 0.18665534880604015, + "grad_norm": 0.4465029239654541, + "learning_rate": 9.370191743799968e-05, + "loss": 1.0708, + "step": 2089 + }, + { + "epoch": 0.18674470033730203, + "grad_norm": 0.42947274446487427, + "learning_rate": 9.369488490960042e-05, + "loss": 1.0145, + "step": 2090 + }, + { + "epoch": 0.1868340518685639, + "grad_norm": 0.43883195519447327, + "learning_rate": 9.368784872128878e-05, + "loss": 1.0238, + "step": 2091 + }, + { + "epoch": 0.18692340339982577, + "grad_norm": 0.4147821068763733, + "learning_rate": 9.368080887365413e-05, + "loss": 1.019, + "step": 2092 + }, + { + "epoch": 0.18701275493108763, + "grad_norm": 0.4450010061264038, + "learning_rate": 9.367376536728613e-05, + "loss": 0.9557, + "step": 2093 + }, + { + "epoch": 0.1871021064623495, + "grad_norm": 0.5454333424568176, + "learning_rate": 9.366671820277477e-05, + "loss": 1.013, + "step": 2094 + }, + { + "epoch": 0.18719145799361137, + "grad_norm": 0.41965803503990173, + "learning_rate": 9.36596673807103e-05, + "loss": 0.9919, + "step": 2095 + }, + { + "epoch": 0.18728080952487322, + "grad_norm": 0.4064487814903259, + "learning_rate": 9.365261290168331e-05, + "loss": 0.9643, + "step": 2096 + }, + { + "epoch": 0.1873701610561351, + "grad_norm": 0.46642303466796875, + "learning_rate": 9.364555476628467e-05, + "loss": 0.9944, + "step": 2097 + }, + { + "epoch": 0.18745951258739696, + "grad_norm": 0.41478562355041504, + "learning_rate": 9.36384929751056e-05, + "loss": 1.0993, + "step": 2098 + }, + { + "epoch": 0.18754886411865884, + "grad_norm": 0.3898516893386841, + "learning_rate": 9.36314275287376e-05, + "loss": 1.0821, + "step": 2099 + }, + { + "epoch": 0.1876382156499207, + "grad_norm": 0.5555103421211243, + "learning_rate": 9.362435842777246e-05, + "loss": 0.8719, + "step": 2100 + }, + { + "epoch": 0.18772756718118258, + "grad_norm": 0.38932761549949646, + "learning_rate": 9.36172856728023e-05, + "loss": 1.0611, + "step": 2101 + }, + { + "epoch": 0.18781691871244444, + "grad_norm": 0.411034494638443, + "learning_rate": 9.361020926441955e-05, + "loss": 0.9853, + "step": 2102 + }, + { + "epoch": 0.1879062702437063, + "grad_norm": 0.47794967889785767, + "learning_rate": 9.36031292032169e-05, + "loss": 1.0755, + "step": 2103 + }, + { + "epoch": 0.18799562177496817, + "grad_norm": 0.40055814385414124, + "learning_rate": 9.359604548978742e-05, + "loss": 1.072, + "step": 2104 + }, + { + "epoch": 0.18808497330623003, + "grad_norm": 0.4187031686306, + "learning_rate": 9.358895812472442e-05, + "loss": 1.0448, + "step": 2105 + }, + { + "epoch": 0.1881743248374919, + "grad_norm": 0.39350834488868713, + "learning_rate": 9.358186710862156e-05, + "loss": 1.0356, + "step": 2106 + }, + { + "epoch": 0.18826367636875377, + "grad_norm": 0.36614513397216797, + "learning_rate": 9.357477244207278e-05, + "loss": 1.0865, + "step": 2107 + }, + { + "epoch": 0.18835302790001562, + "grad_norm": 0.4072873890399933, + "learning_rate": 9.356767412567234e-05, + "loss": 1.043, + "step": 2108 + }, + { + "epoch": 0.1884423794312775, + "grad_norm": 0.3540877103805542, + "learning_rate": 9.356057216001477e-05, + "loss": 1.0828, + "step": 2109 + }, + { + "epoch": 0.18853173096253936, + "grad_norm": 0.41696932911872864, + "learning_rate": 9.355346654569497e-05, + "loss": 1.0219, + "step": 2110 + }, + { + "epoch": 0.18862108249380125, + "grad_norm": 0.5428374409675598, + "learning_rate": 9.354635728330811e-05, + "loss": 1.0378, + "step": 2111 + }, + { + "epoch": 0.1887104340250631, + "grad_norm": 0.3925424814224243, + "learning_rate": 9.353924437344966e-05, + "loss": 1.0572, + "step": 2112 + }, + { + "epoch": 0.18879978555632498, + "grad_norm": 0.3727796971797943, + "learning_rate": 9.353212781671538e-05, + "loss": 1.0736, + "step": 2113 + }, + { + "epoch": 0.18888913708758684, + "grad_norm": 0.4229578971862793, + "learning_rate": 9.352500761370139e-05, + "loss": 1.0785, + "step": 2114 + }, + { + "epoch": 0.1889784886188487, + "grad_norm": 0.4206151068210602, + "learning_rate": 9.351788376500406e-05, + "loss": 1.0187, + "step": 2115 + }, + { + "epoch": 0.18906784015011058, + "grad_norm": 0.4051399230957031, + "learning_rate": 9.35107562712201e-05, + "loss": 1.0228, + "step": 2116 + }, + { + "epoch": 0.18915719168137243, + "grad_norm": 0.3675979971885681, + "learning_rate": 9.350362513294651e-05, + "loss": 1.024, + "step": 2117 + }, + { + "epoch": 0.18924654321263432, + "grad_norm": 0.43674102425575256, + "learning_rate": 9.349649035078061e-05, + "loss": 1.0003, + "step": 2118 + }, + { + "epoch": 0.18933589474389617, + "grad_norm": 0.4083952009677887, + "learning_rate": 9.348935192532001e-05, + "loss": 1.0628, + "step": 2119 + }, + { + "epoch": 0.18942524627515803, + "grad_norm": 0.4974057972431183, + "learning_rate": 9.34822098571626e-05, + "loss": 1.0283, + "step": 2120 + }, + { + "epoch": 0.1895145978064199, + "grad_norm": 0.39554762840270996, + "learning_rate": 9.347506414690663e-05, + "loss": 1.0382, + "step": 2121 + }, + { + "epoch": 0.18960394933768177, + "grad_norm": 0.40948179364204407, + "learning_rate": 9.346791479515063e-05, + "loss": 1.0227, + "step": 2122 + }, + { + "epoch": 0.18969330086894365, + "grad_norm": 0.42662203311920166, + "learning_rate": 9.346076180249345e-05, + "loss": 1.0179, + "step": 2123 + }, + { + "epoch": 0.1897826524002055, + "grad_norm": 0.40149882435798645, + "learning_rate": 9.345360516953418e-05, + "loss": 1.0281, + "step": 2124 + }, + { + "epoch": 0.1898720039314674, + "grad_norm": 0.49369674921035767, + "learning_rate": 9.344644489687233e-05, + "loss": 0.9582, + "step": 2125 + }, + { + "epoch": 0.18996135546272924, + "grad_norm": 0.43468207120895386, + "learning_rate": 9.343928098510759e-05, + "loss": 1.0257, + "step": 2126 + }, + { + "epoch": 0.1900507069939911, + "grad_norm": 0.5939388275146484, + "learning_rate": 9.343211343484004e-05, + "loss": 1.0418, + "step": 2127 + }, + { + "epoch": 0.19014005852525298, + "grad_norm": 0.42721638083457947, + "learning_rate": 9.342494224667004e-05, + "loss": 0.9871, + "step": 2128 + }, + { + "epoch": 0.19022941005651484, + "grad_norm": 0.4589712917804718, + "learning_rate": 9.341776742119825e-05, + "loss": 0.9894, + "step": 2129 + }, + { + "epoch": 0.19031876158777672, + "grad_norm": 0.3878852128982544, + "learning_rate": 9.341058895902563e-05, + "loss": 1.0514, + "step": 2130 + }, + { + "epoch": 0.19040811311903857, + "grad_norm": 0.47722190618515015, + "learning_rate": 9.340340686075349e-05, + "loss": 0.9354, + "step": 2131 + }, + { + "epoch": 0.19049746465030046, + "grad_norm": 0.4715014398097992, + "learning_rate": 9.339622112698334e-05, + "loss": 1.0487, + "step": 2132 + }, + { + "epoch": 0.1905868161815623, + "grad_norm": 0.44013863801956177, + "learning_rate": 9.338903175831712e-05, + "loss": 1.0149, + "step": 2133 + }, + { + "epoch": 0.19067616771282417, + "grad_norm": 0.4588448107242584, + "learning_rate": 9.3381838755357e-05, + "loss": 1.0304, + "step": 2134 + }, + { + "epoch": 0.19076551924408605, + "grad_norm": 0.47280123829841614, + "learning_rate": 9.337464211870546e-05, + "loss": 1.0329, + "step": 2135 + }, + { + "epoch": 0.1908548707753479, + "grad_norm": 0.4202229380607605, + "learning_rate": 9.336744184896531e-05, + "loss": 1.1011, + "step": 2136 + }, + { + "epoch": 0.1909442223066098, + "grad_norm": 0.3864685297012329, + "learning_rate": 9.336023794673962e-05, + "loss": 1.0398, + "step": 2137 + }, + { + "epoch": 0.19103357383787165, + "grad_norm": 0.3656110465526581, + "learning_rate": 9.335303041263183e-05, + "loss": 0.9896, + "step": 2138 + }, + { + "epoch": 0.1911229253691335, + "grad_norm": 0.3890793025493622, + "learning_rate": 9.334581924724564e-05, + "loss": 1.0371, + "step": 2139 + }, + { + "epoch": 0.19121227690039538, + "grad_norm": 0.4095704257488251, + "learning_rate": 9.333860445118505e-05, + "loss": 1.0538, + "step": 2140 + }, + { + "epoch": 0.19130162843165724, + "grad_norm": 0.3988470733165741, + "learning_rate": 9.333138602505437e-05, + "loss": 0.9512, + "step": 2141 + }, + { + "epoch": 0.19139097996291912, + "grad_norm": 0.42787396907806396, + "learning_rate": 9.332416396945824e-05, + "loss": 1.0253, + "step": 2142 + }, + { + "epoch": 0.19148033149418098, + "grad_norm": 0.44595029950141907, + "learning_rate": 9.331693828500159e-05, + "loss": 1.005, + "step": 2143 + }, + { + "epoch": 0.19156968302544286, + "grad_norm": 0.44697070121765137, + "learning_rate": 9.33097089722896e-05, + "loss": 0.9655, + "step": 2144 + }, + { + "epoch": 0.19165903455670472, + "grad_norm": 0.45841777324676514, + "learning_rate": 9.330247603192786e-05, + "loss": 1.0242, + "step": 2145 + }, + { + "epoch": 0.19174838608796657, + "grad_norm": 0.4379693865776062, + "learning_rate": 9.32952394645222e-05, + "loss": 1.0129, + "step": 2146 + }, + { + "epoch": 0.19183773761922845, + "grad_norm": 0.3796353042125702, + "learning_rate": 9.32879992706787e-05, + "loss": 1.0516, + "step": 2147 + }, + { + "epoch": 0.1919270891504903, + "grad_norm": 0.5354682207107544, + "learning_rate": 9.328075545100385e-05, + "loss": 0.85, + "step": 2148 + }, + { + "epoch": 0.1920164406817522, + "grad_norm": 0.5834673643112183, + "learning_rate": 9.32735080061044e-05, + "loss": 1.0113, + "step": 2149 + }, + { + "epoch": 0.19210579221301405, + "grad_norm": 0.46997979283332825, + "learning_rate": 9.326625693658739e-05, + "loss": 1.0355, + "step": 2150 + }, + { + "epoch": 0.19219514374427593, + "grad_norm": 0.47971826791763306, + "learning_rate": 9.325900224306019e-05, + "loss": 0.9207, + "step": 2151 + }, + { + "epoch": 0.1922844952755378, + "grad_norm": 0.49349215626716614, + "learning_rate": 9.325174392613043e-05, + "loss": 0.9519, + "step": 2152 + }, + { + "epoch": 0.19237384680679964, + "grad_norm": 0.603679895401001, + "learning_rate": 9.32444819864061e-05, + "loss": 0.9713, + "step": 2153 + }, + { + "epoch": 0.19246319833806153, + "grad_norm": 0.44981256127357483, + "learning_rate": 9.323721642449543e-05, + "loss": 0.9577, + "step": 2154 + }, + { + "epoch": 0.19255254986932338, + "grad_norm": 0.3963879644870758, + "learning_rate": 9.322994724100702e-05, + "loss": 1.0324, + "step": 2155 + }, + { + "epoch": 0.19264190140058526, + "grad_norm": 0.4557402729988098, + "learning_rate": 9.322267443654972e-05, + "loss": 1.042, + "step": 2156 + }, + { + "epoch": 0.19273125293184712, + "grad_norm": 0.4886689782142639, + "learning_rate": 9.321539801173274e-05, + "loss": 1.01, + "step": 2157 + }, + { + "epoch": 0.19282060446310897, + "grad_norm": 0.416227787733078, + "learning_rate": 9.320811796716552e-05, + "loss": 0.9619, + "step": 2158 + }, + { + "epoch": 0.19290995599437086, + "grad_norm": 0.44444751739501953, + "learning_rate": 9.320083430345785e-05, + "loss": 1.0472, + "step": 2159 + }, + { + "epoch": 0.1929993075256327, + "grad_norm": 0.4092031717300415, + "learning_rate": 9.319354702121981e-05, + "loss": 1.0692, + "step": 2160 + }, + { + "epoch": 0.1930886590568946, + "grad_norm": 0.3944936692714691, + "learning_rate": 9.318625612106182e-05, + "loss": 1.0244, + "step": 2161 + }, + { + "epoch": 0.19317801058815645, + "grad_norm": 0.457409530878067, + "learning_rate": 9.317896160359454e-05, + "loss": 1.1001, + "step": 2162 + }, + { + "epoch": 0.19326736211941833, + "grad_norm": 0.4606925845146179, + "learning_rate": 9.317166346942897e-05, + "loss": 1.0092, + "step": 2163 + }, + { + "epoch": 0.1933567136506802, + "grad_norm": 0.458124041557312, + "learning_rate": 9.31643617191764e-05, + "loss": 1.0144, + "step": 2164 + }, + { + "epoch": 0.19344606518194205, + "grad_norm": 0.3879139721393585, + "learning_rate": 9.315705635344844e-05, + "loss": 0.9889, + "step": 2165 + }, + { + "epoch": 0.19353541671320393, + "grad_norm": 0.3774491548538208, + "learning_rate": 9.314974737285699e-05, + "loss": 1.0031, + "step": 2166 + }, + { + "epoch": 0.19362476824446578, + "grad_norm": 0.38061612844467163, + "learning_rate": 9.314243477801425e-05, + "loss": 0.9962, + "step": 2167 + }, + { + "epoch": 0.19371411977572767, + "grad_norm": 0.4952597916126251, + "learning_rate": 9.313511856953274e-05, + "loss": 1.0836, + "step": 2168 + }, + { + "epoch": 0.19380347130698952, + "grad_norm": 0.41984468698501587, + "learning_rate": 9.312779874802526e-05, + "loss": 1.0027, + "step": 2169 + }, + { + "epoch": 0.19389282283825138, + "grad_norm": 0.47165659070014954, + "learning_rate": 9.312047531410493e-05, + "loss": 1.0573, + "step": 2170 + }, + { + "epoch": 0.19398217436951326, + "grad_norm": 0.39716780185699463, + "learning_rate": 9.311314826838515e-05, + "loss": 1.0426, + "step": 2171 + }, + { + "epoch": 0.19407152590077512, + "grad_norm": 0.4184114336967468, + "learning_rate": 9.310581761147966e-05, + "loss": 1.0309, + "step": 2172 + }, + { + "epoch": 0.194160877432037, + "grad_norm": 0.4498063027858734, + "learning_rate": 9.309848334400246e-05, + "loss": 0.9592, + "step": 2173 + }, + { + "epoch": 0.19425022896329885, + "grad_norm": 0.4691358208656311, + "learning_rate": 9.30911454665679e-05, + "loss": 1.0557, + "step": 2174 + }, + { + "epoch": 0.19433958049456074, + "grad_norm": 0.4362775981426239, + "learning_rate": 9.308380397979057e-05, + "loss": 1.1005, + "step": 2175 + }, + { + "epoch": 0.1944289320258226, + "grad_norm": 0.39508792757987976, + "learning_rate": 9.307645888428542e-05, + "loss": 1.0745, + "step": 2176 + }, + { + "epoch": 0.19451828355708445, + "grad_norm": 0.4104553461074829, + "learning_rate": 9.306911018066769e-05, + "loss": 1.0801, + "step": 2177 + }, + { + "epoch": 0.19460763508834633, + "grad_norm": 0.4288639426231384, + "learning_rate": 9.306175786955289e-05, + "loss": 1.1468, + "step": 2178 + }, + { + "epoch": 0.1946969866196082, + "grad_norm": 0.4734762907028198, + "learning_rate": 9.305440195155686e-05, + "loss": 1.0742, + "step": 2179 + }, + { + "epoch": 0.19478633815087007, + "grad_norm": 0.45509761571884155, + "learning_rate": 9.304704242729575e-05, + "loss": 0.9943, + "step": 2180 + }, + { + "epoch": 0.19487568968213193, + "grad_norm": 0.412614643573761, + "learning_rate": 9.303967929738598e-05, + "loss": 1.0259, + "step": 2181 + }, + { + "epoch": 0.1949650412133938, + "grad_norm": 0.38879382610321045, + "learning_rate": 9.30323125624443e-05, + "loss": 1.0245, + "step": 2182 + }, + { + "epoch": 0.19505439274465566, + "grad_norm": 0.459376722574234, + "learning_rate": 9.302494222308774e-05, + "loss": 1.0003, + "step": 2183 + }, + { + "epoch": 0.19514374427591752, + "grad_norm": 0.41094890236854553, + "learning_rate": 9.301756827993367e-05, + "loss": 0.9735, + "step": 2184 + }, + { + "epoch": 0.1952330958071794, + "grad_norm": 0.4334050118923187, + "learning_rate": 9.301019073359972e-05, + "loss": 0.9722, + "step": 2185 + }, + { + "epoch": 0.19532244733844126, + "grad_norm": 0.43469616770744324, + "learning_rate": 9.300280958470384e-05, + "loss": 1.0839, + "step": 2186 + }, + { + "epoch": 0.19541179886970314, + "grad_norm": 0.45029255747795105, + "learning_rate": 9.299542483386428e-05, + "loss": 0.9987, + "step": 2187 + }, + { + "epoch": 0.195501150400965, + "grad_norm": 0.42933982610702515, + "learning_rate": 9.298803648169958e-05, + "loss": 1.0202, + "step": 2188 + }, + { + "epoch": 0.19559050193222685, + "grad_norm": 0.37860357761383057, + "learning_rate": 9.298064452882862e-05, + "loss": 1.053, + "step": 2189 + }, + { + "epoch": 0.19567985346348873, + "grad_norm": 0.43233370780944824, + "learning_rate": 9.297324897587054e-05, + "loss": 1.07, + "step": 2190 + }, + { + "epoch": 0.1957692049947506, + "grad_norm": 0.4745257794857025, + "learning_rate": 9.296584982344478e-05, + "loss": 0.951, + "step": 2191 + }, + { + "epoch": 0.19585855652601247, + "grad_norm": 0.453050434589386, + "learning_rate": 9.295844707217114e-05, + "loss": 0.9839, + "step": 2192 + }, + { + "epoch": 0.19594790805727433, + "grad_norm": 0.4505593776702881, + "learning_rate": 9.295104072266965e-05, + "loss": 1.0556, + "step": 2193 + }, + { + "epoch": 0.1960372595885362, + "grad_norm": 0.41230955719947815, + "learning_rate": 9.294363077556066e-05, + "loss": 1.0569, + "step": 2194 + }, + { + "epoch": 0.19612661111979807, + "grad_norm": 0.38011908531188965, + "learning_rate": 9.293621723146485e-05, + "loss": 1.0311, + "step": 2195 + }, + { + "epoch": 0.19621596265105992, + "grad_norm": 0.37792640924453735, + "learning_rate": 9.292880009100318e-05, + "loss": 1.0998, + "step": 2196 + }, + { + "epoch": 0.1963053141823218, + "grad_norm": 0.42640724778175354, + "learning_rate": 9.292137935479692e-05, + "loss": 1.0311, + "step": 2197 + }, + { + "epoch": 0.19639466571358366, + "grad_norm": 0.39782777428627014, + "learning_rate": 9.291395502346763e-05, + "loss": 1.035, + "step": 2198 + }, + { + "epoch": 0.19648401724484554, + "grad_norm": 0.39150485396385193, + "learning_rate": 9.290652709763717e-05, + "loss": 1.0524, + "step": 2199 + }, + { + "epoch": 0.1965733687761074, + "grad_norm": 0.41855573654174805, + "learning_rate": 9.289909557792771e-05, + "loss": 1.05, + "step": 2200 + }, + { + "epoch": 0.19666272030736925, + "grad_norm": 0.4688778817653656, + "learning_rate": 9.289166046496172e-05, + "loss": 1.0219, + "step": 2201 + }, + { + "epoch": 0.19675207183863114, + "grad_norm": 0.4490063488483429, + "learning_rate": 9.2884221759362e-05, + "loss": 0.9859, + "step": 2202 + }, + { + "epoch": 0.196841423369893, + "grad_norm": 0.37221378087997437, + "learning_rate": 9.287677946175157e-05, + "loss": 1.0359, + "step": 2203 + }, + { + "epoch": 0.19693077490115488, + "grad_norm": 0.5122733116149902, + "learning_rate": 9.286933357275385e-05, + "loss": 0.9099, + "step": 2204 + }, + { + "epoch": 0.19702012643241673, + "grad_norm": 0.48560839891433716, + "learning_rate": 9.286188409299246e-05, + "loss": 0.9573, + "step": 2205 + }, + { + "epoch": 0.19710947796367861, + "grad_norm": 0.42828473448753357, + "learning_rate": 9.285443102309142e-05, + "loss": 1.0508, + "step": 2206 + }, + { + "epoch": 0.19719882949494047, + "grad_norm": 0.417341947555542, + "learning_rate": 9.284697436367497e-05, + "loss": 1.085, + "step": 2207 + }, + { + "epoch": 0.19728818102620232, + "grad_norm": 0.4165228307247162, + "learning_rate": 9.283951411536773e-05, + "loss": 1.1126, + "step": 2208 + }, + { + "epoch": 0.1973775325574642, + "grad_norm": 0.4660586714744568, + "learning_rate": 9.283205027879454e-05, + "loss": 0.9542, + "step": 2209 + }, + { + "epoch": 0.19746688408872606, + "grad_norm": 0.41941720247268677, + "learning_rate": 9.282458285458055e-05, + "loss": 1.0903, + "step": 2210 + }, + { + "epoch": 0.19755623561998795, + "grad_norm": 0.4800897538661957, + "learning_rate": 9.281711184335131e-05, + "loss": 0.9575, + "step": 2211 + }, + { + "epoch": 0.1976455871512498, + "grad_norm": 0.39509686827659607, + "learning_rate": 9.280963724573253e-05, + "loss": 1.0522, + "step": 2212 + }, + { + "epoch": 0.19773493868251169, + "grad_norm": 0.4314636290073395, + "learning_rate": 9.280215906235032e-05, + "loss": 1.0846, + "step": 2213 + }, + { + "epoch": 0.19782429021377354, + "grad_norm": 0.39220911264419556, + "learning_rate": 9.279467729383105e-05, + "loss": 0.9884, + "step": 2214 + }, + { + "epoch": 0.1979136417450354, + "grad_norm": 0.45912548899650574, + "learning_rate": 9.27871919408014e-05, + "loss": 0.984, + "step": 2215 + }, + { + "epoch": 0.19800299327629728, + "grad_norm": 0.48036208748817444, + "learning_rate": 9.277970300388834e-05, + "loss": 0.9564, + "step": 2216 + }, + { + "epoch": 0.19809234480755913, + "grad_norm": 0.5072821974754333, + "learning_rate": 9.277221048371917e-05, + "loss": 0.911, + "step": 2217 + }, + { + "epoch": 0.19818169633882102, + "grad_norm": 0.48071086406707764, + "learning_rate": 9.276471438092145e-05, + "loss": 1.0484, + "step": 2218 + }, + { + "epoch": 0.19827104787008287, + "grad_norm": 0.5344045758247375, + "learning_rate": 9.275721469612304e-05, + "loss": 0.8887, + "step": 2219 + }, + { + "epoch": 0.19836039940134473, + "grad_norm": 0.43411949276924133, + "learning_rate": 9.274971142995216e-05, + "loss": 1.0001, + "step": 2220 + }, + { + "epoch": 0.1984497509326066, + "grad_norm": 0.4209814667701721, + "learning_rate": 9.274220458303727e-05, + "loss": 1.0129, + "step": 2221 + }, + { + "epoch": 0.19853910246386847, + "grad_norm": 0.33118653297424316, + "learning_rate": 9.273469415600713e-05, + "loss": 1.0303, + "step": 2222 + }, + { + "epoch": 0.19862845399513035, + "grad_norm": 0.4255508780479431, + "learning_rate": 9.272718014949085e-05, + "loss": 0.9976, + "step": 2223 + }, + { + "epoch": 0.1987178055263922, + "grad_norm": 0.4046938121318817, + "learning_rate": 9.271966256411779e-05, + "loss": 1.0221, + "step": 2224 + }, + { + "epoch": 0.1988071570576541, + "grad_norm": 0.4717459976673126, + "learning_rate": 9.271214140051763e-05, + "loss": 0.9862, + "step": 2225 + }, + { + "epoch": 0.19889650858891594, + "grad_norm": 0.4104228615760803, + "learning_rate": 9.270461665932034e-05, + "loss": 1.024, + "step": 2226 + }, + { + "epoch": 0.1989858601201778, + "grad_norm": 0.41744041442871094, + "learning_rate": 9.269708834115622e-05, + "loss": 1.0642, + "step": 2227 + }, + { + "epoch": 0.19907521165143968, + "grad_norm": 0.4957825541496277, + "learning_rate": 9.268955644665582e-05, + "loss": 0.978, + "step": 2228 + }, + { + "epoch": 0.19916456318270154, + "grad_norm": 0.4245834946632385, + "learning_rate": 9.268202097645005e-05, + "loss": 0.9519, + "step": 2229 + }, + { + "epoch": 0.19925391471396342, + "grad_norm": 0.4112485647201538, + "learning_rate": 9.267448193117005e-05, + "loss": 1.0033, + "step": 2230 + }, + { + "epoch": 0.19934326624522528, + "grad_norm": 0.45706307888031006, + "learning_rate": 9.266693931144732e-05, + "loss": 1.0904, + "step": 2231 + }, + { + "epoch": 0.19943261777648713, + "grad_norm": 0.44358548521995544, + "learning_rate": 9.265939311791362e-05, + "loss": 0.9685, + "step": 2232 + }, + { + "epoch": 0.19952196930774901, + "grad_norm": 0.4372340440750122, + "learning_rate": 9.265184335120103e-05, + "loss": 1.0331, + "step": 2233 + }, + { + "epoch": 0.19961132083901087, + "grad_norm": 0.5012243986129761, + "learning_rate": 9.264429001194193e-05, + "loss": 0.9479, + "step": 2234 + }, + { + "epoch": 0.19970067237027275, + "grad_norm": 0.45009467005729675, + "learning_rate": 9.263673310076897e-05, + "loss": 0.9737, + "step": 2235 + }, + { + "epoch": 0.1997900239015346, + "grad_norm": 0.3963136076927185, + "learning_rate": 9.262917261831515e-05, + "loss": 1.0775, + "step": 2236 + }, + { + "epoch": 0.1998793754327965, + "grad_norm": 0.4253182113170624, + "learning_rate": 9.262160856521372e-05, + "loss": 1.013, + "step": 2237 + }, + { + "epoch": 0.19996872696405835, + "grad_norm": 0.40562012791633606, + "learning_rate": 9.261404094209827e-05, + "loss": 1.0651, + "step": 2238 + }, + { + "epoch": 0.2000580784953202, + "grad_norm": 0.42175614833831787, + "learning_rate": 9.260646974960265e-05, + "loss": 1.052, + "step": 2239 + }, + { + "epoch": 0.20014743002658208, + "grad_norm": 0.3817167580127716, + "learning_rate": 9.259889498836105e-05, + "loss": 1.0804, + "step": 2240 + }, + { + "epoch": 0.20023678155784394, + "grad_norm": 0.42921021580696106, + "learning_rate": 9.259131665900792e-05, + "loss": 0.9756, + "step": 2241 + }, + { + "epoch": 0.20032613308910582, + "grad_norm": 0.4635266363620758, + "learning_rate": 9.258373476217801e-05, + "loss": 0.9926, + "step": 2242 + }, + { + "epoch": 0.20041548462036768, + "grad_norm": 0.45946842432022095, + "learning_rate": 9.257614929850642e-05, + "loss": 0.9873, + "step": 2243 + }, + { + "epoch": 0.20050483615162956, + "grad_norm": 0.4113311767578125, + "learning_rate": 9.256856026862847e-05, + "loss": 1.0364, + "step": 2244 + }, + { + "epoch": 0.20059418768289142, + "grad_norm": 0.38544759154319763, + "learning_rate": 9.256096767317989e-05, + "loss": 1.0698, + "step": 2245 + }, + { + "epoch": 0.20068353921415327, + "grad_norm": 0.483548104763031, + "learning_rate": 9.255337151279658e-05, + "loss": 0.9635, + "step": 2246 + }, + { + "epoch": 0.20077289074541516, + "grad_norm": 0.41710737347602844, + "learning_rate": 9.254577178811482e-05, + "loss": 0.9602, + "step": 2247 + }, + { + "epoch": 0.200862242276677, + "grad_norm": 0.41707658767700195, + "learning_rate": 9.253816849977117e-05, + "loss": 1.0529, + "step": 2248 + }, + { + "epoch": 0.2009515938079389, + "grad_norm": 0.49464061856269836, + "learning_rate": 9.253056164840248e-05, + "loss": 1.0603, + "step": 2249 + }, + { + "epoch": 0.20104094533920075, + "grad_norm": 0.4921278655529022, + "learning_rate": 9.252295123464592e-05, + "loss": 1.0333, + "step": 2250 + }, + { + "epoch": 0.2011302968704626, + "grad_norm": 0.41099363565444946, + "learning_rate": 9.251533725913893e-05, + "loss": 0.9884, + "step": 2251 + }, + { + "epoch": 0.2012196484017245, + "grad_norm": 0.40399447083473206, + "learning_rate": 9.250771972251925e-05, + "loss": 0.9832, + "step": 2252 + }, + { + "epoch": 0.20130899993298634, + "grad_norm": 0.39591044187545776, + "learning_rate": 9.250009862542495e-05, + "loss": 1.0229, + "step": 2253 + }, + { + "epoch": 0.20139835146424823, + "grad_norm": 0.4617488980293274, + "learning_rate": 9.249247396849437e-05, + "loss": 0.9989, + "step": 2254 + }, + { + "epoch": 0.20148770299551008, + "grad_norm": 0.3816708028316498, + "learning_rate": 9.248484575236616e-05, + "loss": 1.1098, + "step": 2255 + }, + { + "epoch": 0.20157705452677196, + "grad_norm": 0.43088236451148987, + "learning_rate": 9.247721397767926e-05, + "loss": 0.9852, + "step": 2256 + }, + { + "epoch": 0.20166640605803382, + "grad_norm": 0.39949992299079895, + "learning_rate": 9.246957864507292e-05, + "loss": 1.0444, + "step": 2257 + }, + { + "epoch": 0.20175575758929568, + "grad_norm": 0.5272142291069031, + "learning_rate": 9.246193975518667e-05, + "loss": 0.9864, + "step": 2258 + }, + { + "epoch": 0.20184510912055756, + "grad_norm": 0.44182446599006653, + "learning_rate": 9.245429730866035e-05, + "loss": 0.9412, + "step": 2259 + }, + { + "epoch": 0.20193446065181941, + "grad_norm": 0.4208958148956299, + "learning_rate": 9.244665130613411e-05, + "loss": 1.0687, + "step": 2260 + }, + { + "epoch": 0.2020238121830813, + "grad_norm": 0.5523428916931152, + "learning_rate": 9.243900174824838e-05, + "loss": 0.9329, + "step": 2261 + }, + { + "epoch": 0.20211316371434315, + "grad_norm": 0.3945762813091278, + "learning_rate": 9.243134863564387e-05, + "loss": 1.0095, + "step": 2262 + }, + { + "epoch": 0.202202515245605, + "grad_norm": 0.5793361067771912, + "learning_rate": 9.242369196896163e-05, + "loss": 0.9279, + "step": 2263 + }, + { + "epoch": 0.2022918667768669, + "grad_norm": 0.4429938495159149, + "learning_rate": 9.241603174884299e-05, + "loss": 1.0672, + "step": 2264 + }, + { + "epoch": 0.20238121830812875, + "grad_norm": 0.4614643454551697, + "learning_rate": 9.240836797592958e-05, + "loss": 1.0062, + "step": 2265 + }, + { + "epoch": 0.20247056983939063, + "grad_norm": 0.4438598155975342, + "learning_rate": 9.240070065086328e-05, + "loss": 0.9787, + "step": 2266 + }, + { + "epoch": 0.20255992137065248, + "grad_norm": 0.3669438362121582, + "learning_rate": 9.239302977428637e-05, + "loss": 1.0367, + "step": 2267 + }, + { + "epoch": 0.20264927290191437, + "grad_norm": 0.43813052773475647, + "learning_rate": 9.238535534684131e-05, + "loss": 0.9767, + "step": 2268 + }, + { + "epoch": 0.20273862443317622, + "grad_norm": 0.4687936007976532, + "learning_rate": 9.237767736917098e-05, + "loss": 1.0335, + "step": 2269 + }, + { + "epoch": 0.20282797596443808, + "grad_norm": 0.3983701467514038, + "learning_rate": 9.236999584191843e-05, + "loss": 1.0086, + "step": 2270 + }, + { + "epoch": 0.20291732749569996, + "grad_norm": 0.4753643870353699, + "learning_rate": 9.23623107657271e-05, + "loss": 0.9726, + "step": 2271 + }, + { + "epoch": 0.20300667902696182, + "grad_norm": 0.4003112316131592, + "learning_rate": 9.235462214124071e-05, + "loss": 1.0838, + "step": 2272 + }, + { + "epoch": 0.2030960305582237, + "grad_norm": 0.4177161157131195, + "learning_rate": 9.234692996910323e-05, + "loss": 1.0637, + "step": 2273 + }, + { + "epoch": 0.20318538208948556, + "grad_norm": 0.48143237829208374, + "learning_rate": 9.233923424995899e-05, + "loss": 0.9929, + "step": 2274 + }, + { + "epoch": 0.20327473362074744, + "grad_norm": 0.4232881963253021, + "learning_rate": 9.233153498445258e-05, + "loss": 1.0583, + "step": 2275 + }, + { + "epoch": 0.2033640851520093, + "grad_norm": 0.47992992401123047, + "learning_rate": 9.232383217322889e-05, + "loss": 0.9638, + "step": 2276 + }, + { + "epoch": 0.20345343668327115, + "grad_norm": 0.4345097839832306, + "learning_rate": 9.23161258169331e-05, + "loss": 1.0855, + "step": 2277 + }, + { + "epoch": 0.20354278821453303, + "grad_norm": 0.45901912450790405, + "learning_rate": 9.230841591621073e-05, + "loss": 1.0315, + "step": 2278 + }, + { + "epoch": 0.2036321397457949, + "grad_norm": 0.45470988750457764, + "learning_rate": 9.230070247170755e-05, + "loss": 1.0359, + "step": 2279 + }, + { + "epoch": 0.20372149127705677, + "grad_norm": 0.4291987419128418, + "learning_rate": 9.229298548406964e-05, + "loss": 1.0296, + "step": 2280 + }, + { + "epoch": 0.20381084280831863, + "grad_norm": 0.3951917588710785, + "learning_rate": 9.228526495394339e-05, + "loss": 0.9465, + "step": 2281 + }, + { + "epoch": 0.20390019433958048, + "grad_norm": 0.3695099353790283, + "learning_rate": 9.227754088197548e-05, + "loss": 1.03, + "step": 2282 + }, + { + "epoch": 0.20398954587084236, + "grad_norm": 0.40235844254493713, + "learning_rate": 9.226981326881286e-05, + "loss": 1.0398, + "step": 2283 + }, + { + "epoch": 0.20407889740210422, + "grad_norm": 0.42109382152557373, + "learning_rate": 9.226208211510282e-05, + "loss": 1.0524, + "step": 2284 + }, + { + "epoch": 0.2041682489333661, + "grad_norm": 0.4391646981239319, + "learning_rate": 9.225434742149293e-05, + "loss": 1.0501, + "step": 2285 + }, + { + "epoch": 0.20425760046462796, + "grad_norm": 0.45787525177001953, + "learning_rate": 9.224660918863104e-05, + "loss": 0.9669, + "step": 2286 + }, + { + "epoch": 0.20434695199588984, + "grad_norm": 0.3951761722564697, + "learning_rate": 9.22388674171653e-05, + "loss": 0.9985, + "step": 2287 + }, + { + "epoch": 0.2044363035271517, + "grad_norm": 0.5199114680290222, + "learning_rate": 9.22311221077442e-05, + "loss": 0.946, + "step": 2288 + }, + { + "epoch": 0.20452565505841355, + "grad_norm": 0.41517964005470276, + "learning_rate": 9.222337326101647e-05, + "loss": 1.0267, + "step": 2289 + }, + { + "epoch": 0.20461500658967544, + "grad_norm": 0.45859935879707336, + "learning_rate": 9.221562087763114e-05, + "loss": 0.9967, + "step": 2290 + }, + { + "epoch": 0.2047043581209373, + "grad_norm": 0.4338325560092926, + "learning_rate": 9.220786495823758e-05, + "loss": 1.023, + "step": 2291 + }, + { + "epoch": 0.20479370965219917, + "grad_norm": 0.41822728514671326, + "learning_rate": 9.220010550348544e-05, + "loss": 1.0318, + "step": 2292 + }, + { + "epoch": 0.20488306118346103, + "grad_norm": 0.4380994141101837, + "learning_rate": 9.219234251402464e-05, + "loss": 1.0465, + "step": 2293 + }, + { + "epoch": 0.20497241271472288, + "grad_norm": 0.4628269672393799, + "learning_rate": 9.218457599050542e-05, + "loss": 0.9637, + "step": 2294 + }, + { + "epoch": 0.20506176424598477, + "grad_norm": 0.41425222158432007, + "learning_rate": 9.217680593357829e-05, + "loss": 1.0134, + "step": 2295 + }, + { + "epoch": 0.20515111577724662, + "grad_norm": 0.33953461050987244, + "learning_rate": 9.216903234389412e-05, + "loss": 1.0589, + "step": 2296 + }, + { + "epoch": 0.2052404673085085, + "grad_norm": 0.42335033416748047, + "learning_rate": 9.216125522210398e-05, + "loss": 0.9652, + "step": 2297 + }, + { + "epoch": 0.20532981883977036, + "grad_norm": 0.39172297716140747, + "learning_rate": 9.21534745688593e-05, + "loss": 1.0338, + "step": 2298 + }, + { + "epoch": 0.20541917037103224, + "grad_norm": 0.4049052298069, + "learning_rate": 9.214569038481183e-05, + "loss": 0.9537, + "step": 2299 + }, + { + "epoch": 0.2055085219022941, + "grad_norm": 0.4806058406829834, + "learning_rate": 9.213790267061352e-05, + "loss": 0.9846, + "step": 2300 + }, + { + "epoch": 0.20559787343355596, + "grad_norm": 0.437541663646698, + "learning_rate": 9.213011142691671e-05, + "loss": 0.9776, + "step": 2301 + }, + { + "epoch": 0.20568722496481784, + "grad_norm": 0.42535480856895447, + "learning_rate": 9.2122316654374e-05, + "loss": 0.9601, + "step": 2302 + }, + { + "epoch": 0.2057765764960797, + "grad_norm": 0.3970091640949249, + "learning_rate": 9.211451835363828e-05, + "loss": 0.9903, + "step": 2303 + }, + { + "epoch": 0.20586592802734158, + "grad_norm": 0.4786219894886017, + "learning_rate": 9.210671652536274e-05, + "loss": 0.9416, + "step": 2304 + }, + { + "epoch": 0.20595527955860343, + "grad_norm": 0.4173687696456909, + "learning_rate": 9.209891117020087e-05, + "loss": 1.0231, + "step": 2305 + }, + { + "epoch": 0.20604463108986532, + "grad_norm": 0.49292880296707153, + "learning_rate": 9.209110228880642e-05, + "loss": 0.8597, + "step": 2306 + }, + { + "epoch": 0.20613398262112717, + "grad_norm": 0.4839416742324829, + "learning_rate": 9.208328988183352e-05, + "loss": 1.0004, + "step": 2307 + }, + { + "epoch": 0.20622333415238903, + "grad_norm": 0.5510335564613342, + "learning_rate": 9.207547394993651e-05, + "loss": 0.9456, + "step": 2308 + }, + { + "epoch": 0.2063126856836509, + "grad_norm": 0.3770674169063568, + "learning_rate": 9.206765449377006e-05, + "loss": 1.0004, + "step": 2309 + }, + { + "epoch": 0.20640203721491276, + "grad_norm": 0.43098023533821106, + "learning_rate": 9.205983151398915e-05, + "loss": 1.0377, + "step": 2310 + }, + { + "epoch": 0.20649138874617465, + "grad_norm": 0.4793153405189514, + "learning_rate": 9.205200501124902e-05, + "loss": 0.9746, + "step": 2311 + }, + { + "epoch": 0.2065807402774365, + "grad_norm": 0.5100858807563782, + "learning_rate": 9.204417498620522e-05, + "loss": 1.021, + "step": 2312 + }, + { + "epoch": 0.20667009180869836, + "grad_norm": 0.42289164662361145, + "learning_rate": 9.203634143951361e-05, + "loss": 0.9899, + "step": 2313 + }, + { + "epoch": 0.20675944333996024, + "grad_norm": 0.4890194237232208, + "learning_rate": 9.202850437183033e-05, + "loss": 0.94, + "step": 2314 + }, + { + "epoch": 0.2068487948712221, + "grad_norm": 0.3784908652305603, + "learning_rate": 9.202066378381183e-05, + "loss": 1.0157, + "step": 2315 + }, + { + "epoch": 0.20693814640248398, + "grad_norm": 0.4029442369937897, + "learning_rate": 9.201281967611481e-05, + "loss": 1.0325, + "step": 2316 + }, + { + "epoch": 0.20702749793374584, + "grad_norm": 0.413496732711792, + "learning_rate": 9.200497204939633e-05, + "loss": 1.0768, + "step": 2317 + }, + { + "epoch": 0.20711684946500772, + "grad_norm": 0.4051424562931061, + "learning_rate": 9.19971209043137e-05, + "loss": 1.0192, + "step": 2318 + }, + { + "epoch": 0.20720620099626957, + "grad_norm": 0.399827241897583, + "learning_rate": 9.198926624152453e-05, + "loss": 1.0137, + "step": 2319 + }, + { + "epoch": 0.20729555252753143, + "grad_norm": 0.41111016273498535, + "learning_rate": 9.198140806168673e-05, + "loss": 1.0135, + "step": 2320 + }, + { + "epoch": 0.2073849040587933, + "grad_norm": 0.49021974205970764, + "learning_rate": 9.197354636545853e-05, + "loss": 1.0126, + "step": 2321 + }, + { + "epoch": 0.20747425559005517, + "grad_norm": 0.37917208671569824, + "learning_rate": 9.196568115349842e-05, + "loss": 1.0455, + "step": 2322 + }, + { + "epoch": 0.20756360712131705, + "grad_norm": 0.4736994802951813, + "learning_rate": 9.195781242646517e-05, + "loss": 1.0129, + "step": 2323 + }, + { + "epoch": 0.2076529586525789, + "grad_norm": 0.444503515958786, + "learning_rate": 9.194994018501793e-05, + "loss": 0.998, + "step": 2324 + }, + { + "epoch": 0.2077423101838408, + "grad_norm": 0.4442484378814697, + "learning_rate": 9.194206442981601e-05, + "loss": 0.9962, + "step": 2325 + }, + { + "epoch": 0.20783166171510264, + "grad_norm": 0.48136886954307556, + "learning_rate": 9.193418516151912e-05, + "loss": 1.0792, + "step": 2326 + }, + { + "epoch": 0.2079210132463645, + "grad_norm": 0.4007897675037384, + "learning_rate": 9.192630238078725e-05, + "loss": 0.9787, + "step": 2327 + }, + { + "epoch": 0.20801036477762638, + "grad_norm": 0.42562204599380493, + "learning_rate": 9.191841608828066e-05, + "loss": 1.0511, + "step": 2328 + }, + { + "epoch": 0.20809971630888824, + "grad_norm": 0.4017636179924011, + "learning_rate": 9.19105262846599e-05, + "loss": 1.0076, + "step": 2329 + }, + { + "epoch": 0.20818906784015012, + "grad_norm": 0.4797520637512207, + "learning_rate": 9.190263297058583e-05, + "loss": 1.0116, + "step": 2330 + }, + { + "epoch": 0.20827841937141198, + "grad_norm": 0.37784191966056824, + "learning_rate": 9.189473614671959e-05, + "loss": 1.0355, + "step": 2331 + }, + { + "epoch": 0.20836777090267383, + "grad_norm": 0.43021056056022644, + "learning_rate": 9.188683581372264e-05, + "loss": 1.0424, + "step": 2332 + }, + { + "epoch": 0.20845712243393572, + "grad_norm": 0.3950195610523224, + "learning_rate": 9.187893197225672e-05, + "loss": 1.0493, + "step": 2333 + }, + { + "epoch": 0.20854647396519757, + "grad_norm": 0.40289580821990967, + "learning_rate": 9.187102462298384e-05, + "loss": 1.0145, + "step": 2334 + }, + { + "epoch": 0.20863582549645945, + "grad_norm": 0.3758835196495056, + "learning_rate": 9.186311376656633e-05, + "loss": 1.0352, + "step": 2335 + }, + { + "epoch": 0.2087251770277213, + "grad_norm": 0.44723740220069885, + "learning_rate": 9.185519940366682e-05, + "loss": 1.0553, + "step": 2336 + }, + { + "epoch": 0.2088145285589832, + "grad_norm": 0.4580692946910858, + "learning_rate": 9.18472815349482e-05, + "loss": 1.0657, + "step": 2337 + }, + { + "epoch": 0.20890388009024505, + "grad_norm": 0.5509561896324158, + "learning_rate": 9.183936016107371e-05, + "loss": 0.9759, + "step": 2338 + }, + { + "epoch": 0.2089932316215069, + "grad_norm": 0.5175040364265442, + "learning_rate": 9.183143528270682e-05, + "loss": 1.0432, + "step": 2339 + }, + { + "epoch": 0.2090825831527688, + "grad_norm": 0.40656107664108276, + "learning_rate": 9.182350690051133e-05, + "loss": 1.0077, + "step": 2340 + }, + { + "epoch": 0.20917193468403064, + "grad_norm": 0.44654014706611633, + "learning_rate": 9.181557501515134e-05, + "loss": 1.0218, + "step": 2341 + }, + { + "epoch": 0.20926128621529252, + "grad_norm": 0.4651091396808624, + "learning_rate": 9.180763962729123e-05, + "loss": 1.0464, + "step": 2342 + }, + { + "epoch": 0.20935063774655438, + "grad_norm": 0.4404200613498688, + "learning_rate": 9.179970073759565e-05, + "loss": 1.0241, + "step": 2343 + }, + { + "epoch": 0.20943998927781624, + "grad_norm": 0.5655425190925598, + "learning_rate": 9.17917583467296e-05, + "loss": 0.9452, + "step": 2344 + }, + { + "epoch": 0.20952934080907812, + "grad_norm": 0.442924827337265, + "learning_rate": 9.178381245535829e-05, + "loss": 1.0046, + "step": 2345 + }, + { + "epoch": 0.20961869234033997, + "grad_norm": 0.40375635027885437, + "learning_rate": 9.177586306414731e-05, + "loss": 1.0673, + "step": 2346 + }, + { + "epoch": 0.20970804387160186, + "grad_norm": 0.46348804235458374, + "learning_rate": 9.176791017376252e-05, + "loss": 1.0156, + "step": 2347 + }, + { + "epoch": 0.2097973954028637, + "grad_norm": 0.41273656487464905, + "learning_rate": 9.175995378487003e-05, + "loss": 1.0553, + "step": 2348 + }, + { + "epoch": 0.2098867469341256, + "grad_norm": 0.37597787380218506, + "learning_rate": 9.175199389813627e-05, + "loss": 0.9665, + "step": 2349 + }, + { + "epoch": 0.20997609846538745, + "grad_norm": 0.4291061758995056, + "learning_rate": 9.174403051422798e-05, + "loss": 0.9993, + "step": 2350 + }, + { + "epoch": 0.2100654499966493, + "grad_norm": 0.363323837518692, + "learning_rate": 9.173606363381219e-05, + "loss": 1.0601, + "step": 2351 + }, + { + "epoch": 0.2101548015279112, + "grad_norm": 0.4082275629043579, + "learning_rate": 9.172809325755618e-05, + "loss": 1.0879, + "step": 2352 + }, + { + "epoch": 0.21024415305917304, + "grad_norm": 0.41467830538749695, + "learning_rate": 9.172011938612757e-05, + "loss": 1.0513, + "step": 2353 + }, + { + "epoch": 0.21033350459043493, + "grad_norm": 0.40125423669815063, + "learning_rate": 9.171214202019428e-05, + "loss": 1.0362, + "step": 2354 + }, + { + "epoch": 0.21042285612169678, + "grad_norm": 0.38793444633483887, + "learning_rate": 9.170416116042444e-05, + "loss": 0.9959, + "step": 2355 + }, + { + "epoch": 0.21051220765295867, + "grad_norm": 0.4493177533149719, + "learning_rate": 9.169617680748659e-05, + "loss": 1.004, + "step": 2356 + }, + { + "epoch": 0.21060155918422052, + "grad_norm": 0.39111337065696716, + "learning_rate": 9.168818896204948e-05, + "loss": 0.9994, + "step": 2357 + }, + { + "epoch": 0.21069091071548238, + "grad_norm": 0.37774431705474854, + "learning_rate": 9.168019762478218e-05, + "loss": 1.0901, + "step": 2358 + }, + { + "epoch": 0.21078026224674426, + "grad_norm": 0.3941672444343567, + "learning_rate": 9.167220279635406e-05, + "loss": 1.0383, + "step": 2359 + }, + { + "epoch": 0.21086961377800612, + "grad_norm": 0.39567404985427856, + "learning_rate": 9.166420447743475e-05, + "loss": 0.986, + "step": 2360 + }, + { + "epoch": 0.210958965309268, + "grad_norm": 0.5099696516990662, + "learning_rate": 9.165620266869421e-05, + "loss": 0.9966, + "step": 2361 + }, + { + "epoch": 0.21104831684052985, + "grad_norm": 0.4867641031742096, + "learning_rate": 9.164819737080267e-05, + "loss": 1.037, + "step": 2362 + }, + { + "epoch": 0.2111376683717917, + "grad_norm": 0.41066691279411316, + "learning_rate": 9.164018858443066e-05, + "loss": 1.0574, + "step": 2363 + }, + { + "epoch": 0.2112270199030536, + "grad_norm": 0.4427512586116791, + "learning_rate": 9.163217631024901e-05, + "loss": 0.989, + "step": 2364 + }, + { + "epoch": 0.21131637143431545, + "grad_norm": 0.4066883325576782, + "learning_rate": 9.162416054892882e-05, + "loss": 0.979, + "step": 2365 + }, + { + "epoch": 0.21140572296557733, + "grad_norm": 0.4055411219596863, + "learning_rate": 9.161614130114151e-05, + "loss": 1.0365, + "step": 2366 + }, + { + "epoch": 0.21149507449683919, + "grad_norm": 0.4873175621032715, + "learning_rate": 9.160811856755877e-05, + "loss": 0.9807, + "step": 2367 + }, + { + "epoch": 0.21158442602810107, + "grad_norm": 0.4377719461917877, + "learning_rate": 9.160009234885258e-05, + "loss": 1.0499, + "step": 2368 + }, + { + "epoch": 0.21167377755936292, + "grad_norm": 0.4379160702228546, + "learning_rate": 9.159206264569524e-05, + "loss": 1.0161, + "step": 2369 + }, + { + "epoch": 0.21176312909062478, + "grad_norm": 0.41256406903266907, + "learning_rate": 9.158402945875932e-05, + "loss": 0.9943, + "step": 2370 + }, + { + "epoch": 0.21185248062188666, + "grad_norm": 0.3979935646057129, + "learning_rate": 9.157599278871767e-05, + "loss": 1.0566, + "step": 2371 + }, + { + "epoch": 0.21194183215314852, + "grad_norm": 0.4434359073638916, + "learning_rate": 9.156795263624345e-05, + "loss": 0.9642, + "step": 2372 + }, + { + "epoch": 0.2120311836844104, + "grad_norm": 0.42993849515914917, + "learning_rate": 9.155990900201012e-05, + "loss": 0.9881, + "step": 2373 + }, + { + "epoch": 0.21212053521567226, + "grad_norm": 0.4635986387729645, + "learning_rate": 9.155186188669143e-05, + "loss": 0.988, + "step": 2374 + }, + { + "epoch": 0.2122098867469341, + "grad_norm": 0.3796444535255432, + "learning_rate": 9.154381129096137e-05, + "loss": 1.0141, + "step": 2375 + }, + { + "epoch": 0.212299238278196, + "grad_norm": 0.468389093875885, + "learning_rate": 9.15357572154943e-05, + "loss": 0.9412, + "step": 2376 + }, + { + "epoch": 0.21238858980945785, + "grad_norm": 0.410773366689682, + "learning_rate": 9.152769966096482e-05, + "loss": 1.0115, + "step": 2377 + }, + { + "epoch": 0.21247794134071973, + "grad_norm": 0.41390907764434814, + "learning_rate": 9.151963862804784e-05, + "loss": 1.0499, + "step": 2378 + }, + { + "epoch": 0.2125672928719816, + "grad_norm": 0.4567076563835144, + "learning_rate": 9.151157411741858e-05, + "loss": 0.9748, + "step": 2379 + }, + { + "epoch": 0.21265664440324347, + "grad_norm": 0.4838281273841858, + "learning_rate": 9.150350612975247e-05, + "loss": 1.0076, + "step": 2380 + }, + { + "epoch": 0.21274599593450533, + "grad_norm": 0.4495866000652313, + "learning_rate": 9.149543466572535e-05, + "loss": 0.959, + "step": 2381 + }, + { + "epoch": 0.21283534746576718, + "grad_norm": 0.4260130226612091, + "learning_rate": 9.148735972601326e-05, + "loss": 1.0522, + "step": 2382 + }, + { + "epoch": 0.21292469899702907, + "grad_norm": 0.3835292160511017, + "learning_rate": 9.147928131129256e-05, + "loss": 1.0342, + "step": 2383 + }, + { + "epoch": 0.21301405052829092, + "grad_norm": 0.4930969774723053, + "learning_rate": 9.147119942223993e-05, + "loss": 0.9595, + "step": 2384 + }, + { + "epoch": 0.2131034020595528, + "grad_norm": 0.6031138896942139, + "learning_rate": 9.14631140595323e-05, + "loss": 1.0145, + "step": 2385 + }, + { + "epoch": 0.21319275359081466, + "grad_norm": 0.5279069542884827, + "learning_rate": 9.145502522384688e-05, + "loss": 0.9652, + "step": 2386 + }, + { + "epoch": 0.21328210512207654, + "grad_norm": 0.4844546616077423, + "learning_rate": 9.144693291586124e-05, + "loss": 1.0302, + "step": 2387 + }, + { + "epoch": 0.2133714566533384, + "grad_norm": 0.4257798194885254, + "learning_rate": 9.143883713625317e-05, + "loss": 1.0513, + "step": 2388 + }, + { + "epoch": 0.21346080818460025, + "grad_norm": 0.41385725140571594, + "learning_rate": 9.143073788570077e-05, + "loss": 0.9939, + "step": 2389 + }, + { + "epoch": 0.21355015971586214, + "grad_norm": 0.3999519944190979, + "learning_rate": 9.142263516488245e-05, + "loss": 0.9706, + "step": 2390 + }, + { + "epoch": 0.213639511247124, + "grad_norm": 0.3895846903324127, + "learning_rate": 9.141452897447692e-05, + "loss": 1.074, + "step": 2391 + }, + { + "epoch": 0.21372886277838588, + "grad_norm": 0.44434553384780884, + "learning_rate": 9.140641931516314e-05, + "loss": 1.0137, + "step": 2392 + }, + { + "epoch": 0.21381821430964773, + "grad_norm": 0.3749370574951172, + "learning_rate": 9.139830618762038e-05, + "loss": 1.0124, + "step": 2393 + }, + { + "epoch": 0.21390756584090959, + "grad_norm": 0.37694430351257324, + "learning_rate": 9.139018959252819e-05, + "loss": 1.0097, + "step": 2394 + }, + { + "epoch": 0.21399691737217147, + "grad_norm": 0.3813773989677429, + "learning_rate": 9.138206953056644e-05, + "loss": 1.0833, + "step": 2395 + }, + { + "epoch": 0.21408626890343332, + "grad_norm": 0.4502156972885132, + "learning_rate": 9.137394600241527e-05, + "loss": 0.9692, + "step": 2396 + }, + { + "epoch": 0.2141756204346952, + "grad_norm": 0.41509172320365906, + "learning_rate": 9.136581900875512e-05, + "loss": 0.9825, + "step": 2397 + }, + { + "epoch": 0.21426497196595706, + "grad_norm": 0.4003329277038574, + "learning_rate": 9.135768855026668e-05, + "loss": 1.0834, + "step": 2398 + }, + { + "epoch": 0.21435432349721895, + "grad_norm": 0.4339703917503357, + "learning_rate": 9.1349554627631e-05, + "loss": 1.0946, + "step": 2399 + }, + { + "epoch": 0.2144436750284808, + "grad_norm": 0.43198466300964355, + "learning_rate": 9.134141724152934e-05, + "loss": 0.9449, + "step": 2400 + }, + { + "epoch": 0.21453302655974266, + "grad_norm": 0.44103294610977173, + "learning_rate": 9.133327639264332e-05, + "loss": 0.9653, + "step": 2401 + }, + { + "epoch": 0.21462237809100454, + "grad_norm": 0.4824722707271576, + "learning_rate": 9.132513208165486e-05, + "loss": 1.0428, + "step": 2402 + }, + { + "epoch": 0.2147117296222664, + "grad_norm": 0.49322792887687683, + "learning_rate": 9.131698430924605e-05, + "loss": 1.0646, + "step": 2403 + }, + { + "epoch": 0.21480108115352828, + "grad_norm": 0.4989064931869507, + "learning_rate": 9.130883307609942e-05, + "loss": 1.0013, + "step": 2404 + }, + { + "epoch": 0.21489043268479013, + "grad_norm": 0.459974467754364, + "learning_rate": 9.130067838289769e-05, + "loss": 1.0519, + "step": 2405 + }, + { + "epoch": 0.214979784216052, + "grad_norm": 0.44831836223602295, + "learning_rate": 9.129252023032391e-05, + "loss": 1.055, + "step": 2406 + }, + { + "epoch": 0.21506913574731387, + "grad_norm": 0.4193100929260254, + "learning_rate": 9.128435861906142e-05, + "loss": 0.9937, + "step": 2407 + }, + { + "epoch": 0.21515848727857573, + "grad_norm": 0.4522268772125244, + "learning_rate": 9.127619354979384e-05, + "loss": 1.076, + "step": 2408 + }, + { + "epoch": 0.2152478388098376, + "grad_norm": 0.4380955100059509, + "learning_rate": 9.126802502320509e-05, + "loss": 0.9917, + "step": 2409 + }, + { + "epoch": 0.21533719034109947, + "grad_norm": 0.45167332887649536, + "learning_rate": 9.125985303997933e-05, + "loss": 1.0884, + "step": 2410 + }, + { + "epoch": 0.21542654187236135, + "grad_norm": 0.4360027611255646, + "learning_rate": 9.125167760080108e-05, + "loss": 0.9882, + "step": 2411 + }, + { + "epoch": 0.2155158934036232, + "grad_norm": 0.390898197889328, + "learning_rate": 9.124349870635515e-05, + "loss": 1.0076, + "step": 2412 + }, + { + "epoch": 0.21560524493488506, + "grad_norm": 0.4726807475090027, + "learning_rate": 9.123531635732656e-05, + "loss": 0.9496, + "step": 2413 + }, + { + "epoch": 0.21569459646614694, + "grad_norm": 0.4268726706504822, + "learning_rate": 9.122713055440069e-05, + "loss": 1.0008, + "step": 2414 + }, + { + "epoch": 0.2157839479974088, + "grad_norm": 0.43627238273620605, + "learning_rate": 9.121894129826318e-05, + "loss": 0.9642, + "step": 2415 + }, + { + "epoch": 0.21587329952867068, + "grad_norm": 0.4274177849292755, + "learning_rate": 9.121074858959997e-05, + "loss": 1.0273, + "step": 2416 + }, + { + "epoch": 0.21596265105993254, + "grad_norm": 0.4479106068611145, + "learning_rate": 9.12025524290973e-05, + "loss": 0.9998, + "step": 2417 + }, + { + "epoch": 0.21605200259119442, + "grad_norm": 0.45606303215026855, + "learning_rate": 9.119435281744169e-05, + "loss": 0.9753, + "step": 2418 + }, + { + "epoch": 0.21614135412245628, + "grad_norm": 0.40159735083580017, + "learning_rate": 9.118614975531991e-05, + "loss": 1.011, + "step": 2419 + }, + { + "epoch": 0.21623070565371813, + "grad_norm": 0.45181921124458313, + "learning_rate": 9.117794324341908e-05, + "loss": 1.0253, + "step": 2420 + }, + { + "epoch": 0.21632005718498, + "grad_norm": 0.41297322511672974, + "learning_rate": 9.116973328242658e-05, + "loss": 1.001, + "step": 2421 + }, + { + "epoch": 0.21640940871624187, + "grad_norm": 0.39003559947013855, + "learning_rate": 9.116151987303007e-05, + "loss": 1.0044, + "step": 2422 + }, + { + "epoch": 0.21649876024750375, + "grad_norm": 0.45136478543281555, + "learning_rate": 9.115330301591753e-05, + "loss": 0.9569, + "step": 2423 + }, + { + "epoch": 0.2165881117787656, + "grad_norm": 0.4354073107242584, + "learning_rate": 9.114508271177722e-05, + "loss": 0.9862, + "step": 2424 + }, + { + "epoch": 0.21667746331002746, + "grad_norm": 0.3921179175376892, + "learning_rate": 9.113685896129763e-05, + "loss": 0.9894, + "step": 2425 + }, + { + "epoch": 0.21676681484128935, + "grad_norm": 0.42139732837677, + "learning_rate": 9.112863176516762e-05, + "loss": 0.9777, + "step": 2426 + }, + { + "epoch": 0.2168561663725512, + "grad_norm": 0.5038270950317383, + "learning_rate": 9.11204011240763e-05, + "loss": 1.1157, + "step": 2427 + }, + { + "epoch": 0.21694551790381308, + "grad_norm": 0.5017008781433105, + "learning_rate": 9.111216703871308e-05, + "loss": 0.9634, + "step": 2428 + }, + { + "epoch": 0.21703486943507494, + "grad_norm": 0.5621154308319092, + "learning_rate": 9.110392950976764e-05, + "loss": 1.0222, + "step": 2429 + }, + { + "epoch": 0.21712422096633682, + "grad_norm": 0.43633437156677246, + "learning_rate": 9.109568853792998e-05, + "loss": 1.1109, + "step": 2430 + }, + { + "epoch": 0.21721357249759868, + "grad_norm": 0.48714277148246765, + "learning_rate": 9.108744412389034e-05, + "loss": 0.9751, + "step": 2431 + }, + { + "epoch": 0.21730292402886053, + "grad_norm": 0.4233207702636719, + "learning_rate": 9.107919626833931e-05, + "loss": 1.0543, + "step": 2432 + }, + { + "epoch": 0.21739227556012242, + "grad_norm": 0.3775826096534729, + "learning_rate": 9.107094497196771e-05, + "loss": 1.0672, + "step": 2433 + }, + { + "epoch": 0.21748162709138427, + "grad_norm": 0.47611549496650696, + "learning_rate": 9.106269023546667e-05, + "loss": 1.0062, + "step": 2434 + }, + { + "epoch": 0.21757097862264616, + "grad_norm": 0.40070483088493347, + "learning_rate": 9.105443205952765e-05, + "loss": 0.9908, + "step": 2435 + }, + { + "epoch": 0.217660330153908, + "grad_norm": 0.4242134392261505, + "learning_rate": 9.104617044484233e-05, + "loss": 1.0447, + "step": 2436 + }, + { + "epoch": 0.21774968168516987, + "grad_norm": 0.4634241461753845, + "learning_rate": 9.103790539210271e-05, + "loss": 1.0203, + "step": 2437 + }, + { + "epoch": 0.21783903321643175, + "grad_norm": 0.46753108501434326, + "learning_rate": 9.102963690200108e-05, + "loss": 0.892, + "step": 2438 + }, + { + "epoch": 0.2179283847476936, + "grad_norm": 0.47275984287261963, + "learning_rate": 9.102136497523002e-05, + "loss": 1.0528, + "step": 2439 + }, + { + "epoch": 0.2180177362789555, + "grad_norm": 0.4334014058113098, + "learning_rate": 9.101308961248238e-05, + "loss": 0.9923, + "step": 2440 + }, + { + "epoch": 0.21810708781021734, + "grad_norm": 0.41837531328201294, + "learning_rate": 9.100481081445132e-05, + "loss": 0.9946, + "step": 2441 + }, + { + "epoch": 0.21819643934147923, + "grad_norm": 0.4113021194934845, + "learning_rate": 9.099652858183028e-05, + "loss": 1.0441, + "step": 2442 + }, + { + "epoch": 0.21828579087274108, + "grad_norm": 0.4260026514530182, + "learning_rate": 9.098824291531296e-05, + "loss": 1.0988, + "step": 2443 + }, + { + "epoch": 0.21837514240400294, + "grad_norm": 0.4235369563102722, + "learning_rate": 9.097995381559341e-05, + "loss": 1.0867, + "step": 2444 + }, + { + "epoch": 0.21846449393526482, + "grad_norm": 0.4195026755332947, + "learning_rate": 9.097166128336592e-05, + "loss": 1.0309, + "step": 2445 + }, + { + "epoch": 0.21855384546652667, + "grad_norm": 0.4182673990726471, + "learning_rate": 9.096336531932506e-05, + "loss": 1.0566, + "step": 2446 + }, + { + "epoch": 0.21864319699778856, + "grad_norm": 0.40884897112846375, + "learning_rate": 9.095506592416572e-05, + "loss": 1.0218, + "step": 2447 + }, + { + "epoch": 0.2187325485290504, + "grad_norm": 0.4604688584804535, + "learning_rate": 9.094676309858305e-05, + "loss": 1.0578, + "step": 2448 + }, + { + "epoch": 0.2188219000603123, + "grad_norm": 0.4072805941104889, + "learning_rate": 9.093845684327251e-05, + "loss": 1.0117, + "step": 2449 + }, + { + "epoch": 0.21891125159157415, + "grad_norm": 0.3600502014160156, + "learning_rate": 9.093014715892984e-05, + "loss": 1.0826, + "step": 2450 + }, + { + "epoch": 0.219000603122836, + "grad_norm": 0.421089231967926, + "learning_rate": 9.092183404625107e-05, + "loss": 1.0072, + "step": 2451 + }, + { + "epoch": 0.2190899546540979, + "grad_norm": 0.4491601586341858, + "learning_rate": 9.09135175059325e-05, + "loss": 1.0409, + "step": 2452 + }, + { + "epoch": 0.21917930618535975, + "grad_norm": 0.4364774823188782, + "learning_rate": 9.090519753867072e-05, + "loss": 1.0142, + "step": 2453 + }, + { + "epoch": 0.21926865771662163, + "grad_norm": 0.4364645779132843, + "learning_rate": 9.089687414516265e-05, + "loss": 1.0297, + "step": 2454 + }, + { + "epoch": 0.21935800924788348, + "grad_norm": 0.4864216148853302, + "learning_rate": 9.088854732610543e-05, + "loss": 1.0635, + "step": 2455 + }, + { + "epoch": 0.21944736077914534, + "grad_norm": 0.4514232873916626, + "learning_rate": 9.088021708219652e-05, + "loss": 1.0224, + "step": 2456 + }, + { + "epoch": 0.21953671231040722, + "grad_norm": 0.3978760540485382, + "learning_rate": 9.087188341413369e-05, + "loss": 1.0442, + "step": 2457 + }, + { + "epoch": 0.21962606384166908, + "grad_norm": 0.36813050508499146, + "learning_rate": 9.086354632261496e-05, + "loss": 1.0114, + "step": 2458 + }, + { + "epoch": 0.21971541537293096, + "grad_norm": 0.4238460063934326, + "learning_rate": 9.085520580833866e-05, + "loss": 1.0065, + "step": 2459 + }, + { + "epoch": 0.21980476690419282, + "grad_norm": 0.57405024766922, + "learning_rate": 9.084686187200338e-05, + "loss": 0.9361, + "step": 2460 + }, + { + "epoch": 0.2198941184354547, + "grad_norm": 0.5002081990242004, + "learning_rate": 9.0838514514308e-05, + "loss": 1.0109, + "step": 2461 + }, + { + "epoch": 0.21998346996671655, + "grad_norm": 0.397339403629303, + "learning_rate": 9.083016373595174e-05, + "loss": 1.0188, + "step": 2462 + }, + { + "epoch": 0.2200728214979784, + "grad_norm": 0.4922274947166443, + "learning_rate": 9.082180953763406e-05, + "loss": 0.9552, + "step": 2463 + }, + { + "epoch": 0.2201621730292403, + "grad_norm": 0.49494630098342896, + "learning_rate": 9.08134519200547e-05, + "loss": 1.0058, + "step": 2464 + }, + { + "epoch": 0.22025152456050215, + "grad_norm": 0.44430068135261536, + "learning_rate": 9.080509088391369e-05, + "loss": 1.0486, + "step": 2465 + }, + { + "epoch": 0.22034087609176403, + "grad_norm": 0.4087202548980713, + "learning_rate": 9.079672642991137e-05, + "loss": 1.0006, + "step": 2466 + }, + { + "epoch": 0.2204302276230259, + "grad_norm": 0.49828004837036133, + "learning_rate": 9.078835855874835e-05, + "loss": 1.0229, + "step": 2467 + }, + { + "epoch": 0.22051957915428777, + "grad_norm": 0.42710915207862854, + "learning_rate": 9.077998727112554e-05, + "loss": 1.0254, + "step": 2468 + }, + { + "epoch": 0.22060893068554963, + "grad_norm": 0.40693971514701843, + "learning_rate": 9.077161256774409e-05, + "loss": 1.0551, + "step": 2469 + }, + { + "epoch": 0.22069828221681148, + "grad_norm": 0.4143005609512329, + "learning_rate": 9.076323444930551e-05, + "loss": 0.9634, + "step": 2470 + }, + { + "epoch": 0.22078763374807336, + "grad_norm": 0.4316956102848053, + "learning_rate": 9.075485291651154e-05, + "loss": 0.9414, + "step": 2471 + }, + { + "epoch": 0.22087698527933522, + "grad_norm": 0.42413267493247986, + "learning_rate": 9.07464679700642e-05, + "loss": 0.9748, + "step": 2472 + }, + { + "epoch": 0.2209663368105971, + "grad_norm": 0.4333771765232086, + "learning_rate": 9.073807961066588e-05, + "loss": 0.9875, + "step": 2473 + }, + { + "epoch": 0.22105568834185896, + "grad_norm": 0.4280893802642822, + "learning_rate": 9.072968783901913e-05, + "loss": 0.972, + "step": 2474 + }, + { + "epoch": 0.2211450398731208, + "grad_norm": 0.4792003333568573, + "learning_rate": 9.072129265582689e-05, + "loss": 1.0064, + "step": 2475 + }, + { + "epoch": 0.2212343914043827, + "grad_norm": 0.44787389039993286, + "learning_rate": 9.071289406179231e-05, + "loss": 0.9806, + "step": 2476 + }, + { + "epoch": 0.22132374293564455, + "grad_norm": 0.43630531430244446, + "learning_rate": 9.070449205761891e-05, + "loss": 0.9906, + "step": 2477 + }, + { + "epoch": 0.22141309446690643, + "grad_norm": 0.46949827671051025, + "learning_rate": 9.069608664401041e-05, + "loss": 1.0052, + "step": 2478 + }, + { + "epoch": 0.2215024459981683, + "grad_norm": 0.37613534927368164, + "learning_rate": 9.068767782167086e-05, + "loss": 0.9737, + "step": 2479 + }, + { + "epoch": 0.22159179752943017, + "grad_norm": 0.4791412353515625, + "learning_rate": 9.06792655913046e-05, + "loss": 0.9505, + "step": 2480 + }, + { + "epoch": 0.22168114906069203, + "grad_norm": 0.4607903063297272, + "learning_rate": 9.067084995361623e-05, + "loss": 1.1162, + "step": 2481 + }, + { + "epoch": 0.22177050059195388, + "grad_norm": 0.40373310446739197, + "learning_rate": 9.066243090931066e-05, + "loss": 0.9888, + "step": 2482 + }, + { + "epoch": 0.22185985212321577, + "grad_norm": 0.40429624915122986, + "learning_rate": 9.065400845909308e-05, + "loss": 1.0024, + "step": 2483 + }, + { + "epoch": 0.22194920365447762, + "grad_norm": 0.40631914138793945, + "learning_rate": 9.064558260366893e-05, + "loss": 0.9719, + "step": 2484 + }, + { + "epoch": 0.2220385551857395, + "grad_norm": 0.43395867943763733, + "learning_rate": 9.063715334374401e-05, + "loss": 0.9634, + "step": 2485 + }, + { + "epoch": 0.22212790671700136, + "grad_norm": 0.4469437003135681, + "learning_rate": 9.062872068002432e-05, + "loss": 0.9666, + "step": 2486 + }, + { + "epoch": 0.22221725824826322, + "grad_norm": 0.5009754300117493, + "learning_rate": 9.062028461321621e-05, + "loss": 0.9931, + "step": 2487 + }, + { + "epoch": 0.2223066097795251, + "grad_norm": 0.4312589764595032, + "learning_rate": 9.061184514402627e-05, + "loss": 1.0201, + "step": 2488 + }, + { + "epoch": 0.22239596131078695, + "grad_norm": 0.45867425203323364, + "learning_rate": 9.060340227316142e-05, + "loss": 1.0177, + "step": 2489 + }, + { + "epoch": 0.22248531284204884, + "grad_norm": 0.39511245489120483, + "learning_rate": 9.059495600132883e-05, + "loss": 1.0383, + "step": 2490 + }, + { + "epoch": 0.2225746643733107, + "grad_norm": 0.4442897439002991, + "learning_rate": 9.058650632923595e-05, + "loss": 0.9576, + "step": 2491 + }, + { + "epoch": 0.22266401590457258, + "grad_norm": 0.556317925453186, + "learning_rate": 9.057805325759057e-05, + "loss": 0.9748, + "step": 2492 + }, + { + "epoch": 0.22275336743583443, + "grad_norm": 0.49973076581954956, + "learning_rate": 9.056959678710067e-05, + "loss": 0.8914, + "step": 2493 + }, + { + "epoch": 0.2228427189670963, + "grad_norm": 0.5200818181037903, + "learning_rate": 9.056113691847461e-05, + "loss": 0.9384, + "step": 2494 + }, + { + "epoch": 0.22293207049835817, + "grad_norm": 0.4469231963157654, + "learning_rate": 9.055267365242099e-05, + "loss": 0.9844, + "step": 2495 + }, + { + "epoch": 0.22302142202962003, + "grad_norm": 0.4275433123111725, + "learning_rate": 9.054420698964868e-05, + "loss": 1.0414, + "step": 2496 + }, + { + "epoch": 0.2231107735608819, + "grad_norm": 0.36767879128456116, + "learning_rate": 9.053573693086687e-05, + "loss": 1.026, + "step": 2497 + }, + { + "epoch": 0.22320012509214376, + "grad_norm": 0.3746047019958496, + "learning_rate": 9.052726347678502e-05, + "loss": 1.0667, + "step": 2498 + }, + { + "epoch": 0.22328947662340565, + "grad_norm": 0.44350770115852356, + "learning_rate": 9.051878662811286e-05, + "loss": 1.0337, + "step": 2499 + }, + { + "epoch": 0.2233788281546675, + "grad_norm": 0.42601439356803894, + "learning_rate": 9.051030638556041e-05, + "loss": 1.0673, + "step": 2500 + }, + { + "epoch": 0.22346817968592936, + "grad_norm": 0.4272880554199219, + "learning_rate": 9.050182274983798e-05, + "loss": 1.0239, + "step": 2501 + }, + { + "epoch": 0.22355753121719124, + "grad_norm": 0.4652038812637329, + "learning_rate": 9.04933357216562e-05, + "loss": 0.959, + "step": 2502 + }, + { + "epoch": 0.2236468827484531, + "grad_norm": 0.4481859505176544, + "learning_rate": 9.048484530172592e-05, + "loss": 1.0601, + "step": 2503 + }, + { + "epoch": 0.22373623427971498, + "grad_norm": 0.4447591304779053, + "learning_rate": 9.04763514907583e-05, + "loss": 1.0246, + "step": 2504 + }, + { + "epoch": 0.22382558581097683, + "grad_norm": 0.37949299812316895, + "learning_rate": 9.046785428946481e-05, + "loss": 1.0761, + "step": 2505 + }, + { + "epoch": 0.2239149373422387, + "grad_norm": 0.44117259979248047, + "learning_rate": 9.045935369855716e-05, + "loss": 1.0082, + "step": 2506 + }, + { + "epoch": 0.22400428887350057, + "grad_norm": 0.4014883041381836, + "learning_rate": 9.045084971874738e-05, + "loss": 1.0867, + "step": 2507 + }, + { + "epoch": 0.22409364040476243, + "grad_norm": 0.5892843008041382, + "learning_rate": 9.044234235074775e-05, + "loss": 0.9446, + "step": 2508 + }, + { + "epoch": 0.2241829919360243, + "grad_norm": 0.4554404318332672, + "learning_rate": 9.043383159527087e-05, + "loss": 0.939, + "step": 2509 + }, + { + "epoch": 0.22427234346728617, + "grad_norm": 0.3739197552204132, + "learning_rate": 9.04253174530296e-05, + "loss": 1.0603, + "step": 2510 + }, + { + "epoch": 0.22436169499854805, + "grad_norm": 0.48913314938545227, + "learning_rate": 9.041679992473708e-05, + "loss": 0.9852, + "step": 2511 + }, + { + "epoch": 0.2244510465298099, + "grad_norm": 0.3724241256713867, + "learning_rate": 9.040827901110676e-05, + "loss": 1.0422, + "step": 2512 + }, + { + "epoch": 0.22454039806107176, + "grad_norm": 0.4462158679962158, + "learning_rate": 9.039975471285235e-05, + "loss": 1.0337, + "step": 2513 + }, + { + "epoch": 0.22462974959233364, + "grad_norm": 0.4481222629547119, + "learning_rate": 9.039122703068785e-05, + "loss": 1.0114, + "step": 2514 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.4646155536174774, + "learning_rate": 9.038269596532755e-05, + "loss": 1.0595, + "step": 2515 + }, + { + "epoch": 0.22480845265485738, + "grad_norm": 0.4418215751647949, + "learning_rate": 9.037416151748603e-05, + "loss": 0.9829, + "step": 2516 + }, + { + "epoch": 0.22489780418611924, + "grad_norm": 0.4447876811027527, + "learning_rate": 9.036562368787811e-05, + "loss": 0.9914, + "step": 2517 + }, + { + "epoch": 0.2249871557173811, + "grad_norm": 0.47525134682655334, + "learning_rate": 9.035708247721895e-05, + "loss": 1.0124, + "step": 2518 + }, + { + "epoch": 0.22507650724864298, + "grad_norm": 0.5248557925224304, + "learning_rate": 9.034853788622393e-05, + "loss": 0.9379, + "step": 2519 + }, + { + "epoch": 0.22516585877990483, + "grad_norm": 0.4031184911727905, + "learning_rate": 9.033998991560881e-05, + "loss": 0.9834, + "step": 2520 + }, + { + "epoch": 0.22525521031116671, + "grad_norm": 0.38345515727996826, + "learning_rate": 9.033143856608952e-05, + "loss": 1.043, + "step": 2521 + }, + { + "epoch": 0.22534456184242857, + "grad_norm": 0.40574783086776733, + "learning_rate": 9.032288383838236e-05, + "loss": 1.0062, + "step": 2522 + }, + { + "epoch": 0.22543391337369045, + "grad_norm": 0.5696341395378113, + "learning_rate": 9.031432573320387e-05, + "loss": 0.9409, + "step": 2523 + }, + { + "epoch": 0.2255232649049523, + "grad_norm": 0.39741194248199463, + "learning_rate": 9.030576425127087e-05, + "loss": 1.0018, + "step": 2524 + }, + { + "epoch": 0.22561261643621416, + "grad_norm": 0.41686055064201355, + "learning_rate": 9.029719939330047e-05, + "loss": 0.9734, + "step": 2525 + }, + { + "epoch": 0.22570196796747605, + "grad_norm": 0.39542368054389954, + "learning_rate": 9.028863116001012e-05, + "loss": 0.9796, + "step": 2526 + }, + { + "epoch": 0.2257913194987379, + "grad_norm": 0.4378524720668793, + "learning_rate": 9.028005955211744e-05, + "loss": 0.9701, + "step": 2527 + }, + { + "epoch": 0.22588067102999979, + "grad_norm": 0.5109492540359497, + "learning_rate": 9.027148457034043e-05, + "loss": 1.0008, + "step": 2528 + }, + { + "epoch": 0.22597002256126164, + "grad_norm": 0.35746294260025024, + "learning_rate": 9.02629062153973e-05, + "loss": 1.037, + "step": 2529 + }, + { + "epoch": 0.22605937409252352, + "grad_norm": 0.44290691614151, + "learning_rate": 9.025432448800662e-05, + "loss": 1.0122, + "step": 2530 + }, + { + "epoch": 0.22614872562378538, + "grad_norm": 0.4638045132160187, + "learning_rate": 9.02457393888872e-05, + "loss": 1.0214, + "step": 2531 + }, + { + "epoch": 0.22623807715504723, + "grad_norm": 0.41130268573760986, + "learning_rate": 9.023715091875809e-05, + "loss": 1.0102, + "step": 2532 + }, + { + "epoch": 0.22632742868630912, + "grad_norm": 0.4205174446105957, + "learning_rate": 9.022855907833871e-05, + "loss": 0.9937, + "step": 2533 + }, + { + "epoch": 0.22641678021757097, + "grad_norm": 0.4878970980644226, + "learning_rate": 9.02199638683487e-05, + "loss": 0.9084, + "step": 2534 + }, + { + "epoch": 0.22650613174883286, + "grad_norm": 0.44246265292167664, + "learning_rate": 9.0211365289508e-05, + "loss": 0.946, + "step": 2535 + }, + { + "epoch": 0.2265954832800947, + "grad_norm": 0.4307198226451874, + "learning_rate": 9.020276334253683e-05, + "loss": 1.0387, + "step": 2536 + }, + { + "epoch": 0.22668483481135657, + "grad_norm": 0.45680615305900574, + "learning_rate": 9.019415802815569e-05, + "loss": 1.0461, + "step": 2537 + }, + { + "epoch": 0.22677418634261845, + "grad_norm": 0.3967481851577759, + "learning_rate": 9.01855493470854e-05, + "loss": 1.0014, + "step": 2538 + }, + { + "epoch": 0.2268635378738803, + "grad_norm": 0.49700799584388733, + "learning_rate": 9.0176937300047e-05, + "loss": 1.0862, + "step": 2539 + }, + { + "epoch": 0.2269528894051422, + "grad_norm": 0.4415007531642914, + "learning_rate": 9.016832188776183e-05, + "loss": 1.049, + "step": 2540 + }, + { + "epoch": 0.22704224093640404, + "grad_norm": 0.39791885018348694, + "learning_rate": 9.015970311095156e-05, + "loss": 0.9441, + "step": 2541 + }, + { + "epoch": 0.22713159246766593, + "grad_norm": 0.41884180903434753, + "learning_rate": 9.015108097033806e-05, + "loss": 1.0447, + "step": 2542 + }, + { + "epoch": 0.22722094399892778, + "grad_norm": 0.4121531546115875, + "learning_rate": 9.014245546664357e-05, + "loss": 1.0104, + "step": 2543 + }, + { + "epoch": 0.22731029553018964, + "grad_norm": 0.43119367957115173, + "learning_rate": 9.013382660059053e-05, + "loss": 0.9879, + "step": 2544 + }, + { + "epoch": 0.22739964706145152, + "grad_norm": 0.4344024360179901, + "learning_rate": 9.012519437290172e-05, + "loss": 1.0233, + "step": 2545 + }, + { + "epoch": 0.22748899859271338, + "grad_norm": 0.49549728631973267, + "learning_rate": 9.011655878430019e-05, + "loss": 0.9671, + "step": 2546 + }, + { + "epoch": 0.22757835012397526, + "grad_norm": 0.4547901153564453, + "learning_rate": 9.010791983550923e-05, + "loss": 0.9462, + "step": 2547 + }, + { + "epoch": 0.22766770165523711, + "grad_norm": 0.40302136540412903, + "learning_rate": 9.009927752725247e-05, + "loss": 1.0536, + "step": 2548 + }, + { + "epoch": 0.22775705318649897, + "grad_norm": 0.4038864076137543, + "learning_rate": 9.009063186025379e-05, + "loss": 1.0665, + "step": 2549 + }, + { + "epoch": 0.22784640471776085, + "grad_norm": 0.4060629606246948, + "learning_rate": 9.008198283523737e-05, + "loss": 0.9799, + "step": 2550 + }, + { + "epoch": 0.2279357562490227, + "grad_norm": 0.49202004075050354, + "learning_rate": 9.007333045292764e-05, + "loss": 1.0176, + "step": 2551 + }, + { + "epoch": 0.2280251077802846, + "grad_norm": 0.45856913924217224, + "learning_rate": 9.006467471404932e-05, + "loss": 0.9464, + "step": 2552 + }, + { + "epoch": 0.22811445931154645, + "grad_norm": 0.44552847743034363, + "learning_rate": 9.005601561932745e-05, + "loss": 1.0197, + "step": 2553 + }, + { + "epoch": 0.22820381084280833, + "grad_norm": 0.40756505727767944, + "learning_rate": 9.00473531694873e-05, + "loss": 0.9694, + "step": 2554 + }, + { + "epoch": 0.22829316237407019, + "grad_norm": 0.4505196511745453, + "learning_rate": 9.003868736525444e-05, + "loss": 1.0085, + "step": 2555 + }, + { + "epoch": 0.22838251390533204, + "grad_norm": 0.40980064868927, + "learning_rate": 9.003001820735474e-05, + "loss": 1.0383, + "step": 2556 + }, + { + "epoch": 0.22847186543659392, + "grad_norm": 0.37999671697616577, + "learning_rate": 9.002134569651433e-05, + "loss": 1.026, + "step": 2557 + }, + { + "epoch": 0.22856121696785578, + "grad_norm": 0.4312988817691803, + "learning_rate": 9.00126698334596e-05, + "loss": 1.0029, + "step": 2558 + }, + { + "epoch": 0.22865056849911766, + "grad_norm": 0.45107796788215637, + "learning_rate": 9.000399061891728e-05, + "loss": 1.0555, + "step": 2559 + }, + { + "epoch": 0.22873992003037952, + "grad_norm": 0.39357370138168335, + "learning_rate": 8.999530805361434e-05, + "loss": 1.0421, + "step": 2560 + }, + { + "epoch": 0.2288292715616414, + "grad_norm": 0.4320610761642456, + "learning_rate": 8.998662213827802e-05, + "loss": 1.0491, + "step": 2561 + }, + { + "epoch": 0.22891862309290326, + "grad_norm": 0.4132126271724701, + "learning_rate": 8.997793287363588e-05, + "loss": 0.9682, + "step": 2562 + }, + { + "epoch": 0.2290079746241651, + "grad_norm": 0.38975366950035095, + "learning_rate": 8.996924026041573e-05, + "loss": 1.0131, + "step": 2563 + }, + { + "epoch": 0.229097326155427, + "grad_norm": 0.42048487067222595, + "learning_rate": 8.996054429934567e-05, + "loss": 1.0186, + "step": 2564 + }, + { + "epoch": 0.22918667768668885, + "grad_norm": 0.40763533115386963, + "learning_rate": 8.995184499115405e-05, + "loss": 0.9722, + "step": 2565 + }, + { + "epoch": 0.22927602921795073, + "grad_norm": 0.47947773337364197, + "learning_rate": 8.994314233656958e-05, + "loss": 0.9427, + "step": 2566 + }, + { + "epoch": 0.2293653807492126, + "grad_norm": 0.4561311602592468, + "learning_rate": 8.993443633632116e-05, + "loss": 0.9793, + "step": 2567 + }, + { + "epoch": 0.22945473228047444, + "grad_norm": 0.3957562744617462, + "learning_rate": 8.992572699113804e-05, + "loss": 1.0227, + "step": 2568 + }, + { + "epoch": 0.22954408381173633, + "grad_norm": 0.3969632685184479, + "learning_rate": 8.99170143017497e-05, + "loss": 1.001, + "step": 2569 + }, + { + "epoch": 0.22963343534299818, + "grad_norm": 0.4251578748226166, + "learning_rate": 8.990829826888592e-05, + "loss": 1.0243, + "step": 2570 + }, + { + "epoch": 0.22972278687426007, + "grad_norm": 0.48977455496788025, + "learning_rate": 8.989957889327678e-05, + "loss": 0.9637, + "step": 2571 + }, + { + "epoch": 0.22981213840552192, + "grad_norm": 0.42375892400741577, + "learning_rate": 8.98908561756526e-05, + "loss": 0.9662, + "step": 2572 + }, + { + "epoch": 0.2299014899367838, + "grad_norm": 0.3831101655960083, + "learning_rate": 8.988213011674402e-05, + "loss": 1.0247, + "step": 2573 + }, + { + "epoch": 0.22999084146804566, + "grad_norm": 0.4255763590335846, + "learning_rate": 8.987340071728192e-05, + "loss": 1.0308, + "step": 2574 + }, + { + "epoch": 0.23008019299930751, + "grad_norm": 0.4933546185493469, + "learning_rate": 8.986466797799749e-05, + "loss": 0.9334, + "step": 2575 + }, + { + "epoch": 0.2301695445305694, + "grad_norm": 0.4253925085067749, + "learning_rate": 8.985593189962221e-05, + "loss": 1.0554, + "step": 2576 + }, + { + "epoch": 0.23025889606183125, + "grad_norm": 0.4756745398044586, + "learning_rate": 8.984719248288778e-05, + "loss": 0.9267, + "step": 2577 + }, + { + "epoch": 0.23034824759309314, + "grad_norm": 0.41270750761032104, + "learning_rate": 8.983844972852625e-05, + "loss": 0.9618, + "step": 2578 + }, + { + "epoch": 0.230437599124355, + "grad_norm": 0.4245428144931793, + "learning_rate": 8.982970363726989e-05, + "loss": 1.0846, + "step": 2579 + }, + { + "epoch": 0.23052695065561685, + "grad_norm": 0.595262348651886, + "learning_rate": 8.98209542098513e-05, + "loss": 0.9651, + "step": 2580 + }, + { + "epoch": 0.23061630218687873, + "grad_norm": 0.4939884543418884, + "learning_rate": 8.981220144700335e-05, + "loss": 1.0355, + "step": 2581 + }, + { + "epoch": 0.23070565371814059, + "grad_norm": 0.430448979139328, + "learning_rate": 8.980344534945915e-05, + "loss": 1.0304, + "step": 2582 + }, + { + "epoch": 0.23079500524940247, + "grad_norm": 0.4117549657821655, + "learning_rate": 8.979468591795213e-05, + "loss": 1.024, + "step": 2583 + }, + { + "epoch": 0.23088435678066432, + "grad_norm": 0.3863102197647095, + "learning_rate": 8.978592315321597e-05, + "loss": 1.0221, + "step": 2584 + }, + { + "epoch": 0.2309737083119262, + "grad_norm": 0.37732282280921936, + "learning_rate": 8.977715705598469e-05, + "loss": 1.0347, + "step": 2585 + }, + { + "epoch": 0.23106305984318806, + "grad_norm": 0.3825008273124695, + "learning_rate": 8.976838762699249e-05, + "loss": 1.1106, + "step": 2586 + }, + { + "epoch": 0.23115241137444992, + "grad_norm": 0.38581812381744385, + "learning_rate": 8.975961486697392e-05, + "loss": 0.9931, + "step": 2587 + }, + { + "epoch": 0.2312417629057118, + "grad_norm": 0.48402634263038635, + "learning_rate": 8.975083877666382e-05, + "loss": 0.9774, + "step": 2588 + }, + { + "epoch": 0.23133111443697366, + "grad_norm": 0.4185434877872467, + "learning_rate": 8.974205935679725e-05, + "loss": 1.0566, + "step": 2589 + }, + { + "epoch": 0.23142046596823554, + "grad_norm": 0.37079721689224243, + "learning_rate": 8.973327660810958e-05, + "loss": 1.0591, + "step": 2590 + }, + { + "epoch": 0.2315098174994974, + "grad_norm": 0.44048118591308594, + "learning_rate": 8.972449053133647e-05, + "loss": 1.0344, + "step": 2591 + }, + { + "epoch": 0.23159916903075928, + "grad_norm": 0.3690267503261566, + "learning_rate": 8.971570112721385e-05, + "loss": 1.0284, + "step": 2592 + }, + { + "epoch": 0.23168852056202113, + "grad_norm": 0.38595226407051086, + "learning_rate": 8.970690839647792e-05, + "loss": 1.0338, + "step": 2593 + }, + { + "epoch": 0.231777872093283, + "grad_norm": 0.427207350730896, + "learning_rate": 8.969811233986519e-05, + "loss": 0.9762, + "step": 2594 + }, + { + "epoch": 0.23186722362454487, + "grad_norm": 0.4339447617530823, + "learning_rate": 8.968931295811236e-05, + "loss": 0.9732, + "step": 2595 + }, + { + "epoch": 0.23195657515580673, + "grad_norm": 0.5293307304382324, + "learning_rate": 8.968051025195653e-05, + "loss": 0.9787, + "step": 2596 + }, + { + "epoch": 0.2320459266870686, + "grad_norm": 0.44257649779319763, + "learning_rate": 8.9671704222135e-05, + "loss": 0.9879, + "step": 2597 + }, + { + "epoch": 0.23213527821833047, + "grad_norm": 0.4917474091053009, + "learning_rate": 8.966289486938538e-05, + "loss": 1.0853, + "step": 2598 + }, + { + "epoch": 0.23222462974959232, + "grad_norm": 0.41678449511528015, + "learning_rate": 8.965408219444554e-05, + "loss": 1.0143, + "step": 2599 + }, + { + "epoch": 0.2323139812808542, + "grad_norm": 0.49743688106536865, + "learning_rate": 8.964526619805362e-05, + "loss": 0.9425, + "step": 2600 + }, + { + "epoch": 0.23240333281211606, + "grad_norm": 0.4517749845981598, + "learning_rate": 8.963644688094807e-05, + "loss": 1.0329, + "step": 2601 + }, + { + "epoch": 0.23249268434337794, + "grad_norm": 0.48534515500068665, + "learning_rate": 8.96276242438676e-05, + "loss": 0.9492, + "step": 2602 + }, + { + "epoch": 0.2325820358746398, + "grad_norm": 0.44116154313087463, + "learning_rate": 8.96187982875512e-05, + "loss": 0.9327, + "step": 2603 + }, + { + "epoch": 0.23267138740590168, + "grad_norm": 0.377189576625824, + "learning_rate": 8.960996901273815e-05, + "loss": 1.0739, + "step": 2604 + }, + { + "epoch": 0.23276073893716354, + "grad_norm": 0.40555354952812195, + "learning_rate": 8.960113642016797e-05, + "loss": 1.0098, + "step": 2605 + }, + { + "epoch": 0.2328500904684254, + "grad_norm": 0.64093416929245, + "learning_rate": 8.95923005105805e-05, + "loss": 0.8996, + "step": 2606 + }, + { + "epoch": 0.23293944199968727, + "grad_norm": 0.4546898305416107, + "learning_rate": 8.958346128471584e-05, + "loss": 0.979, + "step": 2607 + }, + { + "epoch": 0.23302879353094913, + "grad_norm": 0.4061562120914459, + "learning_rate": 8.957461874331436e-05, + "loss": 1.0129, + "step": 2608 + }, + { + "epoch": 0.233118145062211, + "grad_norm": 0.45059117674827576, + "learning_rate": 8.956577288711673e-05, + "loss": 1.0107, + "step": 2609 + }, + { + "epoch": 0.23320749659347287, + "grad_norm": 0.4028307795524597, + "learning_rate": 8.955692371686388e-05, + "loss": 1.0118, + "step": 2610 + }, + { + "epoch": 0.23329684812473475, + "grad_norm": 0.5479282736778259, + "learning_rate": 8.954807123329704e-05, + "loss": 0.9751, + "step": 2611 + }, + { + "epoch": 0.2333861996559966, + "grad_norm": 0.4075985550880432, + "learning_rate": 8.953921543715767e-05, + "loss": 1.0192, + "step": 2612 + }, + { + "epoch": 0.23347555118725846, + "grad_norm": 0.41962021589279175, + "learning_rate": 8.953035632918754e-05, + "loss": 0.9887, + "step": 2613 + }, + { + "epoch": 0.23356490271852035, + "grad_norm": 0.46383902430534363, + "learning_rate": 8.952149391012872e-05, + "loss": 1.0025, + "step": 2614 + }, + { + "epoch": 0.2336542542497822, + "grad_norm": 0.51430743932724, + "learning_rate": 8.95126281807235e-05, + "loss": 1.0262, + "step": 2615 + }, + { + "epoch": 0.23374360578104408, + "grad_norm": 0.45170411467552185, + "learning_rate": 8.95037591417145e-05, + "loss": 0.9814, + "step": 2616 + }, + { + "epoch": 0.23383295731230594, + "grad_norm": 0.4477570652961731, + "learning_rate": 8.94948867938446e-05, + "loss": 1.0245, + "step": 2617 + }, + { + "epoch": 0.2339223088435678, + "grad_norm": 0.3824307918548584, + "learning_rate": 8.948601113785693e-05, + "loss": 1.0084, + "step": 2618 + }, + { + "epoch": 0.23401166037482968, + "grad_norm": 0.5118735432624817, + "learning_rate": 8.947713217449495e-05, + "loss": 0.9164, + "step": 2619 + }, + { + "epoch": 0.23410101190609153, + "grad_norm": 0.4788876175880432, + "learning_rate": 8.946824990450236e-05, + "loss": 1.0392, + "step": 2620 + }, + { + "epoch": 0.23419036343735342, + "grad_norm": 0.43572187423706055, + "learning_rate": 8.945936432862312e-05, + "loss": 0.9694, + "step": 2621 + }, + { + "epoch": 0.23427971496861527, + "grad_norm": 0.437338650226593, + "learning_rate": 8.945047544760153e-05, + "loss": 0.9954, + "step": 2622 + }, + { + "epoch": 0.23436906649987715, + "grad_norm": 0.40690669417381287, + "learning_rate": 8.944158326218208e-05, + "loss": 1.0038, + "step": 2623 + }, + { + "epoch": 0.234458418031139, + "grad_norm": 0.40458807349205017, + "learning_rate": 8.943268777310964e-05, + "loss": 0.9944, + "step": 2624 + }, + { + "epoch": 0.23454776956240087, + "grad_norm": 0.396119624376297, + "learning_rate": 8.942378898112928e-05, + "loss": 0.9914, + "step": 2625 + }, + { + "epoch": 0.23463712109366275, + "grad_norm": 0.4142768085002899, + "learning_rate": 8.941488688698634e-05, + "loss": 1.1052, + "step": 2626 + }, + { + "epoch": 0.2347264726249246, + "grad_norm": 0.516160786151886, + "learning_rate": 8.940598149142652e-05, + "loss": 1.0511, + "step": 2627 + }, + { + "epoch": 0.2348158241561865, + "grad_norm": 0.41457611322402954, + "learning_rate": 8.93970727951957e-05, + "loss": 0.9767, + "step": 2628 + }, + { + "epoch": 0.23490517568744834, + "grad_norm": 0.41176801919937134, + "learning_rate": 8.938816079904009e-05, + "loss": 1.0566, + "step": 2629 + }, + { + "epoch": 0.2349945272187102, + "grad_norm": 0.42432960867881775, + "learning_rate": 8.937924550370618e-05, + "loss": 1.0717, + "step": 2630 + }, + { + "epoch": 0.23508387874997208, + "grad_norm": 0.5278024077415466, + "learning_rate": 8.937032690994068e-05, + "loss": 0.9483, + "step": 2631 + }, + { + "epoch": 0.23517323028123394, + "grad_norm": 0.39076733589172363, + "learning_rate": 8.936140501849066e-05, + "loss": 1.0294, + "step": 2632 + }, + { + "epoch": 0.23526258181249582, + "grad_norm": 0.41586795449256897, + "learning_rate": 8.935247983010339e-05, + "loss": 0.9949, + "step": 2633 + }, + { + "epoch": 0.23535193334375767, + "grad_norm": 0.4252259135246277, + "learning_rate": 8.93435513455265e-05, + "loss": 0.9886, + "step": 2634 + }, + { + "epoch": 0.23544128487501956, + "grad_norm": 0.41890689730644226, + "learning_rate": 8.93346195655078e-05, + "loss": 1.0233, + "step": 2635 + }, + { + "epoch": 0.2355306364062814, + "grad_norm": 0.38035479187965393, + "learning_rate": 8.932568449079541e-05, + "loss": 1.0627, + "step": 2636 + }, + { + "epoch": 0.23561998793754327, + "grad_norm": 0.414369136095047, + "learning_rate": 8.931674612213778e-05, + "loss": 0.974, + "step": 2637 + }, + { + "epoch": 0.23570933946880515, + "grad_norm": 0.3929893374443054, + "learning_rate": 8.930780446028359e-05, + "loss": 1.031, + "step": 2638 + }, + { + "epoch": 0.235798691000067, + "grad_norm": 0.45517128705978394, + "learning_rate": 8.929885950598177e-05, + "loss": 0.996, + "step": 2639 + }, + { + "epoch": 0.2358880425313289, + "grad_norm": 0.39736053347587585, + "learning_rate": 8.928991125998157e-05, + "loss": 1.001, + "step": 2640 + }, + { + "epoch": 0.23597739406259075, + "grad_norm": 0.38671961426734924, + "learning_rate": 8.92809597230325e-05, + "loss": 1.0809, + "step": 2641 + }, + { + "epoch": 0.23606674559385263, + "grad_norm": 0.4453861117362976, + "learning_rate": 8.927200489588435e-05, + "loss": 1.0721, + "step": 2642 + }, + { + "epoch": 0.23615609712511448, + "grad_norm": 0.4946734309196472, + "learning_rate": 8.926304677928718e-05, + "loss": 0.9619, + "step": 2643 + }, + { + "epoch": 0.23624544865637634, + "grad_norm": 0.34986773133277893, + "learning_rate": 8.925408537399133e-05, + "loss": 1.0727, + "step": 2644 + }, + { + "epoch": 0.23633480018763822, + "grad_norm": 0.5045349597930908, + "learning_rate": 8.924512068074742e-05, + "loss": 0.9678, + "step": 2645 + }, + { + "epoch": 0.23642415171890008, + "grad_norm": 0.4377862215042114, + "learning_rate": 8.923615270030632e-05, + "loss": 0.9723, + "step": 2646 + }, + { + "epoch": 0.23651350325016196, + "grad_norm": 0.5501147508621216, + "learning_rate": 8.922718143341921e-05, + "loss": 0.9724, + "step": 2647 + }, + { + "epoch": 0.23660285478142382, + "grad_norm": 0.42605921626091003, + "learning_rate": 8.921820688083753e-05, + "loss": 1.0548, + "step": 2648 + }, + { + "epoch": 0.23669220631268567, + "grad_norm": 0.36013084650039673, + "learning_rate": 8.920922904331297e-05, + "loss": 1.0721, + "step": 2649 + }, + { + "epoch": 0.23678155784394755, + "grad_norm": 0.4354332685470581, + "learning_rate": 8.920024792159754e-05, + "loss": 0.9691, + "step": 2650 + }, + { + "epoch": 0.2368709093752094, + "grad_norm": 0.45081332325935364, + "learning_rate": 8.919126351644351e-05, + "loss": 0.8911, + "step": 2651 + }, + { + "epoch": 0.2369602609064713, + "grad_norm": 0.41693955659866333, + "learning_rate": 8.918227582860341e-05, + "loss": 1.0437, + "step": 2652 + }, + { + "epoch": 0.23704961243773315, + "grad_norm": 0.4205388128757477, + "learning_rate": 8.917328485883005e-05, + "loss": 1.0024, + "step": 2653 + }, + { + "epoch": 0.23713896396899503, + "grad_norm": 0.3820997476577759, + "learning_rate": 8.916429060787654e-05, + "loss": 1.0151, + "step": 2654 + }, + { + "epoch": 0.2372283155002569, + "grad_norm": 0.5409601330757141, + "learning_rate": 8.915529307649621e-05, + "loss": 0.931, + "step": 2655 + }, + { + "epoch": 0.23731766703151874, + "grad_norm": 0.40477868914604187, + "learning_rate": 8.914629226544273e-05, + "loss": 1.0052, + "step": 2656 + }, + { + "epoch": 0.23740701856278063, + "grad_norm": 0.5156925320625305, + "learning_rate": 8.913728817547002e-05, + "loss": 0.9712, + "step": 2657 + }, + { + "epoch": 0.23749637009404248, + "grad_norm": 0.4694739282131195, + "learning_rate": 8.912828080733223e-05, + "loss": 0.9844, + "step": 2658 + }, + { + "epoch": 0.23758572162530436, + "grad_norm": 0.520553708076477, + "learning_rate": 8.911927016178385e-05, + "loss": 0.9508, + "step": 2659 + }, + { + "epoch": 0.23767507315656622, + "grad_norm": 0.4804941415786743, + "learning_rate": 8.911025623957961e-05, + "loss": 0.9502, + "step": 2660 + }, + { + "epoch": 0.23776442468782807, + "grad_norm": 0.49052369594573975, + "learning_rate": 8.910123904147452e-05, + "loss": 0.9866, + "step": 2661 + }, + { + "epoch": 0.23785377621908996, + "grad_norm": 0.45931488275527954, + "learning_rate": 8.909221856822388e-05, + "loss": 0.9628, + "step": 2662 + }, + { + "epoch": 0.2379431277503518, + "grad_norm": 0.43489325046539307, + "learning_rate": 8.908319482058325e-05, + "loss": 1.0695, + "step": 2663 + }, + { + "epoch": 0.2380324792816137, + "grad_norm": 0.47782567143440247, + "learning_rate": 8.907416779930843e-05, + "loss": 0.9529, + "step": 2664 + }, + { + "epoch": 0.23812183081287555, + "grad_norm": 0.4324779808521271, + "learning_rate": 8.906513750515559e-05, + "loss": 0.9556, + "step": 2665 + }, + { + "epoch": 0.23821118234413743, + "grad_norm": 0.4681794047355652, + "learning_rate": 8.905610393888106e-05, + "loss": 0.9987, + "step": 2666 + }, + { + "epoch": 0.2383005338753993, + "grad_norm": 0.443891704082489, + "learning_rate": 8.904706710124152e-05, + "loss": 1.0109, + "step": 2667 + }, + { + "epoch": 0.23838988540666115, + "grad_norm": 0.46368691325187683, + "learning_rate": 8.90380269929939e-05, + "loss": 0.9511, + "step": 2668 + }, + { + "epoch": 0.23847923693792303, + "grad_norm": 0.5165659189224243, + "learning_rate": 8.90289836148954e-05, + "loss": 0.9759, + "step": 2669 + }, + { + "epoch": 0.23856858846918488, + "grad_norm": 0.42744940519332886, + "learning_rate": 8.90199369677035e-05, + "loss": 0.9876, + "step": 2670 + }, + { + "epoch": 0.23865794000044677, + "grad_norm": 0.41597381234169006, + "learning_rate": 8.901088705217598e-05, + "loss": 1.0021, + "step": 2671 + }, + { + "epoch": 0.23874729153170862, + "grad_norm": 0.3914920687675476, + "learning_rate": 8.900183386907082e-05, + "loss": 1.1311, + "step": 2672 + }, + { + "epoch": 0.2388366430629705, + "grad_norm": 0.49854883551597595, + "learning_rate": 8.899277741914633e-05, + "loss": 0.9727, + "step": 2673 + }, + { + "epoch": 0.23892599459423236, + "grad_norm": 0.4740012586116791, + "learning_rate": 8.898371770316111e-05, + "loss": 0.9915, + "step": 2674 + }, + { + "epoch": 0.23901534612549422, + "grad_norm": 0.4568859040737152, + "learning_rate": 8.897465472187401e-05, + "loss": 0.981, + "step": 2675 + }, + { + "epoch": 0.2391046976567561, + "grad_norm": 0.4102030098438263, + "learning_rate": 8.896558847604414e-05, + "loss": 0.9928, + "step": 2676 + }, + { + "epoch": 0.23919404918801795, + "grad_norm": 0.50264573097229, + "learning_rate": 8.895651896643088e-05, + "loss": 0.9527, + "step": 2677 + }, + { + "epoch": 0.23928340071927984, + "grad_norm": 0.4080445170402527, + "learning_rate": 8.894744619379391e-05, + "loss": 1.1131, + "step": 2678 + }, + { + "epoch": 0.2393727522505417, + "grad_norm": 0.45948582887649536, + "learning_rate": 8.893837015889317e-05, + "loss": 0.9385, + "step": 2679 + }, + { + "epoch": 0.23946210378180355, + "grad_norm": 0.4374150335788727, + "learning_rate": 8.892929086248888e-05, + "loss": 0.9693, + "step": 2680 + }, + { + "epoch": 0.23955145531306543, + "grad_norm": 0.4575091302394867, + "learning_rate": 8.892020830534152e-05, + "loss": 0.9821, + "step": 2681 + }, + { + "epoch": 0.2396408068443273, + "grad_norm": 0.39326122403144836, + "learning_rate": 8.891112248821186e-05, + "loss": 1.0787, + "step": 2682 + }, + { + "epoch": 0.23973015837558917, + "grad_norm": 0.4379619061946869, + "learning_rate": 8.890203341186092e-05, + "loss": 1.0188, + "step": 2683 + }, + { + "epoch": 0.23981950990685102, + "grad_norm": 0.47353631258010864, + "learning_rate": 8.889294107705002e-05, + "loss": 1.0197, + "step": 2684 + }, + { + "epoch": 0.2399088614381129, + "grad_norm": 0.4937237799167633, + "learning_rate": 8.888384548454075e-05, + "loss": 0.9871, + "step": 2685 + }, + { + "epoch": 0.23999821296937476, + "grad_norm": 0.39890146255493164, + "learning_rate": 8.887474663509493e-05, + "loss": 1.0274, + "step": 2686 + }, + { + "epoch": 0.24008756450063662, + "grad_norm": 0.3992265462875366, + "learning_rate": 8.886564452947471e-05, + "loss": 1.0258, + "step": 2687 + }, + { + "epoch": 0.2401769160318985, + "grad_norm": 0.4458499550819397, + "learning_rate": 8.885653916844248e-05, + "loss": 0.9608, + "step": 2688 + }, + { + "epoch": 0.24026626756316036, + "grad_norm": 0.39621952176094055, + "learning_rate": 8.884743055276092e-05, + "loss": 0.9997, + "step": 2689 + }, + { + "epoch": 0.24035561909442224, + "grad_norm": 0.44387656450271606, + "learning_rate": 8.883831868319297e-05, + "loss": 1.0045, + "step": 2690 + }, + { + "epoch": 0.2404449706256841, + "grad_norm": 0.38763174414634705, + "learning_rate": 8.882920356050184e-05, + "loss": 1.0502, + "step": 2691 + }, + { + "epoch": 0.24053432215694595, + "grad_norm": 0.4442698657512665, + "learning_rate": 8.882008518545101e-05, + "loss": 0.9964, + "step": 2692 + }, + { + "epoch": 0.24062367368820783, + "grad_norm": 0.39915651082992554, + "learning_rate": 8.881096355880428e-05, + "loss": 1.0732, + "step": 2693 + }, + { + "epoch": 0.2407130252194697, + "grad_norm": 0.458362877368927, + "learning_rate": 8.880183868132563e-05, + "loss": 1.0264, + "step": 2694 + }, + { + "epoch": 0.24080237675073157, + "grad_norm": 0.4793623685836792, + "learning_rate": 8.87927105537794e-05, + "loss": 0.931, + "step": 2695 + }, + { + "epoch": 0.24089172828199343, + "grad_norm": 0.4859834909439087, + "learning_rate": 8.878357917693016e-05, + "loss": 0.9891, + "step": 2696 + }, + { + "epoch": 0.2409810798132553, + "grad_norm": 0.4228796362876892, + "learning_rate": 8.877444455154278e-05, + "loss": 1.0693, + "step": 2697 + }, + { + "epoch": 0.24107043134451717, + "grad_norm": 0.37542724609375, + "learning_rate": 8.876530667838234e-05, + "loss": 1.0717, + "step": 2698 + }, + { + "epoch": 0.24115978287577902, + "grad_norm": 0.44685491919517517, + "learning_rate": 8.875616555821426e-05, + "loss": 1.0261, + "step": 2699 + }, + { + "epoch": 0.2412491344070409, + "grad_norm": 0.37416496872901917, + "learning_rate": 8.874702119180421e-05, + "loss": 1.1008, + "step": 2700 + }, + { + "epoch": 0.24133848593830276, + "grad_norm": 0.3862450420856476, + "learning_rate": 8.873787357991812e-05, + "loss": 1.0433, + "step": 2701 + }, + { + "epoch": 0.24142783746956464, + "grad_norm": 0.4019640386104584, + "learning_rate": 8.87287227233222e-05, + "loss": 1.0231, + "step": 2702 + }, + { + "epoch": 0.2415171890008265, + "grad_norm": 0.4376417398452759, + "learning_rate": 8.871956862278294e-05, + "loss": 1.0422, + "step": 2703 + }, + { + "epoch": 0.24160654053208838, + "grad_norm": 0.39731988310813904, + "learning_rate": 8.871041127906707e-05, + "loss": 1.0429, + "step": 2704 + }, + { + "epoch": 0.24169589206335024, + "grad_norm": 0.44788575172424316, + "learning_rate": 8.870125069294166e-05, + "loss": 1.0384, + "step": 2705 + }, + { + "epoch": 0.2417852435946121, + "grad_norm": 0.43335989117622375, + "learning_rate": 8.869208686517395e-05, + "loss": 1.0378, + "step": 2706 + }, + { + "epoch": 0.24187459512587398, + "grad_norm": 0.4897671639919281, + "learning_rate": 8.868291979653154e-05, + "loss": 0.9621, + "step": 2707 + }, + { + "epoch": 0.24196394665713583, + "grad_norm": 0.4613479971885681, + "learning_rate": 8.867374948778228e-05, + "loss": 0.9983, + "step": 2708 + }, + { + "epoch": 0.24205329818839771, + "grad_norm": 0.5041788220405579, + "learning_rate": 8.866457593969427e-05, + "loss": 1.0287, + "step": 2709 + }, + { + "epoch": 0.24214264971965957, + "grad_norm": 0.45073777437210083, + "learning_rate": 8.865539915303588e-05, + "loss": 0.9705, + "step": 2710 + }, + { + "epoch": 0.24223200125092142, + "grad_norm": 0.39561158418655396, + "learning_rate": 8.864621912857578e-05, + "loss": 1.0513, + "step": 2711 + }, + { + "epoch": 0.2423213527821833, + "grad_norm": 0.40363892912864685, + "learning_rate": 8.86370358670829e-05, + "loss": 0.9696, + "step": 2712 + }, + { + "epoch": 0.24241070431344516, + "grad_norm": 0.4564259648323059, + "learning_rate": 8.86278493693264e-05, + "loss": 1.0501, + "step": 2713 + }, + { + "epoch": 0.24250005584470705, + "grad_norm": 0.42725691199302673, + "learning_rate": 8.861865963607578e-05, + "loss": 0.9827, + "step": 2714 + }, + { + "epoch": 0.2425894073759689, + "grad_norm": 0.4821705222129822, + "learning_rate": 8.860946666810079e-05, + "loss": 0.952, + "step": 2715 + }, + { + "epoch": 0.24267875890723078, + "grad_norm": 0.42694029211997986, + "learning_rate": 8.86002704661714e-05, + "loss": 0.9642, + "step": 2716 + }, + { + "epoch": 0.24276811043849264, + "grad_norm": 0.45525282621383667, + "learning_rate": 8.85910710310579e-05, + "loss": 0.9894, + "step": 2717 + }, + { + "epoch": 0.2428574619697545, + "grad_norm": 0.4108658730983734, + "learning_rate": 8.858186836353087e-05, + "loss": 0.966, + "step": 2718 + }, + { + "epoch": 0.24294681350101638, + "grad_norm": 0.40456750988960266, + "learning_rate": 8.857266246436111e-05, + "loss": 1.0883, + "step": 2719 + }, + { + "epoch": 0.24303616503227823, + "grad_norm": 0.4102852940559387, + "learning_rate": 8.856345333431971e-05, + "loss": 1.0069, + "step": 2720 + }, + { + "epoch": 0.24312551656354012, + "grad_norm": 0.40076547861099243, + "learning_rate": 8.855424097417802e-05, + "loss": 1.0423, + "step": 2721 + }, + { + "epoch": 0.24321486809480197, + "grad_norm": 0.3830002248287201, + "learning_rate": 8.854502538470771e-05, + "loss": 1.003, + "step": 2722 + }, + { + "epoch": 0.24330421962606383, + "grad_norm": 0.4300539493560791, + "learning_rate": 8.853580656668065e-05, + "loss": 1.0192, + "step": 2723 + }, + { + "epoch": 0.2433935711573257, + "grad_norm": 0.3924980163574219, + "learning_rate": 8.852658452086904e-05, + "loss": 1.021, + "step": 2724 + }, + { + "epoch": 0.24348292268858757, + "grad_norm": 0.3826366364955902, + "learning_rate": 8.851735924804531e-05, + "loss": 1.0014, + "step": 2725 + }, + { + "epoch": 0.24357227421984945, + "grad_norm": 0.48502808809280396, + "learning_rate": 8.850813074898217e-05, + "loss": 1.0423, + "step": 2726 + }, + { + "epoch": 0.2436616257511113, + "grad_norm": 0.40080320835113525, + "learning_rate": 8.849889902445263e-05, + "loss": 1.0273, + "step": 2727 + }, + { + "epoch": 0.2437509772823732, + "grad_norm": 0.4349658489227295, + "learning_rate": 8.848966407522992e-05, + "loss": 1.0189, + "step": 2728 + }, + { + "epoch": 0.24384032881363504, + "grad_norm": 0.4866269826889038, + "learning_rate": 8.848042590208756e-05, + "loss": 0.9429, + "step": 2729 + }, + { + "epoch": 0.2439296803448969, + "grad_norm": 0.38999444246292114, + "learning_rate": 8.847118450579937e-05, + "loss": 0.964, + "step": 2730 + }, + { + "epoch": 0.24401903187615878, + "grad_norm": 0.39741405844688416, + "learning_rate": 8.84619398871394e-05, + "loss": 1.038, + "step": 2731 + }, + { + "epoch": 0.24410838340742064, + "grad_norm": 0.42767640948295593, + "learning_rate": 8.845269204688199e-05, + "loss": 1.041, + "step": 2732 + }, + { + "epoch": 0.24419773493868252, + "grad_norm": 0.42031916975975037, + "learning_rate": 8.844344098580176e-05, + "loss": 0.9304, + "step": 2733 + }, + { + "epoch": 0.24428708646994438, + "grad_norm": 0.34102049469947815, + "learning_rate": 8.843418670467353e-05, + "loss": 1.021, + "step": 2734 + }, + { + "epoch": 0.24437643800120626, + "grad_norm": 0.387407511472702, + "learning_rate": 8.842492920427252e-05, + "loss": 1.0539, + "step": 2735 + }, + { + "epoch": 0.24446578953246811, + "grad_norm": 0.373221218585968, + "learning_rate": 8.84156684853741e-05, + "loss": 1.0037, + "step": 2736 + }, + { + "epoch": 0.24455514106372997, + "grad_norm": 0.3756154477596283, + "learning_rate": 8.840640454875396e-05, + "loss": 1.0619, + "step": 2737 + }, + { + "epoch": 0.24464449259499185, + "grad_norm": 0.47706297039985657, + "learning_rate": 8.839713739518807e-05, + "loss": 0.8904, + "step": 2738 + }, + { + "epoch": 0.2447338441262537, + "grad_norm": 0.49002113938331604, + "learning_rate": 8.838786702545262e-05, + "loss": 1.0568, + "step": 2739 + }, + { + "epoch": 0.2448231956575156, + "grad_norm": 0.4314875304698944, + "learning_rate": 8.837859344032413e-05, + "loss": 0.9789, + "step": 2740 + }, + { + "epoch": 0.24491254718877745, + "grad_norm": 0.4264450967311859, + "learning_rate": 8.836931664057935e-05, + "loss": 0.9799, + "step": 2741 + }, + { + "epoch": 0.2450018987200393, + "grad_norm": 0.4409767985343933, + "learning_rate": 8.836003662699533e-05, + "loss": 0.9571, + "step": 2742 + }, + { + "epoch": 0.24509125025130118, + "grad_norm": 0.47590428590774536, + "learning_rate": 8.835075340034933e-05, + "loss": 0.9076, + "step": 2743 + }, + { + "epoch": 0.24518060178256304, + "grad_norm": 0.4312697947025299, + "learning_rate": 8.834146696141895e-05, + "loss": 0.9393, + "step": 2744 + }, + { + "epoch": 0.24526995331382492, + "grad_norm": 0.42560774087905884, + "learning_rate": 8.833217731098203e-05, + "loss": 1.0549, + "step": 2745 + }, + { + "epoch": 0.24535930484508678, + "grad_norm": 0.44607481360435486, + "learning_rate": 8.832288444981666e-05, + "loss": 0.9967, + "step": 2746 + }, + { + "epoch": 0.24544865637634866, + "grad_norm": 0.49672216176986694, + "learning_rate": 8.831358837870122e-05, + "loss": 1.0662, + "step": 2747 + }, + { + "epoch": 0.24553800790761052, + "grad_norm": 0.44877761602401733, + "learning_rate": 8.830428909841437e-05, + "loss": 1.0782, + "step": 2748 + }, + { + "epoch": 0.24562735943887237, + "grad_norm": 0.5102913975715637, + "learning_rate": 8.829498660973501e-05, + "loss": 0.9089, + "step": 2749 + }, + { + "epoch": 0.24571671097013426, + "grad_norm": 0.4264692962169647, + "learning_rate": 8.828568091344234e-05, + "loss": 1.003, + "step": 2750 + }, + { + "epoch": 0.2458060625013961, + "grad_norm": 0.45728904008865356, + "learning_rate": 8.827637201031577e-05, + "loss": 0.9578, + "step": 2751 + }, + { + "epoch": 0.245895414032658, + "grad_norm": 0.4567210078239441, + "learning_rate": 8.826705990113506e-05, + "loss": 1.0127, + "step": 2752 + }, + { + "epoch": 0.24598476556391985, + "grad_norm": 0.4181562662124634, + "learning_rate": 8.825774458668019e-05, + "loss": 1.0222, + "step": 2753 + }, + { + "epoch": 0.24607411709518173, + "grad_norm": 0.415363609790802, + "learning_rate": 8.824842606773142e-05, + "loss": 0.9997, + "step": 2754 + }, + { + "epoch": 0.2461634686264436, + "grad_norm": 0.43965113162994385, + "learning_rate": 8.823910434506925e-05, + "loss": 1.0537, + "step": 2755 + }, + { + "epoch": 0.24625282015770544, + "grad_norm": 0.36847561597824097, + "learning_rate": 8.82297794194745e-05, + "loss": 1.0743, + "step": 2756 + }, + { + "epoch": 0.24634217168896733, + "grad_norm": 0.5313237309455872, + "learning_rate": 8.822045129172822e-05, + "loss": 1.0059, + "step": 2757 + }, + { + "epoch": 0.24643152322022918, + "grad_norm": 0.41933882236480713, + "learning_rate": 8.821111996261176e-05, + "loss": 0.9882, + "step": 2758 + }, + { + "epoch": 0.24652087475149106, + "grad_norm": 0.43498140573501587, + "learning_rate": 8.820178543290668e-05, + "loss": 1.0393, + "step": 2759 + }, + { + "epoch": 0.24661022628275292, + "grad_norm": 0.5251235365867615, + "learning_rate": 8.819244770339488e-05, + "loss": 1.0354, + "step": 2760 + }, + { + "epoch": 0.24669957781401478, + "grad_norm": 0.4627592861652374, + "learning_rate": 8.818310677485848e-05, + "loss": 0.9657, + "step": 2761 + }, + { + "epoch": 0.24678892934527666, + "grad_norm": 0.4479570984840393, + "learning_rate": 8.817376264807989e-05, + "loss": 0.9823, + "step": 2762 + }, + { + "epoch": 0.24687828087653851, + "grad_norm": 0.40790751576423645, + "learning_rate": 8.816441532384177e-05, + "loss": 1.0679, + "step": 2763 + }, + { + "epoch": 0.2469676324078004, + "grad_norm": 0.5192294120788574, + "learning_rate": 8.815506480292706e-05, + "loss": 1.0415, + "step": 2764 + }, + { + "epoch": 0.24705698393906225, + "grad_norm": 0.5589134097099304, + "learning_rate": 8.814571108611896e-05, + "loss": 0.9845, + "step": 2765 + }, + { + "epoch": 0.24714633547032414, + "grad_norm": 0.38889801502227783, + "learning_rate": 8.813635417420096e-05, + "loss": 1.066, + "step": 2766 + }, + { + "epoch": 0.247235687001586, + "grad_norm": 0.41053298115730286, + "learning_rate": 8.812699406795682e-05, + "loss": 0.99, + "step": 2767 + }, + { + "epoch": 0.24732503853284785, + "grad_norm": 0.4181362986564636, + "learning_rate": 8.81176307681705e-05, + "loss": 0.9906, + "step": 2768 + }, + { + "epoch": 0.24741439006410973, + "grad_norm": 0.42989280819892883, + "learning_rate": 8.810826427562629e-05, + "loss": 0.9261, + "step": 2769 + }, + { + "epoch": 0.24750374159537158, + "grad_norm": 0.3895682096481323, + "learning_rate": 8.809889459110875e-05, + "loss": 0.9863, + "step": 2770 + }, + { + "epoch": 0.24759309312663347, + "grad_norm": 0.4124111235141754, + "learning_rate": 8.808952171540268e-05, + "loss": 0.9897, + "step": 2771 + }, + { + "epoch": 0.24768244465789532, + "grad_norm": 0.4256143271923065, + "learning_rate": 8.808014564929316e-05, + "loss": 0.9599, + "step": 2772 + }, + { + "epoch": 0.24777179618915718, + "grad_norm": 0.4850587248802185, + "learning_rate": 8.807076639356556e-05, + "loss": 0.9145, + "step": 2773 + }, + { + "epoch": 0.24786114772041906, + "grad_norm": 0.46698129177093506, + "learning_rate": 8.806138394900544e-05, + "loss": 1.0141, + "step": 2774 + }, + { + "epoch": 0.24795049925168092, + "grad_norm": 0.4754186272621155, + "learning_rate": 8.805199831639872e-05, + "loss": 1.0272, + "step": 2775 + }, + { + "epoch": 0.2480398507829428, + "grad_norm": 0.4320179224014282, + "learning_rate": 8.804260949653154e-05, + "loss": 0.9598, + "step": 2776 + }, + { + "epoch": 0.24812920231420466, + "grad_norm": 0.5230510830879211, + "learning_rate": 8.80332174901903e-05, + "loss": 0.9854, + "step": 2777 + }, + { + "epoch": 0.24821855384546654, + "grad_norm": 0.4366409182548523, + "learning_rate": 8.80238222981617e-05, + "loss": 1.0322, + "step": 2778 + }, + { + "epoch": 0.2483079053767284, + "grad_norm": 0.46669307351112366, + "learning_rate": 8.801442392123267e-05, + "loss": 1.0078, + "step": 2779 + }, + { + "epoch": 0.24839725690799025, + "grad_norm": 0.570253849029541, + "learning_rate": 8.800502236019044e-05, + "loss": 1.0106, + "step": 2780 + }, + { + "epoch": 0.24848660843925213, + "grad_norm": 0.4147552251815796, + "learning_rate": 8.799561761582247e-05, + "loss": 1.0206, + "step": 2781 + }, + { + "epoch": 0.248575959970514, + "grad_norm": 0.39246666431427, + "learning_rate": 8.798620968891653e-05, + "loss": 1.0486, + "step": 2782 + }, + { + "epoch": 0.24866531150177587, + "grad_norm": 0.4565700590610504, + "learning_rate": 8.797679858026062e-05, + "loss": 0.9924, + "step": 2783 + }, + { + "epoch": 0.24875466303303773, + "grad_norm": 0.38775768876075745, + "learning_rate": 8.796738429064303e-05, + "loss": 1.0482, + "step": 2784 + }, + { + "epoch": 0.2488440145642996, + "grad_norm": 0.4036676287651062, + "learning_rate": 8.795796682085231e-05, + "loss": 1.0342, + "step": 2785 + }, + { + "epoch": 0.24893336609556146, + "grad_norm": 0.5216470956802368, + "learning_rate": 8.794854617167725e-05, + "loss": 1.0855, + "step": 2786 + }, + { + "epoch": 0.24902271762682332, + "grad_norm": 0.4413949251174927, + "learning_rate": 8.793912234390695e-05, + "loss": 0.9762, + "step": 2787 + }, + { + "epoch": 0.2491120691580852, + "grad_norm": 0.4308202862739563, + "learning_rate": 8.792969533833076e-05, + "loss": 0.9775, + "step": 2788 + }, + { + "epoch": 0.24920142068934706, + "grad_norm": 0.5357450246810913, + "learning_rate": 8.792026515573828e-05, + "loss": 1.0041, + "step": 2789 + }, + { + "epoch": 0.24929077222060894, + "grad_norm": 0.35964709520339966, + "learning_rate": 8.791083179691939e-05, + "loss": 1.012, + "step": 2790 + }, + { + "epoch": 0.2493801237518708, + "grad_norm": 0.535650908946991, + "learning_rate": 8.790139526266423e-05, + "loss": 1.0273, + "step": 2791 + }, + { + "epoch": 0.24946947528313265, + "grad_norm": 0.4403007924556732, + "learning_rate": 8.789195555376323e-05, + "loss": 0.9549, + "step": 2792 + }, + { + "epoch": 0.24955882681439454, + "grad_norm": 0.3707777261734009, + "learning_rate": 8.788251267100704e-05, + "loss": 1.0049, + "step": 2793 + }, + { + "epoch": 0.2496481783456564, + "grad_norm": 0.4524572491645813, + "learning_rate": 8.787306661518662e-05, + "loss": 1.0156, + "step": 2794 + }, + { + "epoch": 0.24973752987691827, + "grad_norm": 0.4172338545322418, + "learning_rate": 8.786361738709319e-05, + "loss": 0.9847, + "step": 2795 + }, + { + "epoch": 0.24982688140818013, + "grad_norm": 0.40239232778549194, + "learning_rate": 8.78541649875182e-05, + "loss": 1.0954, + "step": 2796 + }, + { + "epoch": 0.249916232939442, + "grad_norm": 0.44025593996047974, + "learning_rate": 8.784470941725338e-05, + "loss": 1.0388, + "step": 2797 + }, + { + "epoch": 0.25000558447070387, + "grad_norm": 0.3682102859020233, + "learning_rate": 8.783525067709075e-05, + "loss": 1.0017, + "step": 2798 + }, + { + "epoch": 0.25009493600196575, + "grad_norm": 0.5301668643951416, + "learning_rate": 8.782578876782259e-05, + "loss": 0.8846, + "step": 2799 + }, + { + "epoch": 0.2501842875332276, + "grad_norm": 0.5186197757720947, + "learning_rate": 8.781632369024141e-05, + "loss": 0.9255, + "step": 2800 + }, + { + "epoch": 0.25027363906448946, + "grad_norm": 0.4258555769920349, + "learning_rate": 8.780685544514006e-05, + "loss": 0.9937, + "step": 2801 + }, + { + "epoch": 0.25036299059575134, + "grad_norm": 0.44377055764198303, + "learning_rate": 8.779738403331157e-05, + "loss": 1.0151, + "step": 2802 + }, + { + "epoch": 0.2504523421270132, + "grad_norm": 0.3976726830005646, + "learning_rate": 8.778790945554926e-05, + "loss": 1.0148, + "step": 2803 + }, + { + "epoch": 0.25054169365827506, + "grad_norm": 0.34984302520751953, + "learning_rate": 8.777843171264675e-05, + "loss": 1.0027, + "step": 2804 + }, + { + "epoch": 0.25063104518953694, + "grad_norm": 0.36528366804122925, + "learning_rate": 8.776895080539789e-05, + "loss": 1.0559, + "step": 2805 + }, + { + "epoch": 0.2507203967207988, + "grad_norm": 0.48383477330207825, + "learning_rate": 8.775946673459681e-05, + "loss": 1.0472, + "step": 2806 + }, + { + "epoch": 0.25080974825206065, + "grad_norm": 0.3872027099132538, + "learning_rate": 8.774997950103791e-05, + "loss": 1.0441, + "step": 2807 + }, + { + "epoch": 0.25089909978332253, + "grad_norm": 0.45994165539741516, + "learning_rate": 8.774048910551584e-05, + "loss": 0.9451, + "step": 2808 + }, + { + "epoch": 0.2509884513145844, + "grad_norm": 0.39448827505111694, + "learning_rate": 8.773099554882552e-05, + "loss": 1.0241, + "step": 2809 + }, + { + "epoch": 0.25107780284584624, + "grad_norm": 0.39023977518081665, + "learning_rate": 8.772149883176215e-05, + "loss": 0.9925, + "step": 2810 + }, + { + "epoch": 0.2511671543771081, + "grad_norm": 0.49682340025901794, + "learning_rate": 8.771199895512115e-05, + "loss": 0.9678, + "step": 2811 + }, + { + "epoch": 0.25125650590837, + "grad_norm": 0.5279167890548706, + "learning_rate": 8.770249591969829e-05, + "loss": 0.9971, + "step": 2812 + }, + { + "epoch": 0.2513458574396319, + "grad_norm": 0.46056196093559265, + "learning_rate": 8.769298972628948e-05, + "loss": 1.0181, + "step": 2813 + }, + { + "epoch": 0.2514352089708937, + "grad_norm": 0.40414974093437195, + "learning_rate": 8.768348037569102e-05, + "loss": 0.9936, + "step": 2814 + }, + { + "epoch": 0.2515245605021556, + "grad_norm": 0.45471900701522827, + "learning_rate": 8.76739678686994e-05, + "loss": 1.0091, + "step": 2815 + }, + { + "epoch": 0.2516139120334175, + "grad_norm": 0.4152718186378479, + "learning_rate": 8.766445220611139e-05, + "loss": 1.0165, + "step": 2816 + }, + { + "epoch": 0.2517032635646793, + "grad_norm": 0.3915504515171051, + "learning_rate": 8.765493338872403e-05, + "loss": 1.028, + "step": 2817 + }, + { + "epoch": 0.2517926150959412, + "grad_norm": 0.4392106533050537, + "learning_rate": 8.764541141733464e-05, + "loss": 0.9371, + "step": 2818 + }, + { + "epoch": 0.2518819666272031, + "grad_norm": 0.39547622203826904, + "learning_rate": 8.763588629274077e-05, + "loss": 1.018, + "step": 2819 + }, + { + "epoch": 0.25197131815846496, + "grad_norm": 0.35008326172828674, + "learning_rate": 8.762635801574025e-05, + "loss": 0.9658, + "step": 2820 + }, + { + "epoch": 0.2520606696897268, + "grad_norm": 0.4563378393650055, + "learning_rate": 8.761682658713119e-05, + "loss": 0.966, + "step": 2821 + }, + { + "epoch": 0.2521500212209887, + "grad_norm": 0.3921094536781311, + "learning_rate": 8.760729200771192e-05, + "loss": 1.0282, + "step": 2822 + }, + { + "epoch": 0.25223937275225056, + "grad_norm": 0.4152204990386963, + "learning_rate": 8.759775427828108e-05, + "loss": 1.0107, + "step": 2823 + }, + { + "epoch": 0.2523287242835124, + "grad_norm": 0.47061243653297424, + "learning_rate": 8.758821339963756e-05, + "loss": 1.0095, + "step": 2824 + }, + { + "epoch": 0.25241807581477427, + "grad_norm": 0.49046018719673157, + "learning_rate": 8.75786693725805e-05, + "loss": 1.0251, + "step": 2825 + }, + { + "epoch": 0.25250742734603615, + "grad_norm": 0.43877169489860535, + "learning_rate": 8.756912219790933e-05, + "loss": 0.9794, + "step": 2826 + }, + { + "epoch": 0.25259677887729803, + "grad_norm": 0.4174705445766449, + "learning_rate": 8.755957187642372e-05, + "loss": 0.9845, + "step": 2827 + }, + { + "epoch": 0.25268613040855986, + "grad_norm": 0.3626020550727844, + "learning_rate": 8.755001840892361e-05, + "loss": 1.0018, + "step": 2828 + }, + { + "epoch": 0.25277548193982174, + "grad_norm": 0.48767632246017456, + "learning_rate": 8.754046179620919e-05, + "loss": 1.0055, + "step": 2829 + }, + { + "epoch": 0.2528648334710836, + "grad_norm": 0.4239792823791504, + "learning_rate": 8.753090203908095e-05, + "loss": 0.9721, + "step": 2830 + }, + { + "epoch": 0.25295418500234546, + "grad_norm": 0.4804183542728424, + "learning_rate": 8.752133913833962e-05, + "loss": 0.9585, + "step": 2831 + }, + { + "epoch": 0.25304353653360734, + "grad_norm": 0.45902127027511597, + "learning_rate": 8.751177309478618e-05, + "loss": 1.0059, + "step": 2832 + }, + { + "epoch": 0.2531328880648692, + "grad_norm": 0.473166823387146, + "learning_rate": 8.750220390922188e-05, + "loss": 0.9744, + "step": 2833 + }, + { + "epoch": 0.2532222395961311, + "grad_norm": 0.475212037563324, + "learning_rate": 8.74926315824483e-05, + "loss": 1.0587, + "step": 2834 + }, + { + "epoch": 0.25331159112739293, + "grad_norm": 0.4180943965911865, + "learning_rate": 8.748305611526715e-05, + "loss": 0.9937, + "step": 2835 + }, + { + "epoch": 0.2534009426586548, + "grad_norm": 0.4551735520362854, + "learning_rate": 8.747347750848052e-05, + "loss": 0.968, + "step": 2836 + }, + { + "epoch": 0.2534902941899167, + "grad_norm": 0.3868304491043091, + "learning_rate": 8.74638957628907e-05, + "loss": 0.9912, + "step": 2837 + }, + { + "epoch": 0.2535796457211785, + "grad_norm": 0.43097737431526184, + "learning_rate": 8.745431087930028e-05, + "loss": 1.0214, + "step": 2838 + }, + { + "epoch": 0.2536689972524404, + "grad_norm": 0.37458547949790955, + "learning_rate": 8.74447228585121e-05, + "loss": 1.017, + "step": 2839 + }, + { + "epoch": 0.2537583487837023, + "grad_norm": 0.5047030448913574, + "learning_rate": 8.743513170132924e-05, + "loss": 0.9561, + "step": 2840 + }, + { + "epoch": 0.2538477003149642, + "grad_norm": 0.3879890441894531, + "learning_rate": 8.742553740855506e-05, + "loss": 1.0405, + "step": 2841 + }, + { + "epoch": 0.253937051846226, + "grad_norm": 0.44273656606674194, + "learning_rate": 8.74159399809932e-05, + "loss": 1.0599, + "step": 2842 + }, + { + "epoch": 0.2540264033774879, + "grad_norm": 0.4389507472515106, + "learning_rate": 8.740633941944754e-05, + "loss": 1.0179, + "step": 2843 + }, + { + "epoch": 0.25411575490874977, + "grad_norm": 0.44529905915260315, + "learning_rate": 8.739673572472225e-05, + "loss": 1.0689, + "step": 2844 + }, + { + "epoch": 0.2542051064400116, + "grad_norm": 0.4119149148464203, + "learning_rate": 8.738712889762171e-05, + "loss": 0.9636, + "step": 2845 + }, + { + "epoch": 0.2542944579712735, + "grad_norm": 0.4760536551475525, + "learning_rate": 8.73775189389506e-05, + "loss": 0.963, + "step": 2846 + }, + { + "epoch": 0.25438380950253536, + "grad_norm": 0.4643342196941376, + "learning_rate": 8.736790584951387e-05, + "loss": 0.9906, + "step": 2847 + }, + { + "epoch": 0.2544731610337972, + "grad_norm": 0.402329683303833, + "learning_rate": 8.735828963011671e-05, + "loss": 1.0063, + "step": 2848 + }, + { + "epoch": 0.2545625125650591, + "grad_norm": 0.41884222626686096, + "learning_rate": 8.734867028156458e-05, + "loss": 1.0401, + "step": 2849 + }, + { + "epoch": 0.25465186409632096, + "grad_norm": 0.3897888660430908, + "learning_rate": 8.733904780466321e-05, + "loss": 1.057, + "step": 2850 + }, + { + "epoch": 0.25474121562758284, + "grad_norm": 0.4296242296695709, + "learning_rate": 8.732942220021858e-05, + "loss": 0.9684, + "step": 2851 + }, + { + "epoch": 0.25483056715884467, + "grad_norm": 0.47800928354263306, + "learning_rate": 8.731979346903693e-05, + "loss": 1.0239, + "step": 2852 + }, + { + "epoch": 0.25491991869010655, + "grad_norm": 0.4393937587738037, + "learning_rate": 8.731016161192479e-05, + "loss": 1.0716, + "step": 2853 + }, + { + "epoch": 0.25500927022136843, + "grad_norm": 0.46051809191703796, + "learning_rate": 8.730052662968891e-05, + "loss": 1.0035, + "step": 2854 + }, + { + "epoch": 0.25509862175263026, + "grad_norm": 0.3697504997253418, + "learning_rate": 8.729088852313633e-05, + "loss": 0.9667, + "step": 2855 + }, + { + "epoch": 0.25518797328389214, + "grad_norm": 0.4968481659889221, + "learning_rate": 8.728124729307434e-05, + "loss": 0.9228, + "step": 2856 + }, + { + "epoch": 0.255277324815154, + "grad_norm": 0.46425577998161316, + "learning_rate": 8.727160294031051e-05, + "loss": 1.0125, + "step": 2857 + }, + { + "epoch": 0.2553666763464159, + "grad_norm": 0.40039297938346863, + "learning_rate": 8.726195546565263e-05, + "loss": 1.0081, + "step": 2858 + }, + { + "epoch": 0.25545602787767774, + "grad_norm": 0.48892778158187866, + "learning_rate": 8.725230486990882e-05, + "loss": 0.9115, + "step": 2859 + }, + { + "epoch": 0.2555453794089396, + "grad_norm": 0.3827580511569977, + "learning_rate": 8.724265115388739e-05, + "loss": 1.0488, + "step": 2860 + }, + { + "epoch": 0.2556347309402015, + "grad_norm": 0.45204082131385803, + "learning_rate": 8.723299431839693e-05, + "loss": 1.0264, + "step": 2861 + }, + { + "epoch": 0.25572408247146333, + "grad_norm": 0.46111056208610535, + "learning_rate": 8.722333436424633e-05, + "loss": 0.9979, + "step": 2862 + }, + { + "epoch": 0.2558134340027252, + "grad_norm": 0.40219980478286743, + "learning_rate": 8.721367129224471e-05, + "loss": 1.0384, + "step": 2863 + }, + { + "epoch": 0.2559027855339871, + "grad_norm": 0.37858057022094727, + "learning_rate": 8.720400510320146e-05, + "loss": 1.0113, + "step": 2864 + }, + { + "epoch": 0.255992137065249, + "grad_norm": 0.4286115765571594, + "learning_rate": 8.71943357979262e-05, + "loss": 0.9691, + "step": 2865 + }, + { + "epoch": 0.2560814885965108, + "grad_norm": 0.4207289218902588, + "learning_rate": 8.718466337722885e-05, + "loss": 0.9316, + "step": 2866 + }, + { + "epoch": 0.2561708401277727, + "grad_norm": 0.4365525245666504, + "learning_rate": 8.717498784191958e-05, + "loss": 0.99, + "step": 2867 + }, + { + "epoch": 0.2562601916590346, + "grad_norm": 0.4580113887786865, + "learning_rate": 8.716530919280883e-05, + "loss": 0.9109, + "step": 2868 + }, + { + "epoch": 0.2563495431902964, + "grad_norm": 0.4263405203819275, + "learning_rate": 8.715562743070729e-05, + "loss": 0.9391, + "step": 2869 + }, + { + "epoch": 0.2564388947215583, + "grad_norm": 0.4391098916530609, + "learning_rate": 8.71459425564259e-05, + "loss": 1.0104, + "step": 2870 + }, + { + "epoch": 0.25652824625282017, + "grad_norm": 0.45276468992233276, + "learning_rate": 8.713625457077585e-05, + "loss": 1.1036, + "step": 2871 + }, + { + "epoch": 0.25661759778408205, + "grad_norm": 0.45625758171081543, + "learning_rate": 8.712656347456867e-05, + "loss": 0.9503, + "step": 2872 + }, + { + "epoch": 0.2567069493153439, + "grad_norm": 0.40368926525115967, + "learning_rate": 8.711686926861604e-05, + "loss": 1.0154, + "step": 2873 + }, + { + "epoch": 0.25679630084660576, + "grad_norm": 0.43693703413009644, + "learning_rate": 8.710717195372997e-05, + "loss": 1.0454, + "step": 2874 + }, + { + "epoch": 0.25688565237786765, + "grad_norm": 0.38676273822784424, + "learning_rate": 8.709747153072272e-05, + "loss": 1.0724, + "step": 2875 + }, + { + "epoch": 0.2569750039091295, + "grad_norm": 0.441709041595459, + "learning_rate": 8.708776800040679e-05, + "loss": 0.9994, + "step": 2876 + }, + { + "epoch": 0.25706435544039136, + "grad_norm": 0.40171146392822266, + "learning_rate": 8.707806136359497e-05, + "loss": 1.0599, + "step": 2877 + }, + { + "epoch": 0.25715370697165324, + "grad_norm": 0.40110501646995544, + "learning_rate": 8.706835162110028e-05, + "loss": 1.015, + "step": 2878 + }, + { + "epoch": 0.25724305850291507, + "grad_norm": 0.4818125069141388, + "learning_rate": 8.705863877373603e-05, + "loss": 1.0023, + "step": 2879 + }, + { + "epoch": 0.25733241003417695, + "grad_norm": 0.3713032901287079, + "learning_rate": 8.704892282231575e-05, + "loss": 1.0398, + "step": 2880 + }, + { + "epoch": 0.25742176156543883, + "grad_norm": 0.509074330329895, + "learning_rate": 8.70392037676533e-05, + "loss": 0.9623, + "step": 2881 + }, + { + "epoch": 0.2575111130967007, + "grad_norm": 0.4063872694969177, + "learning_rate": 8.70294816105627e-05, + "loss": 0.9864, + "step": 2882 + }, + { + "epoch": 0.25760046462796254, + "grad_norm": 0.4200826585292816, + "learning_rate": 8.701975635185833e-05, + "loss": 0.9905, + "step": 2883 + }, + { + "epoch": 0.2576898161592244, + "grad_norm": 0.44294077157974243, + "learning_rate": 8.701002799235475e-05, + "loss": 0.9429, + "step": 2884 + }, + { + "epoch": 0.2577791676904863, + "grad_norm": 0.3688490092754364, + "learning_rate": 8.700029653286684e-05, + "loss": 1.0168, + "step": 2885 + }, + { + "epoch": 0.25786851922174814, + "grad_norm": 0.381213903427124, + "learning_rate": 8.699056197420967e-05, + "loss": 1.0477, + "step": 2886 + }, + { + "epoch": 0.25795787075301, + "grad_norm": 0.4321887195110321, + "learning_rate": 8.698082431719867e-05, + "loss": 0.9478, + "step": 2887 + }, + { + "epoch": 0.2580472222842719, + "grad_norm": 0.39009973406791687, + "learning_rate": 8.697108356264944e-05, + "loss": 0.9456, + "step": 2888 + }, + { + "epoch": 0.2581365738155338, + "grad_norm": 0.39922964572906494, + "learning_rate": 8.696133971137788e-05, + "loss": 1.0529, + "step": 2889 + }, + { + "epoch": 0.2582259253467956, + "grad_norm": 0.45770078897476196, + "learning_rate": 8.695159276420013e-05, + "loss": 0.9618, + "step": 2890 + }, + { + "epoch": 0.2583152768780575, + "grad_norm": 0.4154913127422333, + "learning_rate": 8.694184272193262e-05, + "loss": 1.016, + "step": 2891 + }, + { + "epoch": 0.2584046284093194, + "grad_norm": 0.3861916661262512, + "learning_rate": 8.6932089585392e-05, + "loss": 1.0038, + "step": 2892 + }, + { + "epoch": 0.2584939799405812, + "grad_norm": 0.39484497904777527, + "learning_rate": 8.692233335539521e-05, + "loss": 0.9882, + "step": 2893 + }, + { + "epoch": 0.2585833314718431, + "grad_norm": 0.36382368206977844, + "learning_rate": 8.691257403275945e-05, + "loss": 1.087, + "step": 2894 + }, + { + "epoch": 0.258672683003105, + "grad_norm": 0.42892515659332275, + "learning_rate": 8.690281161830216e-05, + "loss": 1.1341, + "step": 2895 + }, + { + "epoch": 0.25876203453436686, + "grad_norm": 0.42802634835243225, + "learning_rate": 8.689304611284103e-05, + "loss": 0.993, + "step": 2896 + }, + { + "epoch": 0.2588513860656287, + "grad_norm": 0.6133880019187927, + "learning_rate": 8.688327751719403e-05, + "loss": 0.9809, + "step": 2897 + }, + { + "epoch": 0.25894073759689057, + "grad_norm": 0.4578530788421631, + "learning_rate": 8.68735058321794e-05, + "loss": 0.9413, + "step": 2898 + }, + { + "epoch": 0.25903008912815245, + "grad_norm": 0.4118945896625519, + "learning_rate": 8.68637310586156e-05, + "loss": 0.9512, + "step": 2899 + }, + { + "epoch": 0.2591194406594143, + "grad_norm": 0.4931046962738037, + "learning_rate": 8.685395319732141e-05, + "loss": 1.0285, + "step": 2900 + }, + { + "epoch": 0.25920879219067616, + "grad_norm": 0.4491409361362457, + "learning_rate": 8.684417224911578e-05, + "loss": 1.0044, + "step": 2901 + }, + { + "epoch": 0.25929814372193805, + "grad_norm": 0.37248972058296204, + "learning_rate": 8.683438821481802e-05, + "loss": 1.0201, + "step": 2902 + }, + { + "epoch": 0.25938749525319993, + "grad_norm": 0.46489718556404114, + "learning_rate": 8.68246010952476e-05, + "loss": 0.9569, + "step": 2903 + }, + { + "epoch": 0.25947684678446176, + "grad_norm": 0.4184110164642334, + "learning_rate": 8.681481089122432e-05, + "loss": 0.9632, + "step": 2904 + }, + { + "epoch": 0.25956619831572364, + "grad_norm": 0.38718414306640625, + "learning_rate": 8.68050176035682e-05, + "loss": 1.0251, + "step": 2905 + }, + { + "epoch": 0.2596555498469855, + "grad_norm": 0.3973325490951538, + "learning_rate": 8.679522123309956e-05, + "loss": 1.0239, + "step": 2906 + }, + { + "epoch": 0.25974490137824735, + "grad_norm": 0.37302878499031067, + "learning_rate": 8.678542178063893e-05, + "loss": 1.0898, + "step": 2907 + }, + { + "epoch": 0.25983425290950923, + "grad_norm": 0.46653488278388977, + "learning_rate": 8.677561924700713e-05, + "loss": 1.0144, + "step": 2908 + }, + { + "epoch": 0.2599236044407711, + "grad_norm": 0.43336376547813416, + "learning_rate": 8.676581363302518e-05, + "loss": 0.9507, + "step": 2909 + }, + { + "epoch": 0.26001295597203294, + "grad_norm": 0.5737956762313843, + "learning_rate": 8.675600493951448e-05, + "loss": 0.9631, + "step": 2910 + }, + { + "epoch": 0.2601023075032948, + "grad_norm": 0.460018128156662, + "learning_rate": 8.674619316729657e-05, + "loss": 0.9619, + "step": 2911 + }, + { + "epoch": 0.2601916590345567, + "grad_norm": 0.420095831155777, + "learning_rate": 8.673637831719328e-05, + "loss": 0.9587, + "step": 2912 + }, + { + "epoch": 0.2602810105658186, + "grad_norm": 0.46309584379196167, + "learning_rate": 8.672656039002674e-05, + "loss": 0.9664, + "step": 2913 + }, + { + "epoch": 0.2603703620970804, + "grad_norm": 0.5068766474723816, + "learning_rate": 8.671673938661929e-05, + "loss": 1.0086, + "step": 2914 + }, + { + "epoch": 0.2604597136283423, + "grad_norm": 0.46311527490615845, + "learning_rate": 8.670691530779354e-05, + "loss": 0.9858, + "step": 2915 + }, + { + "epoch": 0.2605490651596042, + "grad_norm": 0.4441559612751007, + "learning_rate": 8.669708815437237e-05, + "loss": 0.9832, + "step": 2916 + }, + { + "epoch": 0.260638416690866, + "grad_norm": 0.4474044442176819, + "learning_rate": 8.668725792717889e-05, + "loss": 1.0313, + "step": 2917 + }, + { + "epoch": 0.2607277682221279, + "grad_norm": 0.5184552669525146, + "learning_rate": 8.667742462703649e-05, + "loss": 1.0102, + "step": 2918 + }, + { + "epoch": 0.2608171197533898, + "grad_norm": 0.4744894206523895, + "learning_rate": 8.666758825476886e-05, + "loss": 0.9695, + "step": 2919 + }, + { + "epoch": 0.26090647128465166, + "grad_norm": 0.40858200192451477, + "learning_rate": 8.665774881119985e-05, + "loss": 0.9797, + "step": 2920 + }, + { + "epoch": 0.2609958228159135, + "grad_norm": 0.394593745470047, + "learning_rate": 8.664790629715363e-05, + "loss": 1.0465, + "step": 2921 + }, + { + "epoch": 0.2610851743471754, + "grad_norm": 0.4352075457572937, + "learning_rate": 8.663806071345462e-05, + "loss": 0.9832, + "step": 2922 + }, + { + "epoch": 0.26117452587843726, + "grad_norm": 0.4699738323688507, + "learning_rate": 8.662821206092748e-05, + "loss": 1.064, + "step": 2923 + }, + { + "epoch": 0.2612638774096991, + "grad_norm": 0.4367484450340271, + "learning_rate": 8.661836034039717e-05, + "loss": 0.9967, + "step": 2924 + }, + { + "epoch": 0.26135322894096097, + "grad_norm": 0.3603065609931946, + "learning_rate": 8.660850555268886e-05, + "loss": 1.0077, + "step": 2925 + }, + { + "epoch": 0.26144258047222285, + "grad_norm": 0.3636876344680786, + "learning_rate": 8.659864769862798e-05, + "loss": 1.0128, + "step": 2926 + }, + { + "epoch": 0.26153193200348474, + "grad_norm": 0.37722858786582947, + "learning_rate": 8.658878677904024e-05, + "loss": 0.9895, + "step": 2927 + }, + { + "epoch": 0.26162128353474656, + "grad_norm": 0.4058395028114319, + "learning_rate": 8.65789227947516e-05, + "loss": 0.965, + "step": 2928 + }, + { + "epoch": 0.26171063506600845, + "grad_norm": 0.3944026529788971, + "learning_rate": 8.656905574658829e-05, + "loss": 1.0081, + "step": 2929 + }, + { + "epoch": 0.26179998659727033, + "grad_norm": 0.5204667448997498, + "learning_rate": 8.655918563537675e-05, + "loss": 1.0273, + "step": 2930 + }, + { + "epoch": 0.26188933812853216, + "grad_norm": 0.466569721698761, + "learning_rate": 8.654931246194372e-05, + "loss": 1.0126, + "step": 2931 + }, + { + "epoch": 0.26197868965979404, + "grad_norm": 0.49643585085868835, + "learning_rate": 8.653943622711618e-05, + "loss": 0.9925, + "step": 2932 + }, + { + "epoch": 0.2620680411910559, + "grad_norm": 0.42305490374565125, + "learning_rate": 8.652955693172137e-05, + "loss": 0.9461, + "step": 2933 + }, + { + "epoch": 0.2621573927223178, + "grad_norm": 0.5041871666908264, + "learning_rate": 8.65196745765868e-05, + "loss": 0.9714, + "step": 2934 + }, + { + "epoch": 0.26224674425357963, + "grad_norm": 0.4066472351551056, + "learning_rate": 8.65097891625402e-05, + "loss": 1.0175, + "step": 2935 + }, + { + "epoch": 0.2623360957848415, + "grad_norm": 0.41952818632125854, + "learning_rate": 8.649990069040961e-05, + "loss": 0.998, + "step": 2936 + }, + { + "epoch": 0.2624254473161034, + "grad_norm": 0.44271522760391235, + "learning_rate": 8.649000916102325e-05, + "loss": 0.9336, + "step": 2937 + }, + { + "epoch": 0.2625147988473652, + "grad_norm": 0.47163864970207214, + "learning_rate": 8.64801145752097e-05, + "loss": 1.0735, + "step": 2938 + }, + { + "epoch": 0.2626041503786271, + "grad_norm": 0.40220436453819275, + "learning_rate": 8.647021693379768e-05, + "loss": 0.99, + "step": 2939 + }, + { + "epoch": 0.262693501909889, + "grad_norm": 0.37812304496765137, + "learning_rate": 8.646031623761626e-05, + "loss": 1.0802, + "step": 2940 + }, + { + "epoch": 0.2627828534411508, + "grad_norm": 0.4696109890937805, + "learning_rate": 8.645041248749471e-05, + "loss": 0.9638, + "step": 2941 + }, + { + "epoch": 0.2628722049724127, + "grad_norm": 0.4705543518066406, + "learning_rate": 8.644050568426259e-05, + "loss": 0.9096, + "step": 2942 + }, + { + "epoch": 0.2629615565036746, + "grad_norm": 0.4492562413215637, + "learning_rate": 8.643059582874969e-05, + "loss": 0.9718, + "step": 2943 + }, + { + "epoch": 0.26305090803493647, + "grad_norm": 0.44293731451034546, + "learning_rate": 8.642068292178605e-05, + "loss": 0.9978, + "step": 2944 + }, + { + "epoch": 0.2631402595661983, + "grad_norm": 0.4119532108306885, + "learning_rate": 8.641076696420201e-05, + "loss": 1.0805, + "step": 2945 + }, + { + "epoch": 0.2632296110974602, + "grad_norm": 0.5227283239364624, + "learning_rate": 8.640084795682813e-05, + "loss": 0.9369, + "step": 2946 + }, + { + "epoch": 0.26331896262872206, + "grad_norm": 0.3896709382534027, + "learning_rate": 8.639092590049521e-05, + "loss": 0.9635, + "step": 2947 + }, + { + "epoch": 0.2634083141599839, + "grad_norm": 0.3982143998146057, + "learning_rate": 8.638100079603437e-05, + "loss": 1.0144, + "step": 2948 + }, + { + "epoch": 0.2634976656912458, + "grad_norm": 0.40874722599983215, + "learning_rate": 8.63710726442769e-05, + "loss": 1.0396, + "step": 2949 + }, + { + "epoch": 0.26358701722250766, + "grad_norm": 0.4392448365688324, + "learning_rate": 8.636114144605442e-05, + "loss": 1.0296, + "step": 2950 + }, + { + "epoch": 0.26367636875376954, + "grad_norm": 0.44011494517326355, + "learning_rate": 8.635120720219876e-05, + "loss": 1.0233, + "step": 2951 + }, + { + "epoch": 0.26376572028503137, + "grad_norm": 0.4407230615615845, + "learning_rate": 8.634126991354202e-05, + "loss": 1.0963, + "step": 2952 + }, + { + "epoch": 0.26385507181629325, + "grad_norm": 0.562527596950531, + "learning_rate": 8.633132958091655e-05, + "loss": 0.9788, + "step": 2953 + }, + { + "epoch": 0.26394442334755513, + "grad_norm": 0.44428539276123047, + "learning_rate": 8.632138620515498e-05, + "loss": 1.0182, + "step": 2954 + }, + { + "epoch": 0.26403377487881696, + "grad_norm": 0.4478207230567932, + "learning_rate": 8.631143978709013e-05, + "loss": 0.981, + "step": 2955 + }, + { + "epoch": 0.26412312641007885, + "grad_norm": 0.48832935094833374, + "learning_rate": 8.630149032755517e-05, + "loss": 1.0158, + "step": 2956 + }, + { + "epoch": 0.26421247794134073, + "grad_norm": 0.47887471318244934, + "learning_rate": 8.629153782738344e-05, + "loss": 0.9785, + "step": 2957 + }, + { + "epoch": 0.2643018294726026, + "grad_norm": 0.481168657541275, + "learning_rate": 8.628158228740857e-05, + "loss": 0.949, + "step": 2958 + }, + { + "epoch": 0.26439118100386444, + "grad_norm": 0.42248255014419556, + "learning_rate": 8.627162370846446e-05, + "loss": 1.0021, + "step": 2959 + }, + { + "epoch": 0.2644805325351263, + "grad_norm": 0.3783293664455414, + "learning_rate": 8.626166209138524e-05, + "loss": 1.0025, + "step": 2960 + }, + { + "epoch": 0.2645698840663882, + "grad_norm": 0.40361276268959045, + "learning_rate": 8.62516974370053e-05, + "loss": 0.9737, + "step": 2961 + }, + { + "epoch": 0.26465923559765003, + "grad_norm": 0.4194972813129425, + "learning_rate": 8.624172974615926e-05, + "loss": 1.0377, + "step": 2962 + }, + { + "epoch": 0.2647485871289119, + "grad_norm": 0.4278354346752167, + "learning_rate": 8.623175901968206e-05, + "loss": 1.0373, + "step": 2963 + }, + { + "epoch": 0.2648379386601738, + "grad_norm": 0.430337518453598, + "learning_rate": 8.622178525840885e-05, + "loss": 0.9437, + "step": 2964 + }, + { + "epoch": 0.2649272901914357, + "grad_norm": 0.39128750562667847, + "learning_rate": 8.6211808463175e-05, + "loss": 1.03, + "step": 2965 + }, + { + "epoch": 0.2650166417226975, + "grad_norm": 0.4273848235607147, + "learning_rate": 8.620182863481622e-05, + "loss": 0.967, + "step": 2966 + }, + { + "epoch": 0.2651059932539594, + "grad_norm": 0.43095484375953674, + "learning_rate": 8.619184577416842e-05, + "loss": 1.0008, + "step": 2967 + }, + { + "epoch": 0.2651953447852213, + "grad_norm": 0.4028542935848236, + "learning_rate": 8.618185988206775e-05, + "loss": 1.0687, + "step": 2968 + }, + { + "epoch": 0.2652846963164831, + "grad_norm": 0.45364266633987427, + "learning_rate": 8.617187095935065e-05, + "loss": 0.9845, + "step": 2969 + }, + { + "epoch": 0.265374047847745, + "grad_norm": 0.4723089039325714, + "learning_rate": 8.616187900685377e-05, + "loss": 1.0094, + "step": 2970 + }, + { + "epoch": 0.26546339937900687, + "grad_norm": 0.45047450065612793, + "learning_rate": 8.615188402541408e-05, + "loss": 1.025, + "step": 2971 + }, + { + "epoch": 0.2655527509102687, + "grad_norm": 0.44591468572616577, + "learning_rate": 8.614188601586875e-05, + "loss": 0.9725, + "step": 2972 + }, + { + "epoch": 0.2656421024415306, + "grad_norm": 0.5437367558479309, + "learning_rate": 8.613188497905523e-05, + "loss": 0.9072, + "step": 2973 + }, + { + "epoch": 0.26573145397279246, + "grad_norm": 0.36033257842063904, + "learning_rate": 8.612188091581119e-05, + "loss": 0.9875, + "step": 2974 + }, + { + "epoch": 0.26582080550405435, + "grad_norm": 0.4065442979335785, + "learning_rate": 8.611187382697458e-05, + "loss": 1.0593, + "step": 2975 + }, + { + "epoch": 0.2659101570353162, + "grad_norm": 0.42632168531417847, + "learning_rate": 8.610186371338365e-05, + "loss": 1.0284, + "step": 2976 + }, + { + "epoch": 0.26599950856657806, + "grad_norm": 0.45668208599090576, + "learning_rate": 8.609185057587678e-05, + "loss": 1.0186, + "step": 2977 + }, + { + "epoch": 0.26608886009783994, + "grad_norm": 0.4407256245613098, + "learning_rate": 8.608183441529274e-05, + "loss": 0.9508, + "step": 2978 + }, + { + "epoch": 0.26617821162910177, + "grad_norm": 0.4113801419734955, + "learning_rate": 8.607181523247045e-05, + "loss": 1.0019, + "step": 2979 + }, + { + "epoch": 0.26626756316036365, + "grad_norm": 0.5497756004333496, + "learning_rate": 8.606179302824914e-05, + "loss": 1.0021, + "step": 2980 + }, + { + "epoch": 0.26635691469162553, + "grad_norm": 0.3906475901603699, + "learning_rate": 8.605176780346826e-05, + "loss": 1.003, + "step": 2981 + }, + { + "epoch": 0.2664462662228874, + "grad_norm": 0.4320390820503235, + "learning_rate": 8.604173955896756e-05, + "loss": 0.9869, + "step": 2982 + }, + { + "epoch": 0.26653561775414925, + "grad_norm": 0.47781896591186523, + "learning_rate": 8.6031708295587e-05, + "loss": 0.9723, + "step": 2983 + }, + { + "epoch": 0.26662496928541113, + "grad_norm": 0.4359282851219177, + "learning_rate": 8.602167401416678e-05, + "loss": 1.0909, + "step": 2984 + }, + { + "epoch": 0.266714320816673, + "grad_norm": 0.3747998774051666, + "learning_rate": 8.601163671554739e-05, + "loss": 0.9894, + "step": 2985 + }, + { + "epoch": 0.26680367234793484, + "grad_norm": 0.4070686101913452, + "learning_rate": 8.60015964005696e-05, + "loss": 0.9766, + "step": 2986 + }, + { + "epoch": 0.2668930238791967, + "grad_norm": 0.43301576375961304, + "learning_rate": 8.599155307007434e-05, + "loss": 0.9774, + "step": 2987 + }, + { + "epoch": 0.2669823754104586, + "grad_norm": 0.436482697725296, + "learning_rate": 8.598150672490288e-05, + "loss": 1.0462, + "step": 2988 + }, + { + "epoch": 0.2670717269417205, + "grad_norm": 0.5423337817192078, + "learning_rate": 8.59714573658967e-05, + "loss": 0.9865, + "step": 2989 + }, + { + "epoch": 0.2671610784729823, + "grad_norm": 0.42116260528564453, + "learning_rate": 8.596140499389751e-05, + "loss": 1.0272, + "step": 2990 + }, + { + "epoch": 0.2672504300042442, + "grad_norm": 0.4866194725036621, + "learning_rate": 8.595134960974735e-05, + "loss": 1.0235, + "step": 2991 + }, + { + "epoch": 0.2673397815355061, + "grad_norm": 0.39662399888038635, + "learning_rate": 8.594129121428844e-05, + "loss": 0.9877, + "step": 2992 + }, + { + "epoch": 0.2674291330667679, + "grad_norm": 0.48044952750205994, + "learning_rate": 8.59312298083633e-05, + "loss": 0.9672, + "step": 2993 + }, + { + "epoch": 0.2675184845980298, + "grad_norm": 0.40628960728645325, + "learning_rate": 8.592116539281466e-05, + "loss": 1.0034, + "step": 2994 + }, + { + "epoch": 0.2676078361292917, + "grad_norm": 0.3931500017642975, + "learning_rate": 8.591109796848551e-05, + "loss": 1.0643, + "step": 2995 + }, + { + "epoch": 0.26769718766055356, + "grad_norm": 0.4383373260498047, + "learning_rate": 8.590102753621913e-05, + "loss": 0.943, + "step": 2996 + }, + { + "epoch": 0.2677865391918154, + "grad_norm": 0.41193974018096924, + "learning_rate": 8.5890954096859e-05, + "loss": 1.0634, + "step": 2997 + }, + { + "epoch": 0.26787589072307727, + "grad_norm": 0.480523020029068, + "learning_rate": 8.588087765124893e-05, + "loss": 0.9083, + "step": 2998 + }, + { + "epoch": 0.26796524225433915, + "grad_norm": 0.592286229133606, + "learning_rate": 8.587079820023287e-05, + "loss": 0.9664, + "step": 2999 + }, + { + "epoch": 0.268054593785601, + "grad_norm": 0.361767053604126, + "learning_rate": 8.586071574465511e-05, + "loss": 1.0011, + "step": 3000 + }, + { + "epoch": 0.26814394531686286, + "grad_norm": 0.38865599036216736, + "learning_rate": 8.585063028536016e-05, + "loss": 1.0153, + "step": 3001 + }, + { + "epoch": 0.26823329684812475, + "grad_norm": 0.4458223879337311, + "learning_rate": 8.584054182319279e-05, + "loss": 1.1077, + "step": 3002 + }, + { + "epoch": 0.2683226483793866, + "grad_norm": 0.4879177212715149, + "learning_rate": 8.583045035899799e-05, + "loss": 0.9969, + "step": 3003 + }, + { + "epoch": 0.26841199991064846, + "grad_norm": 0.47598427534103394, + "learning_rate": 8.582035589362107e-05, + "loss": 0.9383, + "step": 3004 + }, + { + "epoch": 0.26850135144191034, + "grad_norm": 0.5034103393554688, + "learning_rate": 8.581025842790751e-05, + "loss": 1.068, + "step": 3005 + }, + { + "epoch": 0.2685907029731722, + "grad_norm": 0.3905804753303528, + "learning_rate": 8.58001579627031e-05, + "loss": 1.0858, + "step": 3006 + }, + { + "epoch": 0.26868005450443405, + "grad_norm": 0.3951498866081238, + "learning_rate": 8.579005449885385e-05, + "loss": 0.9513, + "step": 3007 + }, + { + "epoch": 0.26876940603569593, + "grad_norm": 0.44809505343437195, + "learning_rate": 8.577994803720606e-05, + "loss": 0.9417, + "step": 3008 + }, + { + "epoch": 0.2688587575669578, + "grad_norm": 0.39570125937461853, + "learning_rate": 8.576983857860624e-05, + "loss": 1.0068, + "step": 3009 + }, + { + "epoch": 0.26894810909821965, + "grad_norm": 0.4790659546852112, + "learning_rate": 8.575972612390114e-05, + "loss": 0.9794, + "step": 3010 + }, + { + "epoch": 0.26903746062948153, + "grad_norm": 0.4047996699810028, + "learning_rate": 8.57496106739378e-05, + "loss": 0.9618, + "step": 3011 + }, + { + "epoch": 0.2691268121607434, + "grad_norm": 0.4825955629348755, + "learning_rate": 8.573949222956352e-05, + "loss": 0.9828, + "step": 3012 + }, + { + "epoch": 0.2692161636920053, + "grad_norm": 0.4488469362258911, + "learning_rate": 8.57293707916258e-05, + "loss": 0.9346, + "step": 3013 + }, + { + "epoch": 0.2693055152232671, + "grad_norm": 0.41295936703681946, + "learning_rate": 8.571924636097243e-05, + "loss": 1.007, + "step": 3014 + }, + { + "epoch": 0.269394866754529, + "grad_norm": 0.43881046772003174, + "learning_rate": 8.570911893845144e-05, + "loss": 1.061, + "step": 3015 + }, + { + "epoch": 0.2694842182857909, + "grad_norm": 0.386459618806839, + "learning_rate": 8.569898852491112e-05, + "loss": 1.0555, + "step": 3016 + }, + { + "epoch": 0.2695735698170527, + "grad_norm": 0.3662417531013489, + "learning_rate": 8.568885512119997e-05, + "loss": 1.056, + "step": 3017 + }, + { + "epoch": 0.2696629213483146, + "grad_norm": 0.38702377676963806, + "learning_rate": 8.56787187281668e-05, + "loss": 1.0434, + "step": 3018 + }, + { + "epoch": 0.2697522728795765, + "grad_norm": 0.40131479501724243, + "learning_rate": 8.566857934666062e-05, + "loss": 0.9943, + "step": 3019 + }, + { + "epoch": 0.26984162441083837, + "grad_norm": 0.47031551599502563, + "learning_rate": 8.565843697753072e-05, + "loss": 0.973, + "step": 3020 + }, + { + "epoch": 0.2699309759421002, + "grad_norm": 0.47709816694259644, + "learning_rate": 8.564829162162664e-05, + "loss": 0.9718, + "step": 3021 + }, + { + "epoch": 0.2700203274733621, + "grad_norm": 0.4619588553905487, + "learning_rate": 8.563814327979814e-05, + "loss": 0.979, + "step": 3022 + }, + { + "epoch": 0.27010967900462396, + "grad_norm": 0.4841311573982239, + "learning_rate": 8.562799195289527e-05, + "loss": 0.9617, + "step": 3023 + }, + { + "epoch": 0.2701990305358858, + "grad_norm": 0.385013222694397, + "learning_rate": 8.561783764176832e-05, + "loss": 1.0135, + "step": 3024 + }, + { + "epoch": 0.27028838206714767, + "grad_norm": 0.48148104548454285, + "learning_rate": 8.560768034726778e-05, + "loss": 0.9748, + "step": 3025 + }, + { + "epoch": 0.27037773359840955, + "grad_norm": 0.42841196060180664, + "learning_rate": 8.559752007024448e-05, + "loss": 0.9614, + "step": 3026 + }, + { + "epoch": 0.27046708512967144, + "grad_norm": 0.42978012561798096, + "learning_rate": 8.558735681154943e-05, + "loss": 1.0068, + "step": 3027 + }, + { + "epoch": 0.27055643666093326, + "grad_norm": 0.4296395778656006, + "learning_rate": 8.557719057203391e-05, + "loss": 1.0504, + "step": 3028 + }, + { + "epoch": 0.27064578819219515, + "grad_norm": 0.41547369956970215, + "learning_rate": 8.556702135254946e-05, + "loss": 1.0337, + "step": 3029 + }, + { + "epoch": 0.27073513972345703, + "grad_norm": 0.445972204208374, + "learning_rate": 8.555684915394786e-05, + "loss": 0.9933, + "step": 3030 + }, + { + "epoch": 0.27082449125471886, + "grad_norm": 0.5036845207214355, + "learning_rate": 8.554667397708112e-05, + "loss": 0.9972, + "step": 3031 + }, + { + "epoch": 0.27091384278598074, + "grad_norm": 0.4610268175601959, + "learning_rate": 8.553649582280155e-05, + "loss": 0.9926, + "step": 3032 + }, + { + "epoch": 0.2710031943172426, + "grad_norm": 0.5067682266235352, + "learning_rate": 8.552631469196164e-05, + "loss": 1.0285, + "step": 3033 + }, + { + "epoch": 0.27109254584850445, + "grad_norm": 0.3850218951702118, + "learning_rate": 8.551613058541421e-05, + "loss": 1.001, + "step": 3034 + }, + { + "epoch": 0.27118189737976633, + "grad_norm": 0.39576455950737, + "learning_rate": 8.550594350401225e-05, + "loss": 0.9882, + "step": 3035 + }, + { + "epoch": 0.2712712489110282, + "grad_norm": 0.4152897894382477, + "learning_rate": 8.549575344860907e-05, + "loss": 0.9561, + "step": 3036 + }, + { + "epoch": 0.2713606004422901, + "grad_norm": 0.6863245964050293, + "learning_rate": 8.548556042005819e-05, + "loss": 0.9862, + "step": 3037 + }, + { + "epoch": 0.27144995197355193, + "grad_norm": 0.3862869441509247, + "learning_rate": 8.547536441921336e-05, + "loss": 1.0157, + "step": 3038 + }, + { + "epoch": 0.2715393035048138, + "grad_norm": 0.37958061695098877, + "learning_rate": 8.546516544692861e-05, + "loss": 1.0548, + "step": 3039 + }, + { + "epoch": 0.2716286550360757, + "grad_norm": 0.4158911108970642, + "learning_rate": 8.545496350405825e-05, + "loss": 1.0377, + "step": 3040 + }, + { + "epoch": 0.2717180065673375, + "grad_norm": 0.43958452343940735, + "learning_rate": 8.544475859145676e-05, + "loss": 1.0113, + "step": 3041 + }, + { + "epoch": 0.2718073580985994, + "grad_norm": 0.4888957440853119, + "learning_rate": 8.543455070997892e-05, + "loss": 1.0082, + "step": 3042 + }, + { + "epoch": 0.2718967096298613, + "grad_norm": 0.4653918147087097, + "learning_rate": 8.542433986047977e-05, + "loss": 0.9827, + "step": 3043 + }, + { + "epoch": 0.27198606116112317, + "grad_norm": 0.4065740704536438, + "learning_rate": 8.541412604381454e-05, + "loss": 1.0425, + "step": 3044 + }, + { + "epoch": 0.272075412692385, + "grad_norm": 0.3832918703556061, + "learning_rate": 8.540390926083876e-05, + "loss": 0.9736, + "step": 3045 + }, + { + "epoch": 0.2721647642236469, + "grad_norm": 0.42073532938957214, + "learning_rate": 8.539368951240823e-05, + "loss": 1.0916, + "step": 3046 + }, + { + "epoch": 0.27225411575490877, + "grad_norm": 0.4821849763393402, + "learning_rate": 8.538346679937891e-05, + "loss": 1.0015, + "step": 3047 + }, + { + "epoch": 0.2723434672861706, + "grad_norm": 0.3975672125816345, + "learning_rate": 8.53732411226071e-05, + "loss": 0.9954, + "step": 3048 + }, + { + "epoch": 0.2724328188174325, + "grad_norm": 0.4945456385612488, + "learning_rate": 8.536301248294929e-05, + "loss": 1.0665, + "step": 3049 + }, + { + "epoch": 0.27252217034869436, + "grad_norm": 0.451886922121048, + "learning_rate": 8.535278088126225e-05, + "loss": 0.9445, + "step": 3050 + }, + { + "epoch": 0.27261152187995624, + "grad_norm": 0.4293072819709778, + "learning_rate": 8.534254631840296e-05, + "loss": 0.9888, + "step": 3051 + }, + { + "epoch": 0.27270087341121807, + "grad_norm": 0.4669989347457886, + "learning_rate": 8.53323087952287e-05, + "loss": 0.9923, + "step": 3052 + }, + { + "epoch": 0.27279022494247995, + "grad_norm": 0.4209103286266327, + "learning_rate": 8.532206831259696e-05, + "loss": 0.9666, + "step": 3053 + }, + { + "epoch": 0.27287957647374184, + "grad_norm": 0.41052526235580444, + "learning_rate": 8.531182487136549e-05, + "loss": 1.0202, + "step": 3054 + }, + { + "epoch": 0.27296892800500366, + "grad_norm": 0.46915799379348755, + "learning_rate": 8.53015784723923e-05, + "loss": 0.8682, + "step": 3055 + }, + { + "epoch": 0.27305827953626555, + "grad_norm": 0.3582940697669983, + "learning_rate": 8.529132911653563e-05, + "loss": 0.9678, + "step": 3056 + }, + { + "epoch": 0.27314763106752743, + "grad_norm": 0.4747898280620575, + "learning_rate": 8.528107680465394e-05, + "loss": 0.9526, + "step": 3057 + }, + { + "epoch": 0.2732369825987893, + "grad_norm": 0.43291470408439636, + "learning_rate": 8.527082153760601e-05, + "loss": 0.9883, + "step": 3058 + }, + { + "epoch": 0.27332633413005114, + "grad_norm": 0.4209512770175934, + "learning_rate": 8.526056331625083e-05, + "loss": 0.9941, + "step": 3059 + }, + { + "epoch": 0.273415685661313, + "grad_norm": 0.4599602520465851, + "learning_rate": 8.525030214144763e-05, + "loss": 1.0283, + "step": 3060 + }, + { + "epoch": 0.2735050371925749, + "grad_norm": 0.3656606078147888, + "learning_rate": 8.524003801405587e-05, + "loss": 1.017, + "step": 3061 + }, + { + "epoch": 0.27359438872383673, + "grad_norm": 0.43189749121665955, + "learning_rate": 8.522977093493528e-05, + "loss": 0.9632, + "step": 3062 + }, + { + "epoch": 0.2736837402550986, + "grad_norm": 0.5547308325767517, + "learning_rate": 8.521950090494587e-05, + "loss": 1.0423, + "step": 3063 + }, + { + "epoch": 0.2737730917863605, + "grad_norm": 0.48824217915534973, + "learning_rate": 8.520922792494783e-05, + "loss": 0.919, + "step": 3064 + }, + { + "epoch": 0.27386244331762233, + "grad_norm": 0.4252563416957855, + "learning_rate": 8.519895199580168e-05, + "loss": 0.9322, + "step": 3065 + }, + { + "epoch": 0.2739517948488842, + "grad_norm": 0.40307125449180603, + "learning_rate": 8.518867311836808e-05, + "loss": 1.0316, + "step": 3066 + }, + { + "epoch": 0.2740411463801461, + "grad_norm": 0.38896623253822327, + "learning_rate": 8.517839129350802e-05, + "loss": 1.0407, + "step": 3067 + }, + { + "epoch": 0.274130497911408, + "grad_norm": 0.40794113278388977, + "learning_rate": 8.516810652208272e-05, + "loss": 0.9766, + "step": 3068 + }, + { + "epoch": 0.2742198494426698, + "grad_norm": 0.4464515745639801, + "learning_rate": 8.515781880495363e-05, + "loss": 0.9997, + "step": 3069 + }, + { + "epoch": 0.2743092009739317, + "grad_norm": 0.3915344476699829, + "learning_rate": 8.514752814298248e-05, + "loss": 0.9563, + "step": 3070 + }, + { + "epoch": 0.27439855250519357, + "grad_norm": 0.3987552225589752, + "learning_rate": 8.513723453703119e-05, + "loss": 1.0145, + "step": 3071 + }, + { + "epoch": 0.2744879040364554, + "grad_norm": 0.3828818202018738, + "learning_rate": 8.512693798796196e-05, + "loss": 0.993, + "step": 3072 + }, + { + "epoch": 0.2745772555677173, + "grad_norm": 0.47670143842697144, + "learning_rate": 8.511663849663727e-05, + "loss": 0.9994, + "step": 3073 + }, + { + "epoch": 0.27466660709897917, + "grad_norm": 0.5165907144546509, + "learning_rate": 8.510633606391977e-05, + "loss": 0.974, + "step": 3074 + }, + { + "epoch": 0.27475595863024105, + "grad_norm": 0.49067389965057373, + "learning_rate": 8.509603069067243e-05, + "loss": 0.9133, + "step": 3075 + }, + { + "epoch": 0.2748453101615029, + "grad_norm": 0.3810344636440277, + "learning_rate": 8.50857223777584e-05, + "loss": 0.988, + "step": 3076 + }, + { + "epoch": 0.27493466169276476, + "grad_norm": 0.4502268135547638, + "learning_rate": 8.507541112604116e-05, + "loss": 0.9268, + "step": 3077 + }, + { + "epoch": 0.27502401322402664, + "grad_norm": 0.42537322640419006, + "learning_rate": 8.506509693638435e-05, + "loss": 1.0575, + "step": 3078 + }, + { + "epoch": 0.27511336475528847, + "grad_norm": 0.37440937757492065, + "learning_rate": 8.505477980965191e-05, + "loss": 1.0067, + "step": 3079 + }, + { + "epoch": 0.27520271628655035, + "grad_norm": 0.4221092164516449, + "learning_rate": 8.5044459746708e-05, + "loss": 0.9788, + "step": 3080 + }, + { + "epoch": 0.27529206781781224, + "grad_norm": 0.5111864805221558, + "learning_rate": 8.503413674841703e-05, + "loss": 0.9639, + "step": 3081 + }, + { + "epoch": 0.2753814193490741, + "grad_norm": 0.4051772654056549, + "learning_rate": 8.502381081564369e-05, + "loss": 1.0491, + "step": 3082 + }, + { + "epoch": 0.27547077088033595, + "grad_norm": 0.4286561608314514, + "learning_rate": 8.501348194925285e-05, + "loss": 0.9884, + "step": 3083 + }, + { + "epoch": 0.27556012241159783, + "grad_norm": 0.4711834490299225, + "learning_rate": 8.500315015010968e-05, + "loss": 1.025, + "step": 3084 + }, + { + "epoch": 0.2756494739428597, + "grad_norm": 0.43955665826797485, + "learning_rate": 8.499281541907959e-05, + "loss": 0.9961, + "step": 3085 + }, + { + "epoch": 0.27573882547412154, + "grad_norm": 0.38096296787261963, + "learning_rate": 8.498247775702821e-05, + "loss": 1.0081, + "step": 3086 + }, + { + "epoch": 0.2758281770053834, + "grad_norm": 0.40169036388397217, + "learning_rate": 8.497213716482142e-05, + "loss": 1.0185, + "step": 3087 + }, + { + "epoch": 0.2759175285366453, + "grad_norm": 0.4468224346637726, + "learning_rate": 8.496179364332539e-05, + "loss": 1.0679, + "step": 3088 + }, + { + "epoch": 0.2760068800679072, + "grad_norm": 0.5090979337692261, + "learning_rate": 8.495144719340646e-05, + "loss": 1.0041, + "step": 3089 + }, + { + "epoch": 0.276096231599169, + "grad_norm": 0.4413253366947174, + "learning_rate": 8.494109781593127e-05, + "loss": 0.998, + "step": 3090 + }, + { + "epoch": 0.2761855831304309, + "grad_norm": 0.5128616094589233, + "learning_rate": 8.49307455117667e-05, + "loss": 0.9282, + "step": 3091 + }, + { + "epoch": 0.2762749346616928, + "grad_norm": 0.4610481858253479, + "learning_rate": 8.492039028177986e-05, + "loss": 0.9687, + "step": 3092 + }, + { + "epoch": 0.2763642861929546, + "grad_norm": 0.42069047689437866, + "learning_rate": 8.491003212683811e-05, + "loss": 0.9726, + "step": 3093 + }, + { + "epoch": 0.2764536377242165, + "grad_norm": 0.42152658104896545, + "learning_rate": 8.489967104780902e-05, + "loss": 0.968, + "step": 3094 + }, + { + "epoch": 0.2765429892554784, + "grad_norm": 0.40881800651550293, + "learning_rate": 8.48893070455605e-05, + "loss": 1.0844, + "step": 3095 + }, + { + "epoch": 0.2766323407867402, + "grad_norm": 0.4400191009044647, + "learning_rate": 8.487894012096065e-05, + "loss": 0.9969, + "step": 3096 + }, + { + "epoch": 0.2767216923180021, + "grad_norm": 0.4783930480480194, + "learning_rate": 8.486857027487775e-05, + "loss": 0.9458, + "step": 3097 + }, + { + "epoch": 0.27681104384926397, + "grad_norm": 0.35160624980926514, + "learning_rate": 8.485819750818043e-05, + "loss": 1.0798, + "step": 3098 + }, + { + "epoch": 0.27690039538052585, + "grad_norm": 0.4336118996143341, + "learning_rate": 8.484782182173749e-05, + "loss": 1.0089, + "step": 3099 + }, + { + "epoch": 0.2769897469117877, + "grad_norm": 0.3933138847351074, + "learning_rate": 8.483744321641805e-05, + "loss": 1.0987, + "step": 3100 + }, + { + "epoch": 0.27707909844304957, + "grad_norm": 0.44518598914146423, + "learning_rate": 8.482706169309138e-05, + "loss": 0.9875, + "step": 3101 + }, + { + "epoch": 0.27716844997431145, + "grad_norm": 0.4272666275501251, + "learning_rate": 8.481667725262708e-05, + "loss": 0.986, + "step": 3102 + }, + { + "epoch": 0.2772578015055733, + "grad_norm": 0.40855729579925537, + "learning_rate": 8.480628989589491e-05, + "loss": 0.9967, + "step": 3103 + }, + { + "epoch": 0.27734715303683516, + "grad_norm": 0.42171216011047363, + "learning_rate": 8.479589962376497e-05, + "loss": 1.0869, + "step": 3104 + }, + { + "epoch": 0.27743650456809704, + "grad_norm": 0.39265212416648865, + "learning_rate": 8.478550643710754e-05, + "loss": 1.0718, + "step": 3105 + }, + { + "epoch": 0.2775258560993589, + "grad_norm": 0.4548582136631012, + "learning_rate": 8.477511033679317e-05, + "loss": 0.9926, + "step": 3106 + }, + { + "epoch": 0.27761520763062075, + "grad_norm": 0.37569287419319153, + "learning_rate": 8.476471132369262e-05, + "loss": 1.022, + "step": 3107 + }, + { + "epoch": 0.27770455916188264, + "grad_norm": 0.3934466242790222, + "learning_rate": 8.475430939867693e-05, + "loss": 0.9714, + "step": 3108 + }, + { + "epoch": 0.2777939106931445, + "grad_norm": 0.4344901144504547, + "learning_rate": 8.474390456261738e-05, + "loss": 0.9954, + "step": 3109 + }, + { + "epoch": 0.27788326222440635, + "grad_norm": 0.4985685646533966, + "learning_rate": 8.473349681638547e-05, + "loss": 0.949, + "step": 3110 + }, + { + "epoch": 0.27797261375566823, + "grad_norm": 0.40631502866744995, + "learning_rate": 8.472308616085298e-05, + "loss": 1.0339, + "step": 3111 + }, + { + "epoch": 0.2780619652869301, + "grad_norm": 0.4710947871208191, + "learning_rate": 8.47126725968919e-05, + "loss": 0.9509, + "step": 3112 + }, + { + "epoch": 0.278151316818192, + "grad_norm": 0.4414302110671997, + "learning_rate": 8.470225612537448e-05, + "loss": 0.9992, + "step": 3113 + }, + { + "epoch": 0.2782406683494538, + "grad_norm": 0.3889710605144501, + "learning_rate": 8.46918367471732e-05, + "loss": 0.976, + "step": 3114 + }, + { + "epoch": 0.2783300198807157, + "grad_norm": 0.41074416041374207, + "learning_rate": 8.468141446316082e-05, + "loss": 1.0087, + "step": 3115 + }, + { + "epoch": 0.2784193714119776, + "grad_norm": 0.47910332679748535, + "learning_rate": 8.46709892742103e-05, + "loss": 0.9827, + "step": 3116 + }, + { + "epoch": 0.2785087229432394, + "grad_norm": 0.3920195400714874, + "learning_rate": 8.466056118119485e-05, + "loss": 1.0017, + "step": 3117 + }, + { + "epoch": 0.2785980744745013, + "grad_norm": 0.4296383261680603, + "learning_rate": 8.465013018498795e-05, + "loss": 0.9948, + "step": 3118 + }, + { + "epoch": 0.2786874260057632, + "grad_norm": 0.408699631690979, + "learning_rate": 8.463969628646332e-05, + "loss": 1.0166, + "step": 3119 + }, + { + "epoch": 0.27877677753702507, + "grad_norm": 0.46718454360961914, + "learning_rate": 8.462925948649488e-05, + "loss": 0.9799, + "step": 3120 + }, + { + "epoch": 0.2788661290682869, + "grad_norm": 0.40248745679855347, + "learning_rate": 8.461881978595683e-05, + "loss": 1.0439, + "step": 3121 + }, + { + "epoch": 0.2789554805995488, + "grad_norm": 0.4227806329727173, + "learning_rate": 8.460837718572361e-05, + "loss": 1.0268, + "step": 3122 + }, + { + "epoch": 0.27904483213081066, + "grad_norm": 0.44259488582611084, + "learning_rate": 8.45979316866699e-05, + "loss": 1.0697, + "step": 3123 + }, + { + "epoch": 0.2791341836620725, + "grad_norm": 0.43050849437713623, + "learning_rate": 8.458748328967065e-05, + "loss": 1.0158, + "step": 3124 + }, + { + "epoch": 0.27922353519333437, + "grad_norm": 0.4049340486526489, + "learning_rate": 8.457703199560098e-05, + "loss": 0.9342, + "step": 3125 + }, + { + "epoch": 0.27931288672459625, + "grad_norm": 0.43839579820632935, + "learning_rate": 8.456657780533632e-05, + "loss": 0.951, + "step": 3126 + }, + { + "epoch": 0.27940223825585814, + "grad_norm": 0.44916754961013794, + "learning_rate": 8.45561207197523e-05, + "loss": 0.9717, + "step": 3127 + }, + { + "epoch": 0.27949158978711997, + "grad_norm": 0.4199538230895996, + "learning_rate": 8.454566073972485e-05, + "loss": 0.9371, + "step": 3128 + }, + { + "epoch": 0.27958094131838185, + "grad_norm": 0.4070294201374054, + "learning_rate": 8.453519786613007e-05, + "loss": 0.9972, + "step": 3129 + }, + { + "epoch": 0.27967029284964373, + "grad_norm": 0.4184653162956238, + "learning_rate": 8.452473209984435e-05, + "loss": 0.9807, + "step": 3130 + }, + { + "epoch": 0.27975964438090556, + "grad_norm": 0.4994652271270752, + "learning_rate": 8.451426344174433e-05, + "loss": 1.0829, + "step": 3131 + }, + { + "epoch": 0.27984899591216744, + "grad_norm": 0.4521782696247101, + "learning_rate": 8.450379189270683e-05, + "loss": 1.0126, + "step": 3132 + }, + { + "epoch": 0.2799383474434293, + "grad_norm": 0.43609222769737244, + "learning_rate": 8.449331745360898e-05, + "loss": 1.0078, + "step": 3133 + }, + { + "epoch": 0.28002769897469115, + "grad_norm": 0.4149281680583954, + "learning_rate": 8.448284012532812e-05, + "loss": 0.9348, + "step": 3134 + }, + { + "epoch": 0.28011705050595304, + "grad_norm": 0.4595349431037903, + "learning_rate": 8.447235990874182e-05, + "loss": 1.0162, + "step": 3135 + }, + { + "epoch": 0.2802064020372149, + "grad_norm": 0.48324936628341675, + "learning_rate": 8.446187680472797e-05, + "loss": 0.9531, + "step": 3136 + }, + { + "epoch": 0.2802957535684768, + "grad_norm": 0.40329238772392273, + "learning_rate": 8.445139081416458e-05, + "loss": 0.977, + "step": 3137 + }, + { + "epoch": 0.28038510509973863, + "grad_norm": 0.5319374203681946, + "learning_rate": 8.444090193792997e-05, + "loss": 0.9266, + "step": 3138 + }, + { + "epoch": 0.2804744566310005, + "grad_norm": 0.43720632791519165, + "learning_rate": 8.443041017690271e-05, + "loss": 1.049, + "step": 3139 + }, + { + "epoch": 0.2805638081622624, + "grad_norm": 0.4096292555332184, + "learning_rate": 8.441991553196162e-05, + "loss": 1.027, + "step": 3140 + }, + { + "epoch": 0.2806531596935242, + "grad_norm": 0.4749426543712616, + "learning_rate": 8.440941800398571e-05, + "loss": 0.9608, + "step": 3141 + }, + { + "epoch": 0.2807425112247861, + "grad_norm": 0.40563464164733887, + "learning_rate": 8.439891759385425e-05, + "loss": 1.025, + "step": 3142 + }, + { + "epoch": 0.280831862756048, + "grad_norm": 0.4449627101421356, + "learning_rate": 8.438841430244678e-05, + "loss": 0.9698, + "step": 3143 + }, + { + "epoch": 0.2809212142873099, + "grad_norm": 0.4511934816837311, + "learning_rate": 8.437790813064305e-05, + "loss": 0.9556, + "step": 3144 + }, + { + "epoch": 0.2810105658185717, + "grad_norm": 0.4390813112258911, + "learning_rate": 8.43673990793231e-05, + "loss": 1.0457, + "step": 3145 + }, + { + "epoch": 0.2810999173498336, + "grad_norm": 0.4567374885082245, + "learning_rate": 8.435688714936711e-05, + "loss": 0.9987, + "step": 3146 + }, + { + "epoch": 0.28118926888109547, + "grad_norm": 0.39277079701423645, + "learning_rate": 8.434637234165563e-05, + "loss": 1.0383, + "step": 3147 + }, + { + "epoch": 0.2812786204123573, + "grad_norm": 0.36896324157714844, + "learning_rate": 8.433585465706934e-05, + "loss": 0.9997, + "step": 3148 + }, + { + "epoch": 0.2813679719436192, + "grad_norm": 0.40107041597366333, + "learning_rate": 8.432533409648925e-05, + "loss": 1.0192, + "step": 3149 + }, + { + "epoch": 0.28145732347488106, + "grad_norm": 0.5373698472976685, + "learning_rate": 8.431481066079654e-05, + "loss": 0.9916, + "step": 3150 + }, + { + "epoch": 0.28154667500614294, + "grad_norm": 0.4341030418872833, + "learning_rate": 8.430428435087267e-05, + "loss": 1.0095, + "step": 3151 + }, + { + "epoch": 0.28163602653740477, + "grad_norm": 0.4856875240802765, + "learning_rate": 8.429375516759933e-05, + "loss": 0.9375, + "step": 3152 + }, + { + "epoch": 0.28172537806866665, + "grad_norm": 0.4683944880962372, + "learning_rate": 8.428322311185844e-05, + "loss": 1.0203, + "step": 3153 + }, + { + "epoch": 0.28181472959992854, + "grad_norm": 0.39955028891563416, + "learning_rate": 8.427268818453218e-05, + "loss": 1.0851, + "step": 3154 + }, + { + "epoch": 0.28190408113119036, + "grad_norm": 0.4985107481479645, + "learning_rate": 8.426215038650296e-05, + "loss": 0.9497, + "step": 3155 + }, + { + "epoch": 0.28199343266245225, + "grad_norm": 0.40328237414360046, + "learning_rate": 8.425160971865345e-05, + "loss": 1.019, + "step": 3156 + }, + { + "epoch": 0.28208278419371413, + "grad_norm": 0.4598018527030945, + "learning_rate": 8.424106618186653e-05, + "loss": 0.8932, + "step": 3157 + }, + { + "epoch": 0.282172135724976, + "grad_norm": 0.46386003494262695, + "learning_rate": 8.423051977702532e-05, + "loss": 1.0043, + "step": 3158 + }, + { + "epoch": 0.28226148725623784, + "grad_norm": 0.4086954593658447, + "learning_rate": 8.421997050501323e-05, + "loss": 1.0223, + "step": 3159 + }, + { + "epoch": 0.2823508387874997, + "grad_norm": 0.4212207794189453, + "learning_rate": 8.420941836671384e-05, + "loss": 0.9869, + "step": 3160 + }, + { + "epoch": 0.2824401903187616, + "grad_norm": 0.3890816271305084, + "learning_rate": 8.419886336301099e-05, + "loss": 1.0268, + "step": 3161 + }, + { + "epoch": 0.28252954185002344, + "grad_norm": 0.39444929361343384, + "learning_rate": 8.418830549478885e-05, + "loss": 1.0987, + "step": 3162 + }, + { + "epoch": 0.2826188933812853, + "grad_norm": 0.43861931562423706, + "learning_rate": 8.417774476293167e-05, + "loss": 0.922, + "step": 3163 + }, + { + "epoch": 0.2827082449125472, + "grad_norm": 0.42500317096710205, + "learning_rate": 8.416718116832406e-05, + "loss": 0.9877, + "step": 3164 + }, + { + "epoch": 0.28279759644380903, + "grad_norm": 0.4124399721622467, + "learning_rate": 8.415661471185084e-05, + "loss": 1.0055, + "step": 3165 + }, + { + "epoch": 0.2828869479750709, + "grad_norm": 0.41332152485847473, + "learning_rate": 8.414604539439704e-05, + "loss": 1.0422, + "step": 3166 + }, + { + "epoch": 0.2829762995063328, + "grad_norm": 0.42248642444610596, + "learning_rate": 8.413547321684798e-05, + "loss": 1.0358, + "step": 3167 + }, + { + "epoch": 0.2830656510375947, + "grad_norm": 0.42159736156463623, + "learning_rate": 8.412489818008918e-05, + "loss": 1.0113, + "step": 3168 + }, + { + "epoch": 0.2831550025688565, + "grad_norm": 0.41881147027015686, + "learning_rate": 8.41143202850064e-05, + "loss": 1.0388, + "step": 3169 + }, + { + "epoch": 0.2832443541001184, + "grad_norm": 0.4449400007724762, + "learning_rate": 8.41037395324857e-05, + "loss": 1.0147, + "step": 3170 + }, + { + "epoch": 0.2833337056313803, + "grad_norm": 0.4245634377002716, + "learning_rate": 8.409315592341324e-05, + "loss": 1.0661, + "step": 3171 + }, + { + "epoch": 0.2834230571626421, + "grad_norm": 0.4041271209716797, + "learning_rate": 8.408256945867561e-05, + "loss": 1.0927, + "step": 3172 + }, + { + "epoch": 0.283512408693904, + "grad_norm": 0.5533398389816284, + "learning_rate": 8.407198013915947e-05, + "loss": 0.9166, + "step": 3173 + }, + { + "epoch": 0.28360176022516587, + "grad_norm": 0.39345189929008484, + "learning_rate": 8.406138796575183e-05, + "loss": 0.9596, + "step": 3174 + }, + { + "epoch": 0.28369111175642775, + "grad_norm": 0.39271482825279236, + "learning_rate": 8.405079293933986e-05, + "loss": 1.0173, + "step": 3175 + }, + { + "epoch": 0.2837804632876896, + "grad_norm": 0.38316190242767334, + "learning_rate": 8.404019506081104e-05, + "loss": 0.9547, + "step": 3176 + }, + { + "epoch": 0.28386981481895146, + "grad_norm": 0.3933804929256439, + "learning_rate": 8.402959433105303e-05, + "loss": 0.9311, + "step": 3177 + }, + { + "epoch": 0.28395916635021334, + "grad_norm": 0.39881816506385803, + "learning_rate": 8.401899075095378e-05, + "loss": 0.9956, + "step": 3178 + }, + { + "epoch": 0.28404851788147517, + "grad_norm": 0.4697539806365967, + "learning_rate": 8.400838432140142e-05, + "loss": 0.935, + "step": 3179 + }, + { + "epoch": 0.28413786941273705, + "grad_norm": 0.4180028438568115, + "learning_rate": 8.399777504328438e-05, + "loss": 1.0263, + "step": 3180 + }, + { + "epoch": 0.28422722094399894, + "grad_norm": 0.4734386205673218, + "learning_rate": 8.398716291749129e-05, + "loss": 0.911, + "step": 3181 + }, + { + "epoch": 0.2843165724752608, + "grad_norm": 0.41403621435165405, + "learning_rate": 8.397654794491103e-05, + "loss": 1.0165, + "step": 3182 + }, + { + "epoch": 0.28440592400652265, + "grad_norm": 0.5192162990570068, + "learning_rate": 8.396593012643272e-05, + "loss": 0.9693, + "step": 3183 + }, + { + "epoch": 0.28449527553778453, + "grad_norm": 0.40429311990737915, + "learning_rate": 8.39553094629457e-05, + "loss": 1.0161, + "step": 3184 + }, + { + "epoch": 0.2845846270690464, + "grad_norm": 0.43155255913734436, + "learning_rate": 8.394468595533958e-05, + "loss": 1.0094, + "step": 3185 + }, + { + "epoch": 0.28467397860030824, + "grad_norm": 0.4038214683532715, + "learning_rate": 8.393405960450418e-05, + "loss": 1.0635, + "step": 3186 + }, + { + "epoch": 0.2847633301315701, + "grad_norm": 0.5239593982696533, + "learning_rate": 8.392343041132959e-05, + "loss": 0.9088, + "step": 3187 + }, + { + "epoch": 0.284852681662832, + "grad_norm": 0.4169578552246094, + "learning_rate": 8.391279837670609e-05, + "loss": 0.9349, + "step": 3188 + }, + { + "epoch": 0.2849420331940939, + "grad_norm": 0.5473026633262634, + "learning_rate": 8.390216350152425e-05, + "loss": 1.0211, + "step": 3189 + }, + { + "epoch": 0.2850313847253557, + "grad_norm": 0.45344579219818115, + "learning_rate": 8.389152578667485e-05, + "loss": 1.0067, + "step": 3190 + }, + { + "epoch": 0.2851207362566176, + "grad_norm": 0.4618963599205017, + "learning_rate": 8.38808852330489e-05, + "loss": 0.996, + "step": 3191 + }, + { + "epoch": 0.2852100877878795, + "grad_norm": 0.5052065253257751, + "learning_rate": 8.387024184153766e-05, + "loss": 0.9071, + "step": 3192 + }, + { + "epoch": 0.2852994393191413, + "grad_norm": 0.413125604391098, + "learning_rate": 8.385959561303265e-05, + "loss": 1.022, + "step": 3193 + }, + { + "epoch": 0.2853887908504032, + "grad_norm": 0.3971850574016571, + "learning_rate": 8.384894654842559e-05, + "loss": 1.0205, + "step": 3194 + }, + { + "epoch": 0.2854781423816651, + "grad_norm": 0.3980804681777954, + "learning_rate": 8.383829464860844e-05, + "loss": 1.0146, + "step": 3195 + }, + { + "epoch": 0.2855674939129269, + "grad_norm": 0.46941179037094116, + "learning_rate": 8.382763991447344e-05, + "loss": 1.0008, + "step": 3196 + }, + { + "epoch": 0.2856568454441888, + "grad_norm": 0.38151049613952637, + "learning_rate": 8.381698234691301e-05, + "loss": 1.0283, + "step": 3197 + }, + { + "epoch": 0.2857461969754507, + "grad_norm": 0.5456905364990234, + "learning_rate": 8.380632194681986e-05, + "loss": 0.92, + "step": 3198 + }, + { + "epoch": 0.28583554850671256, + "grad_norm": 0.3786112368106842, + "learning_rate": 8.379565871508688e-05, + "loss": 0.9825, + "step": 3199 + }, + { + "epoch": 0.2859249000379744, + "grad_norm": 0.48761385679244995, + "learning_rate": 8.378499265260724e-05, + "loss": 0.9437, + "step": 3200 + }, + { + "epoch": 0.28601425156923627, + "grad_norm": 0.4335952401161194, + "learning_rate": 8.377432376027437e-05, + "loss": 0.9783, + "step": 3201 + }, + { + "epoch": 0.28610360310049815, + "grad_norm": 0.44660720229148865, + "learning_rate": 8.376365203898184e-05, + "loss": 1.0364, + "step": 3202 + }, + { + "epoch": 0.28619295463176, + "grad_norm": 0.4328174889087677, + "learning_rate": 8.375297748962358e-05, + "loss": 1.002, + "step": 3203 + }, + { + "epoch": 0.28628230616302186, + "grad_norm": 0.4029334783554077, + "learning_rate": 8.374230011309368e-05, + "loss": 0.9958, + "step": 3204 + }, + { + "epoch": 0.28637165769428374, + "grad_norm": 0.49282363057136536, + "learning_rate": 8.373161991028646e-05, + "loss": 0.8783, + "step": 3205 + }, + { + "epoch": 0.2864610092255456, + "grad_norm": 0.4617169499397278, + "learning_rate": 8.372093688209655e-05, + "loss": 1.0666, + "step": 3206 + }, + { + "epoch": 0.28655036075680745, + "grad_norm": 0.44560667872428894, + "learning_rate": 8.371025102941872e-05, + "loss": 0.9826, + "step": 3207 + }, + { + "epoch": 0.28663971228806934, + "grad_norm": 0.4756043255329132, + "learning_rate": 8.369956235314802e-05, + "loss": 0.9956, + "step": 3208 + }, + { + "epoch": 0.2867290638193312, + "grad_norm": 0.4700027108192444, + "learning_rate": 8.368887085417977e-05, + "loss": 1.0006, + "step": 3209 + }, + { + "epoch": 0.28681841535059305, + "grad_norm": 0.47527703642845154, + "learning_rate": 8.367817653340951e-05, + "loss": 0.9828, + "step": 3210 + }, + { + "epoch": 0.28690776688185493, + "grad_norm": 0.5730937719345093, + "learning_rate": 8.366747939173297e-05, + "loss": 0.9436, + "step": 3211 + }, + { + "epoch": 0.2869971184131168, + "grad_norm": 0.47630396485328674, + "learning_rate": 8.365677943004617e-05, + "loss": 0.9992, + "step": 3212 + }, + { + "epoch": 0.2870864699443787, + "grad_norm": 0.40669214725494385, + "learning_rate": 8.364607664924533e-05, + "loss": 1.015, + "step": 3213 + }, + { + "epoch": 0.2871758214756405, + "grad_norm": 0.3980100750923157, + "learning_rate": 8.363537105022692e-05, + "loss": 0.968, + "step": 3214 + }, + { + "epoch": 0.2872651730069024, + "grad_norm": 0.4197221100330353, + "learning_rate": 8.362466263388766e-05, + "loss": 1.0064, + "step": 3215 + }, + { + "epoch": 0.2873545245381643, + "grad_norm": 0.42029672861099243, + "learning_rate": 8.36139514011245e-05, + "loss": 1.0118, + "step": 3216 + }, + { + "epoch": 0.2874438760694261, + "grad_norm": 0.3882405757904053, + "learning_rate": 8.36032373528346e-05, + "loss": 1.0255, + "step": 3217 + }, + { + "epoch": 0.287533227600688, + "grad_norm": 0.4027383625507355, + "learning_rate": 8.359252048991539e-05, + "loss": 1.013, + "step": 3218 + }, + { + "epoch": 0.2876225791319499, + "grad_norm": 0.4605361819267273, + "learning_rate": 8.358180081326454e-05, + "loss": 0.9371, + "step": 3219 + }, + { + "epoch": 0.28771193066321177, + "grad_norm": 0.4364441931247711, + "learning_rate": 8.357107832377988e-05, + "loss": 0.9737, + "step": 3220 + }, + { + "epoch": 0.2878012821944736, + "grad_norm": 0.4793910086154938, + "learning_rate": 8.356035302235959e-05, + "loss": 0.9819, + "step": 3221 + }, + { + "epoch": 0.2878906337257355, + "grad_norm": 0.39052850008010864, + "learning_rate": 8.354962490990202e-05, + "loss": 1.0593, + "step": 3222 + }, + { + "epoch": 0.28797998525699736, + "grad_norm": 0.4997557997703552, + "learning_rate": 8.353889398730572e-05, + "loss": 1.0613, + "step": 3223 + }, + { + "epoch": 0.2880693367882592, + "grad_norm": 0.398324191570282, + "learning_rate": 8.352816025546957e-05, + "loss": 0.9801, + "step": 3224 + }, + { + "epoch": 0.2881586883195211, + "grad_norm": 0.5084810256958008, + "learning_rate": 8.351742371529261e-05, + "loss": 1.0013, + "step": 3225 + }, + { + "epoch": 0.28824803985078296, + "grad_norm": 0.4650806486606598, + "learning_rate": 8.350668436767413e-05, + "loss": 0.9618, + "step": 3226 + }, + { + "epoch": 0.2883373913820448, + "grad_norm": 0.43210747838020325, + "learning_rate": 8.349594221351368e-05, + "loss": 0.9195, + "step": 3227 + }, + { + "epoch": 0.28842674291330667, + "grad_norm": 0.4068807363510132, + "learning_rate": 8.348519725371105e-05, + "loss": 1.1076, + "step": 3228 + }, + { + "epoch": 0.28851609444456855, + "grad_norm": 0.4721238911151886, + "learning_rate": 8.347444948916622e-05, + "loss": 1.0048, + "step": 3229 + }, + { + "epoch": 0.28860544597583043, + "grad_norm": 0.4541776776313782, + "learning_rate": 8.346369892077942e-05, + "loss": 0.9736, + "step": 3230 + }, + { + "epoch": 0.28869479750709226, + "grad_norm": 0.5041652917861938, + "learning_rate": 8.345294554945113e-05, + "loss": 0.9833, + "step": 3231 + }, + { + "epoch": 0.28878414903835414, + "grad_norm": 0.48984166979789734, + "learning_rate": 8.344218937608211e-05, + "loss": 0.9144, + "step": 3232 + }, + { + "epoch": 0.288873500569616, + "grad_norm": 0.3938770294189453, + "learning_rate": 8.343143040157322e-05, + "loss": 0.9965, + "step": 3233 + }, + { + "epoch": 0.28896285210087785, + "grad_norm": 0.36259424686431885, + "learning_rate": 8.34206686268257e-05, + "loss": 1.0121, + "step": 3234 + }, + { + "epoch": 0.28905220363213974, + "grad_norm": 0.46610236167907715, + "learning_rate": 8.340990405274091e-05, + "loss": 0.9109, + "step": 3235 + }, + { + "epoch": 0.2891415551634016, + "grad_norm": 0.39891326427459717, + "learning_rate": 8.339913668022057e-05, + "loss": 0.9904, + "step": 3236 + }, + { + "epoch": 0.2892309066946635, + "grad_norm": 0.4756152629852295, + "learning_rate": 8.338836651016652e-05, + "loss": 1.0002, + "step": 3237 + }, + { + "epoch": 0.28932025822592533, + "grad_norm": 0.42526543140411377, + "learning_rate": 8.337759354348087e-05, + "loss": 1.0233, + "step": 3238 + }, + { + "epoch": 0.2894096097571872, + "grad_norm": 0.4663248360157013, + "learning_rate": 8.336681778106598e-05, + "loss": 1.0187, + "step": 3239 + }, + { + "epoch": 0.2894989612884491, + "grad_norm": 0.39710041880607605, + "learning_rate": 8.335603922382444e-05, + "loss": 1.0259, + "step": 3240 + }, + { + "epoch": 0.2895883128197109, + "grad_norm": 0.4082708954811096, + "learning_rate": 8.334525787265905e-05, + "loss": 0.9772, + "step": 3241 + }, + { + "epoch": 0.2896776643509728, + "grad_norm": 0.41001391410827637, + "learning_rate": 8.333447372847291e-05, + "loss": 0.987, + "step": 3242 + }, + { + "epoch": 0.2897670158822347, + "grad_norm": 0.39033588767051697, + "learning_rate": 8.332368679216927e-05, + "loss": 0.9732, + "step": 3243 + }, + { + "epoch": 0.2898563674134966, + "grad_norm": 0.4427189230918884, + "learning_rate": 8.331289706465162e-05, + "loss": 1.0081, + "step": 3244 + }, + { + "epoch": 0.2899457189447584, + "grad_norm": 0.3782840669155121, + "learning_rate": 8.330210454682379e-05, + "loss": 1.0221, + "step": 3245 + }, + { + "epoch": 0.2900350704760203, + "grad_norm": 0.460595041513443, + "learning_rate": 8.32913092395897e-05, + "loss": 1.0137, + "step": 3246 + }, + { + "epoch": 0.29012442200728217, + "grad_norm": 0.45908665657043457, + "learning_rate": 8.328051114385362e-05, + "loss": 0.9745, + "step": 3247 + }, + { + "epoch": 0.290213773538544, + "grad_norm": 0.5565299987792969, + "learning_rate": 8.326971026052e-05, + "loss": 0.87, + "step": 3248 + }, + { + "epoch": 0.2903031250698059, + "grad_norm": 0.37024036049842834, + "learning_rate": 8.32589065904935e-05, + "loss": 1.031, + "step": 3249 + }, + { + "epoch": 0.29039247660106776, + "grad_norm": 0.42501765489578247, + "learning_rate": 8.324810013467905e-05, + "loss": 0.9865, + "step": 3250 + }, + { + "epoch": 0.29048182813232964, + "grad_norm": 0.38723427057266235, + "learning_rate": 8.323729089398181e-05, + "loss": 1.0355, + "step": 3251 + }, + { + "epoch": 0.2905711796635915, + "grad_norm": 0.37960848212242126, + "learning_rate": 8.322647886930718e-05, + "loss": 0.955, + "step": 3252 + }, + { + "epoch": 0.29066053119485336, + "grad_norm": 0.3721432089805603, + "learning_rate": 8.32156640615608e-05, + "loss": 0.982, + "step": 3253 + }, + { + "epoch": 0.29074988272611524, + "grad_norm": 0.43282589316368103, + "learning_rate": 8.320484647164848e-05, + "loss": 1.0314, + "step": 3254 + }, + { + "epoch": 0.29083923425737707, + "grad_norm": 0.4642455577850342, + "learning_rate": 8.319402610047633e-05, + "loss": 0.9311, + "step": 3255 + }, + { + "epoch": 0.29092858578863895, + "grad_norm": 0.3716782331466675, + "learning_rate": 8.318320294895067e-05, + "loss": 1.0325, + "step": 3256 + }, + { + "epoch": 0.29101793731990083, + "grad_norm": 0.4215312600135803, + "learning_rate": 8.317237701797807e-05, + "loss": 0.9463, + "step": 3257 + }, + { + "epoch": 0.29110728885116266, + "grad_norm": 0.49139776825904846, + "learning_rate": 8.316154830846528e-05, + "loss": 0.9384, + "step": 3258 + }, + { + "epoch": 0.29119664038242454, + "grad_norm": 0.4280232787132263, + "learning_rate": 8.315071682131936e-05, + "loss": 1.0093, + "step": 3259 + }, + { + "epoch": 0.2912859919136864, + "grad_norm": 0.42184942960739136, + "learning_rate": 8.313988255744754e-05, + "loss": 0.9151, + "step": 3260 + }, + { + "epoch": 0.2913753434449483, + "grad_norm": 0.44842901825904846, + "learning_rate": 8.312904551775731e-05, + "loss": 1.0307, + "step": 3261 + }, + { + "epoch": 0.29146469497621014, + "grad_norm": 0.39945071935653687, + "learning_rate": 8.311820570315639e-05, + "loss": 1.0681, + "step": 3262 + }, + { + "epoch": 0.291554046507472, + "grad_norm": 0.4484541416168213, + "learning_rate": 8.310736311455271e-05, + "loss": 0.9827, + "step": 3263 + }, + { + "epoch": 0.2916433980387339, + "grad_norm": 0.39152559638023376, + "learning_rate": 8.30965177528545e-05, + "loss": 1.0095, + "step": 3264 + }, + { + "epoch": 0.29173274956999573, + "grad_norm": 0.4719206392765045, + "learning_rate": 8.308566961897012e-05, + "loss": 0.9007, + "step": 3265 + }, + { + "epoch": 0.2918221011012576, + "grad_norm": 0.46025243401527405, + "learning_rate": 8.307481871380824e-05, + "loss": 1.0282, + "step": 3266 + }, + { + "epoch": 0.2919114526325195, + "grad_norm": 0.41656240820884705, + "learning_rate": 8.306396503827775e-05, + "loss": 0.9827, + "step": 3267 + }, + { + "epoch": 0.2920008041637814, + "grad_norm": 0.440974622964859, + "learning_rate": 8.305310859328777e-05, + "loss": 0.9863, + "step": 3268 + }, + { + "epoch": 0.2920901556950432, + "grad_norm": 0.3946766257286072, + "learning_rate": 8.30422493797476e-05, + "loss": 0.9565, + "step": 3269 + }, + { + "epoch": 0.2921795072263051, + "grad_norm": 0.47271499037742615, + "learning_rate": 8.303138739856684e-05, + "loss": 0.9492, + "step": 3270 + }, + { + "epoch": 0.292268858757567, + "grad_norm": 0.4621809124946594, + "learning_rate": 8.302052265065531e-05, + "loss": 0.926, + "step": 3271 + }, + { + "epoch": 0.2923582102888288, + "grad_norm": 0.4402875006198883, + "learning_rate": 8.300965513692303e-05, + "loss": 1.0155, + "step": 3272 + }, + { + "epoch": 0.2924475618200907, + "grad_norm": 0.4282824695110321, + "learning_rate": 8.299878485828028e-05, + "loss": 1.0171, + "step": 3273 + }, + { + "epoch": 0.29253691335135257, + "grad_norm": 0.47499823570251465, + "learning_rate": 8.298791181563754e-05, + "loss": 0.9616, + "step": 3274 + }, + { + "epoch": 0.29262626488261445, + "grad_norm": 0.4334074556827545, + "learning_rate": 8.297703600990556e-05, + "loss": 0.9689, + "step": 3275 + }, + { + "epoch": 0.2927156164138763, + "grad_norm": 0.42571699619293213, + "learning_rate": 8.296615744199532e-05, + "loss": 1.152, + "step": 3276 + }, + { + "epoch": 0.29280496794513816, + "grad_norm": 0.473748117685318, + "learning_rate": 8.295527611281799e-05, + "loss": 0.9805, + "step": 3277 + }, + { + "epoch": 0.29289431947640004, + "grad_norm": 0.4350266456604004, + "learning_rate": 8.2944392023285e-05, + "loss": 1.0552, + "step": 3278 + }, + { + "epoch": 0.29298367100766187, + "grad_norm": 0.43737518787384033, + "learning_rate": 8.293350517430805e-05, + "loss": 0.9802, + "step": 3279 + }, + { + "epoch": 0.29307302253892376, + "grad_norm": 0.45549145340919495, + "learning_rate": 8.292261556679897e-05, + "loss": 0.9499, + "step": 3280 + }, + { + "epoch": 0.29316237407018564, + "grad_norm": 0.4989168345928192, + "learning_rate": 8.29117232016699e-05, + "loss": 0.9162, + "step": 3281 + }, + { + "epoch": 0.2932517256014475, + "grad_norm": 0.4217113256454468, + "learning_rate": 8.29008280798332e-05, + "loss": 0.9838, + "step": 3282 + }, + { + "epoch": 0.29334107713270935, + "grad_norm": 0.382791668176651, + "learning_rate": 8.288993020220147e-05, + "loss": 1.0894, + "step": 3283 + }, + { + "epoch": 0.29343042866397123, + "grad_norm": 0.4287773072719574, + "learning_rate": 8.287902956968748e-05, + "loss": 0.9894, + "step": 3284 + }, + { + "epoch": 0.2935197801952331, + "grad_norm": 0.4374977648258209, + "learning_rate": 8.286812618320431e-05, + "loss": 0.9899, + "step": 3285 + }, + { + "epoch": 0.29360913172649494, + "grad_norm": 0.37822476029396057, + "learning_rate": 8.285722004366522e-05, + "loss": 1.0091, + "step": 3286 + }, + { + "epoch": 0.2936984832577568, + "grad_norm": 0.4243577718734741, + "learning_rate": 8.284631115198371e-05, + "loss": 0.9796, + "step": 3287 + }, + { + "epoch": 0.2937878347890187, + "grad_norm": 0.556516706943512, + "learning_rate": 8.283539950907356e-05, + "loss": 0.9766, + "step": 3288 + }, + { + "epoch": 0.29387718632028054, + "grad_norm": 0.4003793001174927, + "learning_rate": 8.282448511584866e-05, + "loss": 0.9797, + "step": 3289 + }, + { + "epoch": 0.2939665378515424, + "grad_norm": 0.39269542694091797, + "learning_rate": 8.281356797322327e-05, + "loss": 0.966, + "step": 3290 + }, + { + "epoch": 0.2940558893828043, + "grad_norm": 0.4365537464618683, + "learning_rate": 8.280264808211179e-05, + "loss": 1.0566, + "step": 3291 + }, + { + "epoch": 0.2941452409140662, + "grad_norm": 0.42626434564590454, + "learning_rate": 8.279172544342888e-05, + "loss": 0.9358, + "step": 3292 + }, + { + "epoch": 0.294234592445328, + "grad_norm": 0.5638636946678162, + "learning_rate": 8.278080005808943e-05, + "loss": 0.9842, + "step": 3293 + }, + { + "epoch": 0.2943239439765899, + "grad_norm": 0.3928448259830475, + "learning_rate": 8.276987192700856e-05, + "loss": 1.0545, + "step": 3294 + }, + { + "epoch": 0.2944132955078518, + "grad_norm": 0.4263659119606018, + "learning_rate": 8.275894105110161e-05, + "loss": 0.9396, + "step": 3295 + }, + { + "epoch": 0.2945026470391136, + "grad_norm": 0.4779483377933502, + "learning_rate": 8.274800743128417e-05, + "loss": 1.0356, + "step": 3296 + }, + { + "epoch": 0.2945919985703755, + "grad_norm": 0.3762441575527191, + "learning_rate": 8.273707106847202e-05, + "loss": 1.0131, + "step": 3297 + }, + { + "epoch": 0.2946813501016374, + "grad_norm": 0.4361181855201721, + "learning_rate": 8.272613196358124e-05, + "loss": 1.0383, + "step": 3298 + }, + { + "epoch": 0.29477070163289926, + "grad_norm": 0.3914470076560974, + "learning_rate": 8.271519011752807e-05, + "loss": 0.9641, + "step": 3299 + }, + { + "epoch": 0.2948600531641611, + "grad_norm": 0.4699605703353882, + "learning_rate": 8.270424553122899e-05, + "loss": 0.9384, + "step": 3300 + }, + { + "epoch": 0.29494940469542297, + "grad_norm": 0.4258545935153961, + "learning_rate": 8.269329820560075e-05, + "loss": 1.0225, + "step": 3301 + }, + { + "epoch": 0.29503875622668485, + "grad_norm": 0.4467248320579529, + "learning_rate": 8.26823481415603e-05, + "loss": 0.9794, + "step": 3302 + }, + { + "epoch": 0.2951281077579467, + "grad_norm": 0.49532341957092285, + "learning_rate": 8.267139534002483e-05, + "loss": 0.9664, + "step": 3303 + }, + { + "epoch": 0.29521745928920856, + "grad_norm": 0.5230326056480408, + "learning_rate": 8.266043980191175e-05, + "loss": 1.001, + "step": 3304 + }, + { + "epoch": 0.29530681082047044, + "grad_norm": 0.46619758009910583, + "learning_rate": 8.26494815281387e-05, + "loss": 0.8984, + "step": 3305 + }, + { + "epoch": 0.2953961623517323, + "grad_norm": 0.514583170413971, + "learning_rate": 8.263852051962356e-05, + "loss": 0.9236, + "step": 3306 + }, + { + "epoch": 0.29548551388299416, + "grad_norm": 0.48377975821495056, + "learning_rate": 8.26275567772844e-05, + "loss": 1.0234, + "step": 3307 + }, + { + "epoch": 0.29557486541425604, + "grad_norm": 0.3904837667942047, + "learning_rate": 8.261659030203961e-05, + "loss": 1.072, + "step": 3308 + }, + { + "epoch": 0.2956642169455179, + "grad_norm": 0.3960415720939636, + "learning_rate": 8.260562109480768e-05, + "loss": 0.9875, + "step": 3309 + }, + { + "epoch": 0.29575356847677975, + "grad_norm": 0.4395782947540283, + "learning_rate": 8.259464915650745e-05, + "loss": 0.9658, + "step": 3310 + }, + { + "epoch": 0.29584292000804163, + "grad_norm": 0.548772394657135, + "learning_rate": 8.258367448805791e-05, + "loss": 0.9851, + "step": 3311 + }, + { + "epoch": 0.2959322715393035, + "grad_norm": 0.38434407114982605, + "learning_rate": 8.257269709037832e-05, + "loss": 1.0152, + "step": 3312 + }, + { + "epoch": 0.2960216230705654, + "grad_norm": 0.44232288002967834, + "learning_rate": 8.256171696438817e-05, + "loss": 0.9907, + "step": 3313 + }, + { + "epoch": 0.2961109746018272, + "grad_norm": 0.4280986487865448, + "learning_rate": 8.25507341110071e-05, + "loss": 0.9695, + "step": 3314 + }, + { + "epoch": 0.2962003261330891, + "grad_norm": 0.5346389412879944, + "learning_rate": 8.253974853115511e-05, + "loss": 0.9704, + "step": 3315 + }, + { + "epoch": 0.296289677664351, + "grad_norm": 0.43099406361579895, + "learning_rate": 8.252876022575234e-05, + "loss": 1.0432, + "step": 3316 + }, + { + "epoch": 0.2963790291956128, + "grad_norm": 0.412399560213089, + "learning_rate": 8.251776919571915e-05, + "loss": 1.0178, + "step": 3317 + }, + { + "epoch": 0.2964683807268747, + "grad_norm": 0.40774041414260864, + "learning_rate": 8.250677544197619e-05, + "loss": 1.0056, + "step": 3318 + }, + { + "epoch": 0.2965577322581366, + "grad_norm": 0.4034601151943207, + "learning_rate": 8.249577896544427e-05, + "loss": 1.0198, + "step": 3319 + }, + { + "epoch": 0.2966470837893984, + "grad_norm": 0.46452754735946655, + "learning_rate": 8.248477976704449e-05, + "loss": 1.0138, + "step": 3320 + }, + { + "epoch": 0.2967364353206603, + "grad_norm": 0.512606143951416, + "learning_rate": 8.247377784769814e-05, + "loss": 1.0394, + "step": 3321 + }, + { + "epoch": 0.2968257868519222, + "grad_norm": 0.557276725769043, + "learning_rate": 8.246277320832676e-05, + "loss": 1.0412, + "step": 3322 + }, + { + "epoch": 0.29691513838318406, + "grad_norm": 0.4517113268375397, + "learning_rate": 8.245176584985208e-05, + "loss": 0.9373, + "step": 3323 + }, + { + "epoch": 0.2970044899144459, + "grad_norm": 0.3471103608608246, + "learning_rate": 8.24407557731961e-05, + "loss": 1.0308, + "step": 3324 + }, + { + "epoch": 0.2970938414457078, + "grad_norm": 0.48082980513572693, + "learning_rate": 8.242974297928105e-05, + "loss": 0.9768, + "step": 3325 + }, + { + "epoch": 0.29718319297696966, + "grad_norm": 0.47311216592788696, + "learning_rate": 8.241872746902935e-05, + "loss": 0.9338, + "step": 3326 + }, + { + "epoch": 0.2972725445082315, + "grad_norm": 0.5065577626228333, + "learning_rate": 8.240770924336364e-05, + "loss": 0.9229, + "step": 3327 + }, + { + "epoch": 0.29736189603949337, + "grad_norm": 0.45983678102493286, + "learning_rate": 8.239668830320686e-05, + "loss": 1.0353, + "step": 3328 + }, + { + "epoch": 0.29745124757075525, + "grad_norm": 0.5233861804008484, + "learning_rate": 8.23856646494821e-05, + "loss": 0.9412, + "step": 3329 + }, + { + "epoch": 0.29754059910201713, + "grad_norm": 0.4303194284439087, + "learning_rate": 8.237463828311272e-05, + "loss": 1.0683, + "step": 3330 + }, + { + "epoch": 0.29762995063327896, + "grad_norm": 0.4418349862098694, + "learning_rate": 8.23636092050223e-05, + "loss": 0.9342, + "step": 3331 + }, + { + "epoch": 0.29771930216454084, + "grad_norm": 0.41626089811325073, + "learning_rate": 8.235257741613463e-05, + "loss": 0.9939, + "step": 3332 + }, + { + "epoch": 0.2978086536958027, + "grad_norm": 0.41136887669563293, + "learning_rate": 8.234154291737375e-05, + "loss": 1.0507, + "step": 3333 + }, + { + "epoch": 0.29789800522706456, + "grad_norm": 0.4649127125740051, + "learning_rate": 8.233050570966392e-05, + "loss": 1.0003, + "step": 3334 + }, + { + "epoch": 0.29798735675832644, + "grad_norm": 0.43239861726760864, + "learning_rate": 8.231946579392961e-05, + "loss": 0.9848, + "step": 3335 + }, + { + "epoch": 0.2980767082895883, + "grad_norm": 0.395386278629303, + "learning_rate": 8.230842317109555e-05, + "loss": 1.0269, + "step": 3336 + }, + { + "epoch": 0.2981660598208502, + "grad_norm": 0.5006843209266663, + "learning_rate": 8.229737784208667e-05, + "loss": 1.0536, + "step": 3337 + }, + { + "epoch": 0.29825541135211203, + "grad_norm": 0.39409273862838745, + "learning_rate": 8.22863298078281e-05, + "loss": 0.9827, + "step": 3338 + }, + { + "epoch": 0.2983447628833739, + "grad_norm": 0.4018579125404358, + "learning_rate": 8.22752790692453e-05, + "loss": 1.0009, + "step": 3339 + }, + { + "epoch": 0.2984341144146358, + "grad_norm": 0.4127073585987091, + "learning_rate": 8.226422562726383e-05, + "loss": 1.0012, + "step": 3340 + }, + { + "epoch": 0.2985234659458976, + "grad_norm": 0.4360758662223816, + "learning_rate": 8.225316948280955e-05, + "loss": 1.0098, + "step": 3341 + }, + { + "epoch": 0.2986128174771595, + "grad_norm": 0.4666980803012848, + "learning_rate": 8.224211063680853e-05, + "loss": 1.0034, + "step": 3342 + }, + { + "epoch": 0.2987021690084214, + "grad_norm": 0.4008931517601013, + "learning_rate": 8.223104909018707e-05, + "loss": 1.0339, + "step": 3343 + }, + { + "epoch": 0.2987915205396833, + "grad_norm": 0.38966014981269836, + "learning_rate": 8.22199848438717e-05, + "loss": 1.0023, + "step": 3344 + }, + { + "epoch": 0.2988808720709451, + "grad_norm": 0.48258697986602783, + "learning_rate": 8.220891789878915e-05, + "loss": 0.9615, + "step": 3345 + }, + { + "epoch": 0.298970223602207, + "grad_norm": 0.4819931387901306, + "learning_rate": 8.21978482558664e-05, + "loss": 0.9779, + "step": 3346 + }, + { + "epoch": 0.29905957513346887, + "grad_norm": 0.48947110772132874, + "learning_rate": 8.218677591603066e-05, + "loss": 0.9124, + "step": 3347 + }, + { + "epoch": 0.2991489266647307, + "grad_norm": 0.39361852407455444, + "learning_rate": 8.217570088020936e-05, + "loss": 1.079, + "step": 3348 + }, + { + "epoch": 0.2992382781959926, + "grad_norm": 0.5385176539421082, + "learning_rate": 8.216462314933013e-05, + "loss": 0.9939, + "step": 3349 + }, + { + "epoch": 0.29932762972725446, + "grad_norm": 0.4513779878616333, + "learning_rate": 8.215354272432086e-05, + "loss": 0.9908, + "step": 3350 + }, + { + "epoch": 0.2994169812585163, + "grad_norm": 0.4059743285179138, + "learning_rate": 8.214245960610967e-05, + "loss": 1.02, + "step": 3351 + }, + { + "epoch": 0.2995063327897782, + "grad_norm": 0.43754130601882935, + "learning_rate": 8.213137379562485e-05, + "loss": 0.9945, + "step": 3352 + }, + { + "epoch": 0.29959568432104006, + "grad_norm": 0.414631724357605, + "learning_rate": 8.2120285293795e-05, + "loss": 1.0089, + "step": 3353 + }, + { + "epoch": 0.29968503585230194, + "grad_norm": 0.518393337726593, + "learning_rate": 8.210919410154888e-05, + "loss": 1.0141, + "step": 3354 + }, + { + "epoch": 0.29977438738356377, + "grad_norm": 0.3928556442260742, + "learning_rate": 8.209810021981548e-05, + "loss": 1.0099, + "step": 3355 + }, + { + "epoch": 0.29986373891482565, + "grad_norm": 0.4404885470867157, + "learning_rate": 8.208700364952404e-05, + "loss": 1.0929, + "step": 3356 + }, + { + "epoch": 0.29995309044608753, + "grad_norm": 0.5233496427536011, + "learning_rate": 8.207590439160404e-05, + "loss": 0.9228, + "step": 3357 + }, + { + "epoch": 0.30004244197734936, + "grad_norm": 0.35965949296951294, + "learning_rate": 8.206480244698514e-05, + "loss": 1.0296, + "step": 3358 + }, + { + "epoch": 0.30013179350861124, + "grad_norm": 0.45257434248924255, + "learning_rate": 8.205369781659724e-05, + "loss": 0.9396, + "step": 3359 + }, + { + "epoch": 0.3002211450398731, + "grad_norm": 0.4101848304271698, + "learning_rate": 8.204259050137048e-05, + "loss": 1.0811, + "step": 3360 + }, + { + "epoch": 0.300310496571135, + "grad_norm": 0.48168009519577026, + "learning_rate": 8.203148050223521e-05, + "loss": 0.9708, + "step": 3361 + }, + { + "epoch": 0.30039984810239684, + "grad_norm": 0.434770405292511, + "learning_rate": 8.202036782012203e-05, + "loss": 0.9487, + "step": 3362 + }, + { + "epoch": 0.3004891996336587, + "grad_norm": 0.3673984706401825, + "learning_rate": 8.200925245596174e-05, + "loss": 1.0221, + "step": 3363 + }, + { + "epoch": 0.3005785511649206, + "grad_norm": 0.4212033152580261, + "learning_rate": 8.199813441068535e-05, + "loss": 1.0128, + "step": 3364 + }, + { + "epoch": 0.30066790269618243, + "grad_norm": 0.42803311347961426, + "learning_rate": 8.198701368522412e-05, + "loss": 1.0614, + "step": 3365 + }, + { + "epoch": 0.3007572542274443, + "grad_norm": 0.48348864912986755, + "learning_rate": 8.197589028050956e-05, + "loss": 0.9918, + "step": 3366 + }, + { + "epoch": 0.3008466057587062, + "grad_norm": 0.455049067735672, + "learning_rate": 8.196476419747333e-05, + "loss": 1.0028, + "step": 3367 + }, + { + "epoch": 0.3009359572899681, + "grad_norm": 0.397243857383728, + "learning_rate": 8.195363543704739e-05, + "loss": 0.9751, + "step": 3368 + }, + { + "epoch": 0.3010253088212299, + "grad_norm": 0.3902703523635864, + "learning_rate": 8.194250400016388e-05, + "loss": 1.0078, + "step": 3369 + }, + { + "epoch": 0.3011146603524918, + "grad_norm": 0.41560593247413635, + "learning_rate": 8.193136988775516e-05, + "loss": 0.9526, + "step": 3370 + }, + { + "epoch": 0.3012040118837537, + "grad_norm": 0.44474300742149353, + "learning_rate": 8.192023310075387e-05, + "loss": 0.8866, + "step": 3371 + }, + { + "epoch": 0.3012933634150155, + "grad_norm": 0.34557124972343445, + "learning_rate": 8.190909364009281e-05, + "loss": 1.0232, + "step": 3372 + }, + { + "epoch": 0.3013827149462774, + "grad_norm": 0.4132451117038727, + "learning_rate": 8.1897951506705e-05, + "loss": 1.0186, + "step": 3373 + }, + { + "epoch": 0.30147206647753927, + "grad_norm": 0.4821149706840515, + "learning_rate": 8.188680670152379e-05, + "loss": 0.9414, + "step": 3374 + }, + { + "epoch": 0.30156141800880115, + "grad_norm": 0.43094414472579956, + "learning_rate": 8.18756592254826e-05, + "loss": 0.9497, + "step": 3375 + }, + { + "epoch": 0.301650769540063, + "grad_norm": 0.3759251832962036, + "learning_rate": 8.18645090795152e-05, + "loss": 1.0405, + "step": 3376 + }, + { + "epoch": 0.30174012107132486, + "grad_norm": 0.5242640376091003, + "learning_rate": 8.185335626455549e-05, + "loss": 0.9464, + "step": 3377 + }, + { + "epoch": 0.30182947260258675, + "grad_norm": 0.4868447780609131, + "learning_rate": 8.184220078153768e-05, + "loss": 0.9646, + "step": 3378 + }, + { + "epoch": 0.3019188241338486, + "grad_norm": 0.441510945558548, + "learning_rate": 8.183104263139613e-05, + "loss": 0.9802, + "step": 3379 + }, + { + "epoch": 0.30200817566511046, + "grad_norm": 0.39666980504989624, + "learning_rate": 8.181988181506546e-05, + "loss": 1.0346, + "step": 3380 + }, + { + "epoch": 0.30209752719637234, + "grad_norm": 0.3387872874736786, + "learning_rate": 8.180871833348052e-05, + "loss": 0.9721, + "step": 3381 + }, + { + "epoch": 0.30218687872763417, + "grad_norm": 0.4515991508960724, + "learning_rate": 8.179755218757636e-05, + "loss": 0.9902, + "step": 3382 + }, + { + "epoch": 0.30227623025889605, + "grad_norm": 0.4831897020339966, + "learning_rate": 8.178638337828828e-05, + "loss": 0.9039, + "step": 3383 + }, + { + "epoch": 0.30236558179015793, + "grad_norm": 0.44388824701309204, + "learning_rate": 8.177521190655178e-05, + "loss": 0.9844, + "step": 3384 + }, + { + "epoch": 0.3024549333214198, + "grad_norm": 0.495235800743103, + "learning_rate": 8.176403777330255e-05, + "loss": 0.9928, + "step": 3385 + }, + { + "epoch": 0.30254428485268164, + "grad_norm": 0.5274266600608826, + "learning_rate": 8.17528609794766e-05, + "loss": 1.0191, + "step": 3386 + }, + { + "epoch": 0.3026336363839435, + "grad_norm": 0.3730257749557495, + "learning_rate": 8.174168152601007e-05, + "loss": 0.9684, + "step": 3387 + }, + { + "epoch": 0.3027229879152054, + "grad_norm": 0.45568758249282837, + "learning_rate": 8.173049941383938e-05, + "loss": 0.9999, + "step": 3388 + }, + { + "epoch": 0.30281233944646724, + "grad_norm": 0.4109109342098236, + "learning_rate": 8.171931464390115e-05, + "loss": 1.0108, + "step": 3389 + }, + { + "epoch": 0.3029016909777291, + "grad_norm": 0.4820617735385895, + "learning_rate": 8.170812721713218e-05, + "loss": 1.0419, + "step": 3390 + }, + { + "epoch": 0.302991042508991, + "grad_norm": 0.4437454044818878, + "learning_rate": 8.169693713446959e-05, + "loss": 0.9256, + "step": 3391 + }, + { + "epoch": 0.3030803940402529, + "grad_norm": 0.4973864257335663, + "learning_rate": 8.168574439685067e-05, + "loss": 0.9822, + "step": 3392 + }, + { + "epoch": 0.3031697455715147, + "grad_norm": 0.41152751445770264, + "learning_rate": 8.167454900521289e-05, + "loss": 1.02, + "step": 3393 + }, + { + "epoch": 0.3032590971027766, + "grad_norm": 0.42877259850502014, + "learning_rate": 8.1663350960494e-05, + "loss": 1.0445, + "step": 3394 + }, + { + "epoch": 0.3033484486340385, + "grad_norm": 0.5001733899116516, + "learning_rate": 8.165215026363198e-05, + "loss": 0.996, + "step": 3395 + }, + { + "epoch": 0.3034378001653003, + "grad_norm": 0.43617990612983704, + "learning_rate": 8.164094691556496e-05, + "loss": 0.9811, + "step": 3396 + }, + { + "epoch": 0.3035271516965622, + "grad_norm": 0.4765497148036957, + "learning_rate": 8.162974091723139e-05, + "loss": 0.976, + "step": 3397 + }, + { + "epoch": 0.3036165032278241, + "grad_norm": 0.45245134830474854, + "learning_rate": 8.161853226956985e-05, + "loss": 1.0029, + "step": 3398 + }, + { + "epoch": 0.30370585475908596, + "grad_norm": 0.4027976989746094, + "learning_rate": 8.160732097351924e-05, + "loss": 1.0238, + "step": 3399 + }, + { + "epoch": 0.3037952062903478, + "grad_norm": 0.4003666341304779, + "learning_rate": 8.159610703001857e-05, + "loss": 1.0662, + "step": 3400 + }, + { + "epoch": 0.30388455782160967, + "grad_norm": 0.4242064952850342, + "learning_rate": 8.158489044000713e-05, + "loss": 1.0077, + "step": 3401 + }, + { + "epoch": 0.30397390935287155, + "grad_norm": 0.3776360750198364, + "learning_rate": 8.157367120442445e-05, + "loss": 1.0848, + "step": 3402 + }, + { + "epoch": 0.3040632608841334, + "grad_norm": 0.40885689854621887, + "learning_rate": 8.156244932421026e-05, + "loss": 0.9957, + "step": 3403 + }, + { + "epoch": 0.30415261241539526, + "grad_norm": 0.44386911392211914, + "learning_rate": 8.155122480030453e-05, + "loss": 1.0783, + "step": 3404 + }, + { + "epoch": 0.30424196394665715, + "grad_norm": 0.38162335753440857, + "learning_rate": 8.153999763364738e-05, + "loss": 1.0232, + "step": 3405 + }, + { + "epoch": 0.30433131547791903, + "grad_norm": 0.3886288106441498, + "learning_rate": 8.152876782517925e-05, + "loss": 1.0595, + "step": 3406 + }, + { + "epoch": 0.30442066700918086, + "grad_norm": 0.43489503860473633, + "learning_rate": 8.151753537584073e-05, + "loss": 1.0397, + "step": 3407 + }, + { + "epoch": 0.30451001854044274, + "grad_norm": 0.4794921278953552, + "learning_rate": 8.150630028657267e-05, + "loss": 0.9464, + "step": 3408 + }, + { + "epoch": 0.3045993700717046, + "grad_norm": 0.5062465071678162, + "learning_rate": 8.149506255831614e-05, + "loss": 0.9363, + "step": 3409 + }, + { + "epoch": 0.30468872160296645, + "grad_norm": 0.4547816514968872, + "learning_rate": 8.148382219201241e-05, + "loss": 0.981, + "step": 3410 + }, + { + "epoch": 0.30477807313422833, + "grad_norm": 0.43893638253211975, + "learning_rate": 8.147257918860296e-05, + "loss": 1.0097, + "step": 3411 + }, + { + "epoch": 0.3048674246654902, + "grad_norm": 0.4645942151546478, + "learning_rate": 8.146133354902954e-05, + "loss": 1.0032, + "step": 3412 + }, + { + "epoch": 0.3049567761967521, + "grad_norm": 0.39374154806137085, + "learning_rate": 8.145008527423409e-05, + "loss": 1.0049, + "step": 3413 + }, + { + "epoch": 0.3050461277280139, + "grad_norm": 0.4022867977619171, + "learning_rate": 8.143883436515875e-05, + "loss": 0.9876, + "step": 3414 + }, + { + "epoch": 0.3051354792592758, + "grad_norm": 0.4411511719226837, + "learning_rate": 8.142758082274593e-05, + "loss": 0.9858, + "step": 3415 + }, + { + "epoch": 0.3052248307905377, + "grad_norm": 0.4204576909542084, + "learning_rate": 8.141632464793822e-05, + "loss": 0.9826, + "step": 3416 + }, + { + "epoch": 0.3053141823217995, + "grad_norm": 0.40963998436927795, + "learning_rate": 8.140506584167845e-05, + "loss": 0.9313, + "step": 3417 + }, + { + "epoch": 0.3054035338530614, + "grad_norm": 0.40934035181999207, + "learning_rate": 8.139380440490965e-05, + "loss": 1.037, + "step": 3418 + }, + { + "epoch": 0.3054928853843233, + "grad_norm": 0.3806384801864624, + "learning_rate": 8.13825403385751e-05, + "loss": 0.9968, + "step": 3419 + }, + { + "epoch": 0.3055822369155851, + "grad_norm": 0.3986847400665283, + "learning_rate": 8.137127364361828e-05, + "loss": 0.9992, + "step": 3420 + }, + { + "epoch": 0.305671588446847, + "grad_norm": 0.44412630796432495, + "learning_rate": 8.136000432098291e-05, + "loss": 1.0355, + "step": 3421 + }, + { + "epoch": 0.3057609399781089, + "grad_norm": 0.43642017245292664, + "learning_rate": 8.13487323716129e-05, + "loss": 1.0421, + "step": 3422 + }, + { + "epoch": 0.30585029150937076, + "grad_norm": 0.39217737317085266, + "learning_rate": 8.133745779645238e-05, + "loss": 1.0206, + "step": 3423 + }, + { + "epoch": 0.3059396430406326, + "grad_norm": 0.456626832485199, + "learning_rate": 8.132618059644577e-05, + "loss": 1.0341, + "step": 3424 + }, + { + "epoch": 0.3060289945718945, + "grad_norm": 0.5178848505020142, + "learning_rate": 8.13149007725376e-05, + "loss": 0.9424, + "step": 3425 + }, + { + "epoch": 0.30611834610315636, + "grad_norm": 0.49753326177597046, + "learning_rate": 8.13036183256727e-05, + "loss": 1.0006, + "step": 3426 + }, + { + "epoch": 0.3062076976344182, + "grad_norm": 0.48190826177597046, + "learning_rate": 8.129233325679611e-05, + "loss": 0.9446, + "step": 3427 + }, + { + "epoch": 0.30629704916568007, + "grad_norm": 0.431306928396225, + "learning_rate": 8.128104556685305e-05, + "loss": 0.9653, + "step": 3428 + }, + { + "epoch": 0.30638640069694195, + "grad_norm": 0.44903165102005005, + "learning_rate": 8.126975525678898e-05, + "loss": 1.0018, + "step": 3429 + }, + { + "epoch": 0.30647575222820383, + "grad_norm": 0.4540453553199768, + "learning_rate": 8.12584623275496e-05, + "loss": 0.9977, + "step": 3430 + }, + { + "epoch": 0.30656510375946566, + "grad_norm": 0.40107932686805725, + "learning_rate": 8.124716678008082e-05, + "loss": 1.058, + "step": 3431 + }, + { + "epoch": 0.30665445529072755, + "grad_norm": 0.3945641815662384, + "learning_rate": 8.123586861532873e-05, + "loss": 1.0387, + "step": 3432 + }, + { + "epoch": 0.30674380682198943, + "grad_norm": 0.45806485414505005, + "learning_rate": 8.12245678342397e-05, + "loss": 1.0011, + "step": 3433 + }, + { + "epoch": 0.30683315835325126, + "grad_norm": 0.49401962757110596, + "learning_rate": 8.12132644377603e-05, + "loss": 1.005, + "step": 3434 + }, + { + "epoch": 0.30692250988451314, + "grad_norm": 0.44378378987312317, + "learning_rate": 8.120195842683728e-05, + "loss": 0.9761, + "step": 3435 + }, + { + "epoch": 0.307011861415775, + "grad_norm": 0.4618094563484192, + "learning_rate": 8.119064980241766e-05, + "loss": 0.9972, + "step": 3436 + }, + { + "epoch": 0.3071012129470369, + "grad_norm": 0.4815685749053955, + "learning_rate": 8.117933856544864e-05, + "loss": 0.9633, + "step": 3437 + }, + { + "epoch": 0.30719056447829873, + "grad_norm": 0.43568116426467896, + "learning_rate": 8.116802471687768e-05, + "loss": 0.9158, + "step": 3438 + }, + { + "epoch": 0.3072799160095606, + "grad_norm": 0.4017315208911896, + "learning_rate": 8.11567082576524e-05, + "loss": 1.0004, + "step": 3439 + }, + { + "epoch": 0.3073692675408225, + "grad_norm": 0.43449345231056213, + "learning_rate": 8.114538918872072e-05, + "loss": 1.0021, + "step": 3440 + }, + { + "epoch": 0.3074586190720843, + "grad_norm": 0.37431442737579346, + "learning_rate": 8.11340675110307e-05, + "loss": 1.0407, + "step": 3441 + }, + { + "epoch": 0.3075479706033462, + "grad_norm": 0.42223483324050903, + "learning_rate": 8.112274322553067e-05, + "loss": 0.9816, + "step": 3442 + }, + { + "epoch": 0.3076373221346081, + "grad_norm": 0.5260857939720154, + "learning_rate": 8.111141633316914e-05, + "loss": 0.9194, + "step": 3443 + }, + { + "epoch": 0.30772667366587, + "grad_norm": 0.4374569356441498, + "learning_rate": 8.110008683489487e-05, + "loss": 1.0595, + "step": 3444 + }, + { + "epoch": 0.3078160251971318, + "grad_norm": 0.45745593309402466, + "learning_rate": 8.108875473165683e-05, + "loss": 0.988, + "step": 3445 + }, + { + "epoch": 0.3079053767283937, + "grad_norm": 0.42707595229148865, + "learning_rate": 8.10774200244042e-05, + "loss": 1.0022, + "step": 3446 + }, + { + "epoch": 0.30799472825965557, + "grad_norm": 0.4703771471977234, + "learning_rate": 8.106608271408638e-05, + "loss": 1.0088, + "step": 3447 + }, + { + "epoch": 0.3080840797909174, + "grad_norm": 0.5333291292190552, + "learning_rate": 8.105474280165298e-05, + "loss": 0.994, + "step": 3448 + }, + { + "epoch": 0.3081734313221793, + "grad_norm": 0.3782300055027008, + "learning_rate": 8.104340028805386e-05, + "loss": 1.0284, + "step": 3449 + }, + { + "epoch": 0.30826278285344116, + "grad_norm": 0.5105333924293518, + "learning_rate": 8.103205517423906e-05, + "loss": 0.9954, + "step": 3450 + }, + { + "epoch": 0.308352134384703, + "grad_norm": 0.37891459465026855, + "learning_rate": 8.102070746115888e-05, + "loss": 0.9678, + "step": 3451 + }, + { + "epoch": 0.3084414859159649, + "grad_norm": 0.47059276700019836, + "learning_rate": 8.100935714976377e-05, + "loss": 0.9004, + "step": 3452 + }, + { + "epoch": 0.30853083744722676, + "grad_norm": 0.38149774074554443, + "learning_rate": 8.099800424100448e-05, + "loss": 0.9746, + "step": 3453 + }, + { + "epoch": 0.30862018897848864, + "grad_norm": 0.4379188120365143, + "learning_rate": 8.09866487358319e-05, + "loss": 1.02, + "step": 3454 + }, + { + "epoch": 0.30870954050975047, + "grad_norm": 0.4322841465473175, + "learning_rate": 8.097529063519723e-05, + "loss": 0.9619, + "step": 3455 + }, + { + "epoch": 0.30879889204101235, + "grad_norm": 0.4443778097629547, + "learning_rate": 8.096392994005177e-05, + "loss": 0.9386, + "step": 3456 + }, + { + "epoch": 0.30888824357227423, + "grad_norm": 0.4804965853691101, + "learning_rate": 8.095256665134712e-05, + "loss": 0.9817, + "step": 3457 + }, + { + "epoch": 0.30897759510353606, + "grad_norm": 0.5008002519607544, + "learning_rate": 8.094120077003509e-05, + "loss": 0.9661, + "step": 3458 + }, + { + "epoch": 0.30906694663479795, + "grad_norm": 0.4188991189002991, + "learning_rate": 8.092983229706767e-05, + "loss": 0.9905, + "step": 3459 + }, + { + "epoch": 0.30915629816605983, + "grad_norm": 0.3990739583969116, + "learning_rate": 8.091846123339715e-05, + "loss": 1.0172, + "step": 3460 + }, + { + "epoch": 0.3092456496973217, + "grad_norm": 0.43267446756362915, + "learning_rate": 8.090708757997591e-05, + "loss": 1.0371, + "step": 3461 + }, + { + "epoch": 0.30933500122858354, + "grad_norm": 0.42393234372138977, + "learning_rate": 8.089571133775663e-05, + "loss": 1.0194, + "step": 3462 + }, + { + "epoch": 0.3094243527598454, + "grad_norm": 0.4767492711544037, + "learning_rate": 8.08843325076922e-05, + "loss": 1.0495, + "step": 3463 + }, + { + "epoch": 0.3095137042911073, + "grad_norm": 0.42636582255363464, + "learning_rate": 8.087295109073574e-05, + "loss": 1.0434, + "step": 3464 + }, + { + "epoch": 0.30960305582236913, + "grad_norm": 0.49471211433410645, + "learning_rate": 8.086156708784054e-05, + "loss": 0.9261, + "step": 3465 + }, + { + "epoch": 0.309692407353631, + "grad_norm": 0.4437471628189087, + "learning_rate": 8.085018049996011e-05, + "loss": 1.0086, + "step": 3466 + }, + { + "epoch": 0.3097817588848929, + "grad_norm": 0.44580212235450745, + "learning_rate": 8.083879132804826e-05, + "loss": 0.9378, + "step": 3467 + }, + { + "epoch": 0.3098711104161548, + "grad_norm": 0.4966757595539093, + "learning_rate": 8.082739957305889e-05, + "loss": 0.9868, + "step": 3468 + }, + { + "epoch": 0.3099604619474166, + "grad_norm": 0.5008419752120972, + "learning_rate": 8.081600523594622e-05, + "loss": 0.9713, + "step": 3469 + }, + { + "epoch": 0.3100498134786785, + "grad_norm": 0.44220054149627686, + "learning_rate": 8.080460831766464e-05, + "loss": 0.9581, + "step": 3470 + }, + { + "epoch": 0.3101391650099404, + "grad_norm": 0.47498950362205505, + "learning_rate": 8.079320881916877e-05, + "loss": 1.0412, + "step": 3471 + }, + { + "epoch": 0.3102285165412022, + "grad_norm": 0.48593392968177795, + "learning_rate": 8.07818067414134e-05, + "loss": 0.9469, + "step": 3472 + }, + { + "epoch": 0.3103178680724641, + "grad_norm": 0.3928394019603729, + "learning_rate": 8.077040208535362e-05, + "loss": 0.9524, + "step": 3473 + }, + { + "epoch": 0.31040721960372597, + "grad_norm": 0.38281822204589844, + "learning_rate": 8.075899485194467e-05, + "loss": 1.0437, + "step": 3474 + }, + { + "epoch": 0.31049657113498785, + "grad_norm": 0.408634752035141, + "learning_rate": 8.074758504214206e-05, + "loss": 0.9995, + "step": 3475 + }, + { + "epoch": 0.3105859226662497, + "grad_norm": 0.4135921001434326, + "learning_rate": 8.073617265690144e-05, + "loss": 1.0512, + "step": 3476 + }, + { + "epoch": 0.31067527419751156, + "grad_norm": 0.45093652606010437, + "learning_rate": 8.072475769717872e-05, + "loss": 0.9089, + "step": 3477 + }, + { + "epoch": 0.31076462572877345, + "grad_norm": 0.38925284147262573, + "learning_rate": 8.071334016393006e-05, + "loss": 0.958, + "step": 3478 + }, + { + "epoch": 0.3108539772600353, + "grad_norm": 0.4077800512313843, + "learning_rate": 8.070192005811177e-05, + "loss": 1.0573, + "step": 3479 + }, + { + "epoch": 0.31094332879129716, + "grad_norm": 0.48132577538490295, + "learning_rate": 8.069049738068041e-05, + "loss": 0.899, + "step": 3480 + }, + { + "epoch": 0.31103268032255904, + "grad_norm": 0.4954333007335663, + "learning_rate": 8.067907213259278e-05, + "loss": 0.9735, + "step": 3481 + }, + { + "epoch": 0.31112203185382087, + "grad_norm": 0.3865114152431488, + "learning_rate": 8.066764431480583e-05, + "loss": 0.9941, + "step": 3482 + }, + { + "epoch": 0.31121138338508275, + "grad_norm": 0.3530360460281372, + "learning_rate": 8.065621392827678e-05, + "loss": 1.0592, + "step": 3483 + }, + { + "epoch": 0.31130073491634463, + "grad_norm": 0.4171468913555145, + "learning_rate": 8.064478097396304e-05, + "loss": 0.9654, + "step": 3484 + }, + { + "epoch": 0.3113900864476065, + "grad_norm": 0.508972704410553, + "learning_rate": 8.063334545282224e-05, + "loss": 0.8936, + "step": 3485 + }, + { + "epoch": 0.31147943797886835, + "grad_norm": 0.3663281500339508, + "learning_rate": 8.062190736581223e-05, + "loss": 1.0223, + "step": 3486 + }, + { + "epoch": 0.31156878951013023, + "grad_norm": 0.4794585704803467, + "learning_rate": 8.061046671389107e-05, + "loss": 0.9199, + "step": 3487 + }, + { + "epoch": 0.3116581410413921, + "grad_norm": 0.4471113383769989, + "learning_rate": 8.059902349801704e-05, + "loss": 0.9633, + "step": 3488 + }, + { + "epoch": 0.31174749257265394, + "grad_norm": 0.3974318504333496, + "learning_rate": 8.058757771914865e-05, + "loss": 0.9769, + "step": 3489 + }, + { + "epoch": 0.3118368441039158, + "grad_norm": 0.4705379605293274, + "learning_rate": 8.057612937824456e-05, + "loss": 0.9843, + "step": 3490 + }, + { + "epoch": 0.3119261956351777, + "grad_norm": 0.3952665328979492, + "learning_rate": 8.056467847626373e-05, + "loss": 1.0787, + "step": 3491 + }, + { + "epoch": 0.3120155471664396, + "grad_norm": 0.451646089553833, + "learning_rate": 8.055322501416527e-05, + "loss": 1.0125, + "step": 3492 + }, + { + "epoch": 0.3121048986977014, + "grad_norm": 0.4705079197883606, + "learning_rate": 8.054176899290855e-05, + "loss": 0.9875, + "step": 3493 + }, + { + "epoch": 0.3121942502289633, + "grad_norm": 0.4389439523220062, + "learning_rate": 8.053031041345312e-05, + "loss": 1.0343, + "step": 3494 + }, + { + "epoch": 0.3122836017602252, + "grad_norm": 0.513843834400177, + "learning_rate": 8.051884927675877e-05, + "loss": 0.9618, + "step": 3495 + }, + { + "epoch": 0.312372953291487, + "grad_norm": 0.4346451163291931, + "learning_rate": 8.050738558378549e-05, + "loss": 1.0277, + "step": 3496 + }, + { + "epoch": 0.3124623048227489, + "grad_norm": 0.5713686347007751, + "learning_rate": 8.049591933549347e-05, + "loss": 0.9842, + "step": 3497 + }, + { + "epoch": 0.3125516563540108, + "grad_norm": 0.4521034359931946, + "learning_rate": 8.048445053284315e-05, + "loss": 0.9809, + "step": 3498 + }, + { + "epoch": 0.31264100788527266, + "grad_norm": 0.4172121584415436, + "learning_rate": 8.047297917679515e-05, + "loss": 1.0247, + "step": 3499 + }, + { + "epoch": 0.3127303594165345, + "grad_norm": 0.4868249297142029, + "learning_rate": 8.046150526831033e-05, + "loss": 0.9791, + "step": 3500 + }, + { + "epoch": 0.31281971094779637, + "grad_norm": 0.34857162833213806, + "learning_rate": 8.045002880834975e-05, + "loss": 1.0506, + "step": 3501 + }, + { + "epoch": 0.31290906247905825, + "grad_norm": 0.4393348693847656, + "learning_rate": 8.043854979787467e-05, + "loss": 1.0026, + "step": 3502 + }, + { + "epoch": 0.3129984140103201, + "grad_norm": 0.38047170639038086, + "learning_rate": 8.04270682378466e-05, + "loss": 1.0297, + "step": 3503 + }, + { + "epoch": 0.31308776554158196, + "grad_norm": 0.39295217394828796, + "learning_rate": 8.041558412922724e-05, + "loss": 0.997, + "step": 3504 + }, + { + "epoch": 0.31317711707284385, + "grad_norm": 0.4318108856678009, + "learning_rate": 8.04040974729785e-05, + "loss": 0.9394, + "step": 3505 + }, + { + "epoch": 0.31326646860410573, + "grad_norm": 0.41878437995910645, + "learning_rate": 8.039260827006252e-05, + "loss": 1.012, + "step": 3506 + }, + { + "epoch": 0.31335582013536756, + "grad_norm": 0.3880411982536316, + "learning_rate": 8.038111652144163e-05, + "loss": 0.962, + "step": 3507 + }, + { + "epoch": 0.31344517166662944, + "grad_norm": 0.39992988109588623, + "learning_rate": 8.036962222807838e-05, + "loss": 0.9838, + "step": 3508 + }, + { + "epoch": 0.3135345231978913, + "grad_norm": 0.3804689347743988, + "learning_rate": 8.035812539093557e-05, + "loss": 1.0372, + "step": 3509 + }, + { + "epoch": 0.31362387472915315, + "grad_norm": 0.4325745403766632, + "learning_rate": 8.034662601097615e-05, + "loss": 1.032, + "step": 3510 + }, + { + "epoch": 0.31371322626041503, + "grad_norm": 0.46068939566612244, + "learning_rate": 8.033512408916334e-05, + "loss": 0.9907, + "step": 3511 + }, + { + "epoch": 0.3138025777916769, + "grad_norm": 0.5449123382568359, + "learning_rate": 8.032361962646053e-05, + "loss": 0.9458, + "step": 3512 + }, + { + "epoch": 0.31389192932293875, + "grad_norm": 0.4366675913333893, + "learning_rate": 8.031211262383136e-05, + "loss": 1.0291, + "step": 3513 + }, + { + "epoch": 0.31398128085420063, + "grad_norm": 0.4750640392303467, + "learning_rate": 8.030060308223964e-05, + "loss": 0.9661, + "step": 3514 + }, + { + "epoch": 0.3140706323854625, + "grad_norm": 0.365343302488327, + "learning_rate": 8.028909100264943e-05, + "loss": 1.0437, + "step": 3515 + }, + { + "epoch": 0.3141599839167244, + "grad_norm": 0.580810546875, + "learning_rate": 8.0277576386025e-05, + "loss": 0.9558, + "step": 3516 + }, + { + "epoch": 0.3142493354479862, + "grad_norm": 0.5106691718101501, + "learning_rate": 8.026605923333081e-05, + "loss": 0.9663, + "step": 3517 + }, + { + "epoch": 0.3143386869792481, + "grad_norm": 0.48174557089805603, + "learning_rate": 8.025453954553155e-05, + "loss": 0.9992, + "step": 3518 + }, + { + "epoch": 0.31442803851051, + "grad_norm": 0.41393786668777466, + "learning_rate": 8.024301732359212e-05, + "loss": 1.0187, + "step": 3519 + }, + { + "epoch": 0.3145173900417718, + "grad_norm": 0.4682253897190094, + "learning_rate": 8.02314925684776e-05, + "loss": 0.9622, + "step": 3520 + }, + { + "epoch": 0.3146067415730337, + "grad_norm": 0.4221019446849823, + "learning_rate": 8.021996528115335e-05, + "loss": 1.0434, + "step": 3521 + }, + { + "epoch": 0.3146960931042956, + "grad_norm": 0.3874817490577698, + "learning_rate": 8.020843546258487e-05, + "loss": 0.9968, + "step": 3522 + }, + { + "epoch": 0.31478544463555747, + "grad_norm": 0.49449774622917175, + "learning_rate": 8.019690311373793e-05, + "loss": 0.9772, + "step": 3523 + }, + { + "epoch": 0.3148747961668193, + "grad_norm": 0.4564272463321686, + "learning_rate": 8.018536823557848e-05, + "loss": 0.9736, + "step": 3524 + }, + { + "epoch": 0.3149641476980812, + "grad_norm": 0.3975536525249481, + "learning_rate": 8.017383082907269e-05, + "loss": 0.988, + "step": 3525 + }, + { + "epoch": 0.31505349922934306, + "grad_norm": 0.42857542634010315, + "learning_rate": 8.016229089518694e-05, + "loss": 0.9846, + "step": 3526 + }, + { + "epoch": 0.3151428507606049, + "grad_norm": 0.34974923729896545, + "learning_rate": 8.01507484348878e-05, + "loss": 0.9828, + "step": 3527 + }, + { + "epoch": 0.31523220229186677, + "grad_norm": 0.39053013920783997, + "learning_rate": 8.013920344914212e-05, + "loss": 1.0341, + "step": 3528 + }, + { + "epoch": 0.31532155382312865, + "grad_norm": 0.44937631487846375, + "learning_rate": 8.012765593891688e-05, + "loss": 0.9921, + "step": 3529 + }, + { + "epoch": 0.31541090535439054, + "grad_norm": 0.5252249836921692, + "learning_rate": 8.011610590517932e-05, + "loss": 0.9491, + "step": 3530 + }, + { + "epoch": 0.31550025688565236, + "grad_norm": 0.3755653500556946, + "learning_rate": 8.010455334889689e-05, + "loss": 1.0336, + "step": 3531 + }, + { + "epoch": 0.31558960841691425, + "grad_norm": 0.47862106561660767, + "learning_rate": 8.00929982710372e-05, + "loss": 1.0282, + "step": 3532 + }, + { + "epoch": 0.31567895994817613, + "grad_norm": 0.47411367297172546, + "learning_rate": 8.008144067256815e-05, + "loss": 0.8744, + "step": 3533 + }, + { + "epoch": 0.31576831147943796, + "grad_norm": 0.41446375846862793, + "learning_rate": 8.006988055445778e-05, + "loss": 0.9799, + "step": 3534 + }, + { + "epoch": 0.31585766301069984, + "grad_norm": 0.45246684551239014, + "learning_rate": 8.00583179176744e-05, + "loss": 0.9723, + "step": 3535 + }, + { + "epoch": 0.3159470145419617, + "grad_norm": 0.46888962388038635, + "learning_rate": 8.004675276318651e-05, + "loss": 0.9581, + "step": 3536 + }, + { + "epoch": 0.3160363660732236, + "grad_norm": 0.4019709527492523, + "learning_rate": 8.00351850919628e-05, + "loss": 0.982, + "step": 3537 + }, + { + "epoch": 0.31612571760448543, + "grad_norm": 0.4450971186161041, + "learning_rate": 8.002361490497217e-05, + "loss": 0.9854, + "step": 3538 + }, + { + "epoch": 0.3162150691357473, + "grad_norm": 0.5295263528823853, + "learning_rate": 8.001204220318377e-05, + "loss": 0.9173, + "step": 3539 + }, + { + "epoch": 0.3163044206670092, + "grad_norm": 0.47927385568618774, + "learning_rate": 8.000046698756694e-05, + "loss": 0.8995, + "step": 3540 + }, + { + "epoch": 0.31639377219827103, + "grad_norm": 0.6096744537353516, + "learning_rate": 7.99888892590912e-05, + "loss": 0.9354, + "step": 3541 + }, + { + "epoch": 0.3164831237295329, + "grad_norm": 0.40292373299598694, + "learning_rate": 7.997730901872635e-05, + "loss": 1.0007, + "step": 3542 + }, + { + "epoch": 0.3165724752607948, + "grad_norm": 0.532189667224884, + "learning_rate": 7.996572626744232e-05, + "loss": 1.0307, + "step": 3543 + }, + { + "epoch": 0.3166618267920566, + "grad_norm": 0.5631407499313354, + "learning_rate": 7.99541410062093e-05, + "loss": 0.9184, + "step": 3544 + }, + { + "epoch": 0.3167511783233185, + "grad_norm": 0.4307250678539276, + "learning_rate": 7.994255323599769e-05, + "loss": 0.9451, + "step": 3545 + }, + { + "epoch": 0.3168405298545804, + "grad_norm": 0.4229245185852051, + "learning_rate": 7.993096295777807e-05, + "loss": 1.0172, + "step": 3546 + }, + { + "epoch": 0.31692988138584227, + "grad_norm": 0.4741484522819519, + "learning_rate": 7.991937017252126e-05, + "loss": 0.9643, + "step": 3547 + }, + { + "epoch": 0.3170192329171041, + "grad_norm": 0.46456822752952576, + "learning_rate": 7.990777488119829e-05, + "loss": 1.0751, + "step": 3548 + }, + { + "epoch": 0.317108584448366, + "grad_norm": 0.4328577518463135, + "learning_rate": 7.989617708478039e-05, + "loss": 0.9881, + "step": 3549 + }, + { + "epoch": 0.31719793597962787, + "grad_norm": 0.48607033491134644, + "learning_rate": 7.988457678423898e-05, + "loss": 1.0788, + "step": 3550 + }, + { + "epoch": 0.3172872875108897, + "grad_norm": 0.4115293622016907, + "learning_rate": 7.987297398054572e-05, + "loss": 1.005, + "step": 3551 + }, + { + "epoch": 0.3173766390421516, + "grad_norm": 0.4423523247241974, + "learning_rate": 7.986136867467247e-05, + "loss": 1.0001, + "step": 3552 + }, + { + "epoch": 0.31746599057341346, + "grad_norm": 0.4115248918533325, + "learning_rate": 7.984976086759128e-05, + "loss": 1.009, + "step": 3553 + }, + { + "epoch": 0.31755534210467534, + "grad_norm": 0.4350251853466034, + "learning_rate": 7.983815056027444e-05, + "loss": 0.9741, + "step": 3554 + }, + { + "epoch": 0.31764469363593717, + "grad_norm": 0.42540842294692993, + "learning_rate": 7.982653775369444e-05, + "loss": 0.9493, + "step": 3555 + }, + { + "epoch": 0.31773404516719905, + "grad_norm": 0.4265209436416626, + "learning_rate": 7.981492244882398e-05, + "loss": 0.9707, + "step": 3556 + }, + { + "epoch": 0.31782339669846094, + "grad_norm": 0.4104164242744446, + "learning_rate": 7.980330464663597e-05, + "loss": 1.0317, + "step": 3557 + }, + { + "epoch": 0.31791274822972276, + "grad_norm": 0.39542466402053833, + "learning_rate": 7.979168434810352e-05, + "loss": 0.9764, + "step": 3558 + }, + { + "epoch": 0.31800209976098465, + "grad_norm": 0.3717448115348816, + "learning_rate": 7.978006155419993e-05, + "loss": 0.9869, + "step": 3559 + }, + { + "epoch": 0.31809145129224653, + "grad_norm": 0.42824071645736694, + "learning_rate": 7.976843626589875e-05, + "loss": 0.9522, + "step": 3560 + }, + { + "epoch": 0.3181808028235084, + "grad_norm": 0.5165953636169434, + "learning_rate": 7.975680848417373e-05, + "loss": 1.0895, + "step": 3561 + }, + { + "epoch": 0.31827015435477024, + "grad_norm": 0.4496411681175232, + "learning_rate": 7.974517820999883e-05, + "loss": 0.9996, + "step": 3562 + }, + { + "epoch": 0.3183595058860321, + "grad_norm": 0.41469454765319824, + "learning_rate": 7.973354544434818e-05, + "loss": 0.9418, + "step": 3563 + }, + { + "epoch": 0.318448857417294, + "grad_norm": 0.4479805529117584, + "learning_rate": 7.972191018819615e-05, + "loss": 0.986, + "step": 3564 + }, + { + "epoch": 0.31853820894855583, + "grad_norm": 0.5563523173332214, + "learning_rate": 7.971027244251734e-05, + "loss": 0.9081, + "step": 3565 + }, + { + "epoch": 0.3186275604798177, + "grad_norm": 0.44090694189071655, + "learning_rate": 7.969863220828654e-05, + "loss": 0.968, + "step": 3566 + }, + { + "epoch": 0.3187169120110796, + "grad_norm": 0.4855956435203552, + "learning_rate": 7.968698948647872e-05, + "loss": 0.8986, + "step": 3567 + }, + { + "epoch": 0.3188062635423415, + "grad_norm": 0.5776668787002563, + "learning_rate": 7.967534427806909e-05, + "loss": 1.0122, + "step": 3568 + }, + { + "epoch": 0.3188956150736033, + "grad_norm": 0.487651526927948, + "learning_rate": 7.966369658403305e-05, + "loss": 0.9403, + "step": 3569 + }, + { + "epoch": 0.3189849666048652, + "grad_norm": 0.5066558122634888, + "learning_rate": 7.965204640534623e-05, + "loss": 0.9553, + "step": 3570 + }, + { + "epoch": 0.3190743181361271, + "grad_norm": 0.47500649094581604, + "learning_rate": 7.964039374298447e-05, + "loss": 0.9324, + "step": 3571 + }, + { + "epoch": 0.3191636696673889, + "grad_norm": 0.3772512376308441, + "learning_rate": 7.962873859792377e-05, + "loss": 0.9869, + "step": 3572 + }, + { + "epoch": 0.3192530211986508, + "grad_norm": 0.44486117362976074, + "learning_rate": 7.96170809711404e-05, + "loss": 1.0367, + "step": 3573 + }, + { + "epoch": 0.31934237272991267, + "grad_norm": 0.6170856356620789, + "learning_rate": 7.96054208636108e-05, + "loss": 1.091, + "step": 3574 + }, + { + "epoch": 0.3194317242611745, + "grad_norm": 0.46623945236206055, + "learning_rate": 7.959375827631165e-05, + "loss": 0.9985, + "step": 3575 + }, + { + "epoch": 0.3195210757924364, + "grad_norm": 0.38250869512557983, + "learning_rate": 7.958209321021979e-05, + "loss": 1.027, + "step": 3576 + }, + { + "epoch": 0.31961042732369827, + "grad_norm": 0.36755305528640747, + "learning_rate": 7.95704256663123e-05, + "loss": 1.0075, + "step": 3577 + }, + { + "epoch": 0.31969977885496015, + "grad_norm": 0.47335994243621826, + "learning_rate": 7.955875564556645e-05, + "loss": 0.9888, + "step": 3578 + }, + { + "epoch": 0.319789130386222, + "grad_norm": 0.4479726254940033, + "learning_rate": 7.954708314895975e-05, + "loss": 0.9571, + "step": 3579 + }, + { + "epoch": 0.31987848191748386, + "grad_norm": 0.4270458519458771, + "learning_rate": 7.953540817746988e-05, + "loss": 0.9431, + "step": 3580 + }, + { + "epoch": 0.31996783344874574, + "grad_norm": 0.42351189255714417, + "learning_rate": 7.952373073207478e-05, + "loss": 0.9591, + "step": 3581 + }, + { + "epoch": 0.32005718498000757, + "grad_norm": 0.44711586833000183, + "learning_rate": 7.951205081375249e-05, + "loss": 0.9685, + "step": 3582 + }, + { + "epoch": 0.32014653651126945, + "grad_norm": 0.44724342226982117, + "learning_rate": 7.950036842348139e-05, + "loss": 0.9795, + "step": 3583 + }, + { + "epoch": 0.32023588804253134, + "grad_norm": 0.4775746762752533, + "learning_rate": 7.948868356223997e-05, + "loss": 0.9054, + "step": 3584 + }, + { + "epoch": 0.3203252395737932, + "grad_norm": 0.4386691451072693, + "learning_rate": 7.9476996231007e-05, + "loss": 0.9949, + "step": 3585 + }, + { + "epoch": 0.32041459110505505, + "grad_norm": 0.40129563212394714, + "learning_rate": 7.946530643076138e-05, + "loss": 1.0254, + "step": 3586 + }, + { + "epoch": 0.32050394263631693, + "grad_norm": 0.4904455542564392, + "learning_rate": 7.945361416248226e-05, + "loss": 0.8934, + "step": 3587 + }, + { + "epoch": 0.3205932941675788, + "grad_norm": 0.4553501009941101, + "learning_rate": 7.9441919427149e-05, + "loss": 0.9707, + "step": 3588 + }, + { + "epoch": 0.32068264569884064, + "grad_norm": 0.38946229219436646, + "learning_rate": 7.943022222574116e-05, + "loss": 1.0248, + "step": 3589 + }, + { + "epoch": 0.3207719972301025, + "grad_norm": 0.39546647667884827, + "learning_rate": 7.941852255923852e-05, + "loss": 1.0132, + "step": 3590 + }, + { + "epoch": 0.3208613487613644, + "grad_norm": 0.4963313937187195, + "learning_rate": 7.940682042862104e-05, + "loss": 0.9254, + "step": 3591 + }, + { + "epoch": 0.3209507002926263, + "grad_norm": 0.42194679379463196, + "learning_rate": 7.939511583486887e-05, + "loss": 0.9982, + "step": 3592 + }, + { + "epoch": 0.3210400518238881, + "grad_norm": 0.4145607054233551, + "learning_rate": 7.938340877896244e-05, + "loss": 1.0461, + "step": 3593 + }, + { + "epoch": 0.32112940335515, + "grad_norm": 0.5402034521102905, + "learning_rate": 7.937169926188232e-05, + "loss": 1.0905, + "step": 3594 + }, + { + "epoch": 0.3212187548864119, + "grad_norm": 0.394927978515625, + "learning_rate": 7.935998728460929e-05, + "loss": 0.9712, + "step": 3595 + }, + { + "epoch": 0.3213081064176737, + "grad_norm": 0.393349289894104, + "learning_rate": 7.934827284812438e-05, + "loss": 0.9541, + "step": 3596 + }, + { + "epoch": 0.3213974579489356, + "grad_norm": 0.4421713352203369, + "learning_rate": 7.93365559534088e-05, + "loss": 1.0013, + "step": 3597 + }, + { + "epoch": 0.3214868094801975, + "grad_norm": 0.46195080876350403, + "learning_rate": 7.932483660144394e-05, + "loss": 0.9468, + "step": 3598 + }, + { + "epoch": 0.32157616101145936, + "grad_norm": 0.419318825006485, + "learning_rate": 7.931311479321144e-05, + "loss": 0.9858, + "step": 3599 + }, + { + "epoch": 0.3216655125427212, + "grad_norm": 0.4154645800590515, + "learning_rate": 7.93013905296931e-05, + "loss": 0.9703, + "step": 3600 + }, + { + "epoch": 0.32175486407398307, + "grad_norm": 0.5366727709770203, + "learning_rate": 7.9289663811871e-05, + "loss": 0.9062, + "step": 3601 + }, + { + "epoch": 0.32184421560524495, + "grad_norm": 0.48142287135124207, + "learning_rate": 7.927793464072734e-05, + "loss": 1.0538, + "step": 3602 + }, + { + "epoch": 0.3219335671365068, + "grad_norm": 0.4503115713596344, + "learning_rate": 7.926620301724459e-05, + "loss": 1.0272, + "step": 3603 + }, + { + "epoch": 0.32202291866776867, + "grad_norm": 0.536191999912262, + "learning_rate": 7.925446894240536e-05, + "loss": 0.9567, + "step": 3604 + }, + { + "epoch": 0.32211227019903055, + "grad_norm": 0.38099348545074463, + "learning_rate": 7.924273241719254e-05, + "loss": 0.9871, + "step": 3605 + }, + { + "epoch": 0.3222016217302924, + "grad_norm": 0.5245529413223267, + "learning_rate": 7.923099344258915e-05, + "loss": 0.9687, + "step": 3606 + }, + { + "epoch": 0.32229097326155426, + "grad_norm": 0.424612432718277, + "learning_rate": 7.921925201957851e-05, + "loss": 1.0479, + "step": 3607 + }, + { + "epoch": 0.32238032479281614, + "grad_norm": 0.4393298625946045, + "learning_rate": 7.920750814914404e-05, + "loss": 1.0369, + "step": 3608 + }, + { + "epoch": 0.322469676324078, + "grad_norm": 0.4350713789463043, + "learning_rate": 7.919576183226945e-05, + "loss": 1.0051, + "step": 3609 + }, + { + "epoch": 0.32255902785533985, + "grad_norm": 0.40692925453186035, + "learning_rate": 7.918401306993858e-05, + "loss": 1.0296, + "step": 3610 + }, + { + "epoch": 0.32264837938660174, + "grad_norm": 0.4172888696193695, + "learning_rate": 7.917226186313554e-05, + "loss": 0.9977, + "step": 3611 + }, + { + "epoch": 0.3227377309178636, + "grad_norm": 0.46099647879600525, + "learning_rate": 7.916050821284462e-05, + "loss": 0.9395, + "step": 3612 + }, + { + "epoch": 0.32282708244912545, + "grad_norm": 0.44035103917121887, + "learning_rate": 7.914875212005032e-05, + "loss": 0.9543, + "step": 3613 + }, + { + "epoch": 0.32291643398038733, + "grad_norm": 0.3710711896419525, + "learning_rate": 7.913699358573732e-05, + "loss": 0.9894, + "step": 3614 + }, + { + "epoch": 0.3230057855116492, + "grad_norm": 0.5159255862236023, + "learning_rate": 7.912523261089051e-05, + "loss": 0.9674, + "step": 3615 + }, + { + "epoch": 0.3230951370429111, + "grad_norm": 0.5290977954864502, + "learning_rate": 7.911346919649504e-05, + "loss": 1.0115, + "step": 3616 + }, + { + "epoch": 0.3231844885741729, + "grad_norm": 0.4615987241268158, + "learning_rate": 7.910170334353619e-05, + "loss": 0.9218, + "step": 3617 + }, + { + "epoch": 0.3232738401054348, + "grad_norm": 0.41924357414245605, + "learning_rate": 7.908993505299948e-05, + "loss": 1.0448, + "step": 3618 + }, + { + "epoch": 0.3233631916366967, + "grad_norm": 0.45864051580429077, + "learning_rate": 7.907816432587062e-05, + "loss": 1.0082, + "step": 3619 + }, + { + "epoch": 0.3234525431679585, + "grad_norm": 0.3755020797252655, + "learning_rate": 7.906639116313558e-05, + "loss": 0.9703, + "step": 3620 + }, + { + "epoch": 0.3235418946992204, + "grad_norm": 0.40216493606567383, + "learning_rate": 7.905461556578043e-05, + "loss": 1.0177, + "step": 3621 + }, + { + "epoch": 0.3236312462304823, + "grad_norm": 0.42985835671424866, + "learning_rate": 7.904283753479154e-05, + "loss": 0.9477, + "step": 3622 + }, + { + "epoch": 0.32372059776174417, + "grad_norm": 0.4799264371395111, + "learning_rate": 7.903105707115543e-05, + "loss": 1.0422, + "step": 3623 + }, + { + "epoch": 0.323809949293006, + "grad_norm": 0.3927200734615326, + "learning_rate": 7.901927417585884e-05, + "loss": 0.9533, + "step": 3624 + }, + { + "epoch": 0.3238993008242679, + "grad_norm": 0.4355437159538269, + "learning_rate": 7.90074888498887e-05, + "loss": 0.9825, + "step": 3625 + }, + { + "epoch": 0.32398865235552976, + "grad_norm": 0.40045395493507385, + "learning_rate": 7.899570109423217e-05, + "loss": 1.0001, + "step": 3626 + }, + { + "epoch": 0.3240780038867916, + "grad_norm": 0.4917450249195099, + "learning_rate": 7.898391090987662e-05, + "loss": 0.9796, + "step": 3627 + }, + { + "epoch": 0.32416735541805347, + "grad_norm": 0.46011456847190857, + "learning_rate": 7.897211829780959e-05, + "loss": 0.9242, + "step": 3628 + }, + { + "epoch": 0.32425670694931535, + "grad_norm": 0.4157116711139679, + "learning_rate": 7.896032325901883e-05, + "loss": 1.0034, + "step": 3629 + }, + { + "epoch": 0.32434605848057724, + "grad_norm": 0.3974458575248718, + "learning_rate": 7.894852579449227e-05, + "loss": 1.0186, + "step": 3630 + }, + { + "epoch": 0.32443541001183906, + "grad_norm": 0.38465315103530884, + "learning_rate": 7.893672590521814e-05, + "loss": 0.9477, + "step": 3631 + }, + { + "epoch": 0.32452476154310095, + "grad_norm": 0.4720938801765442, + "learning_rate": 7.892492359218477e-05, + "loss": 0.9311, + "step": 3632 + }, + { + "epoch": 0.32461411307436283, + "grad_norm": 0.4968278408050537, + "learning_rate": 7.89131188563807e-05, + "loss": 0.9531, + "step": 3633 + }, + { + "epoch": 0.32470346460562466, + "grad_norm": 0.43185052275657654, + "learning_rate": 7.890131169879477e-05, + "loss": 1.0749, + "step": 3634 + }, + { + "epoch": 0.32479281613688654, + "grad_norm": 0.4034889042377472, + "learning_rate": 7.888950212041591e-05, + "loss": 1.0049, + "step": 3635 + }, + { + "epoch": 0.3248821676681484, + "grad_norm": 0.4653837978839874, + "learning_rate": 7.88776901222333e-05, + "loss": 0.9604, + "step": 3636 + }, + { + "epoch": 0.32497151919941025, + "grad_norm": 0.4228241741657257, + "learning_rate": 7.886587570523634e-05, + "loss": 0.9343, + "step": 3637 + }, + { + "epoch": 0.32506087073067214, + "grad_norm": 0.3874700665473938, + "learning_rate": 7.88540588704146e-05, + "loss": 1.0345, + "step": 3638 + }, + { + "epoch": 0.325150222261934, + "grad_norm": 0.501092791557312, + "learning_rate": 7.884223961875785e-05, + "loss": 0.9159, + "step": 3639 + }, + { + "epoch": 0.3252395737931959, + "grad_norm": 0.6081518530845642, + "learning_rate": 7.88304179512561e-05, + "loss": 0.8758, + "step": 3640 + }, + { + "epoch": 0.32532892532445773, + "grad_norm": 0.4688172936439514, + "learning_rate": 7.881859386889954e-05, + "loss": 0.9757, + "step": 3641 + }, + { + "epoch": 0.3254182768557196, + "grad_norm": 0.4610939025878906, + "learning_rate": 7.880676737267857e-05, + "loss": 1.0032, + "step": 3642 + }, + { + "epoch": 0.3255076283869815, + "grad_norm": 0.49634119868278503, + "learning_rate": 7.879493846358377e-05, + "loss": 0.9689, + "step": 3643 + }, + { + "epoch": 0.3255969799182433, + "grad_norm": 0.4354616105556488, + "learning_rate": 7.878310714260593e-05, + "loss": 0.9444, + "step": 3644 + }, + { + "epoch": 0.3256863314495052, + "grad_norm": 0.4469475746154785, + "learning_rate": 7.877127341073606e-05, + "loss": 1.0258, + "step": 3645 + }, + { + "epoch": 0.3257756829807671, + "grad_norm": 0.4689541459083557, + "learning_rate": 7.875943726896538e-05, + "loss": 0.8939, + "step": 3646 + }, + { + "epoch": 0.325865034512029, + "grad_norm": 0.39675360918045044, + "learning_rate": 7.874759871828527e-05, + "loss": 0.9616, + "step": 3647 + }, + { + "epoch": 0.3259543860432908, + "grad_norm": 0.4091286361217499, + "learning_rate": 7.873575775968734e-05, + "loss": 0.8918, + "step": 3648 + }, + { + "epoch": 0.3260437375745527, + "grad_norm": 0.3711673319339752, + "learning_rate": 7.872391439416339e-05, + "loss": 1.0246, + "step": 3649 + }, + { + "epoch": 0.32613308910581457, + "grad_norm": 0.4348282217979431, + "learning_rate": 7.871206862270543e-05, + "loss": 0.9172, + "step": 3650 + }, + { + "epoch": 0.3262224406370764, + "grad_norm": 0.46988198161125183, + "learning_rate": 7.870022044630569e-05, + "loss": 0.9724, + "step": 3651 + }, + { + "epoch": 0.3263117921683383, + "grad_norm": 0.42179208993911743, + "learning_rate": 7.868836986595656e-05, + "loss": 0.9622, + "step": 3652 + }, + { + "epoch": 0.32640114369960016, + "grad_norm": 0.507832407951355, + "learning_rate": 7.867651688265066e-05, + "loss": 0.9428, + "step": 3653 + }, + { + "epoch": 0.32649049523086204, + "grad_norm": 0.458981990814209, + "learning_rate": 7.866466149738079e-05, + "loss": 1.0245, + "step": 3654 + }, + { + "epoch": 0.32657984676212387, + "grad_norm": 0.47192487120628357, + "learning_rate": 7.865280371113998e-05, + "loss": 0.9683, + "step": 3655 + }, + { + "epoch": 0.32666919829338575, + "grad_norm": 0.4830261170864105, + "learning_rate": 7.864094352492143e-05, + "loss": 1.0502, + "step": 3656 + }, + { + "epoch": 0.32675854982464764, + "grad_norm": 0.43681490421295166, + "learning_rate": 7.862908093971859e-05, + "loss": 0.9709, + "step": 3657 + }, + { + "epoch": 0.32684790135590946, + "grad_norm": 0.4974532127380371, + "learning_rate": 7.861721595652507e-05, + "loss": 1.012, + "step": 3658 + }, + { + "epoch": 0.32693725288717135, + "grad_norm": 0.43522706627845764, + "learning_rate": 7.860534857633464e-05, + "loss": 0.9935, + "step": 3659 + }, + { + "epoch": 0.32702660441843323, + "grad_norm": 0.3843841254711151, + "learning_rate": 7.859347880014138e-05, + "loss": 1.01, + "step": 3660 + }, + { + "epoch": 0.3271159559496951, + "grad_norm": 0.38631847500801086, + "learning_rate": 7.858160662893948e-05, + "loss": 1.0152, + "step": 3661 + }, + { + "epoch": 0.32720530748095694, + "grad_norm": 0.4131559431552887, + "learning_rate": 7.856973206372336e-05, + "loss": 0.9994, + "step": 3662 + }, + { + "epoch": 0.3272946590122188, + "grad_norm": 0.6184050440788269, + "learning_rate": 7.855785510548765e-05, + "loss": 0.97, + "step": 3663 + }, + { + "epoch": 0.3273840105434807, + "grad_norm": 0.4109181761741638, + "learning_rate": 7.854597575522717e-05, + "loss": 1.0286, + "step": 3664 + }, + { + "epoch": 0.32747336207474254, + "grad_norm": 0.46452227234840393, + "learning_rate": 7.853409401393694e-05, + "loss": 1.0194, + "step": 3665 + }, + { + "epoch": 0.3275627136060044, + "grad_norm": 0.44786307215690613, + "learning_rate": 7.85222098826122e-05, + "loss": 0.9468, + "step": 3666 + }, + { + "epoch": 0.3276520651372663, + "grad_norm": 0.4409159719944, + "learning_rate": 7.851032336224835e-05, + "loss": 0.9174, + "step": 3667 + }, + { + "epoch": 0.32774141666852813, + "grad_norm": 0.35179105401039124, + "learning_rate": 7.849843445384102e-05, + "loss": 1.0162, + "step": 3668 + }, + { + "epoch": 0.32783076819979, + "grad_norm": 0.45857298374176025, + "learning_rate": 7.848654315838603e-05, + "loss": 0.9139, + "step": 3669 + }, + { + "epoch": 0.3279201197310519, + "grad_norm": 0.3982926309108734, + "learning_rate": 7.84746494768794e-05, + "loss": 1.0394, + "step": 3670 + }, + { + "epoch": 0.3280094712623138, + "grad_norm": 0.48336413502693176, + "learning_rate": 7.846275341031736e-05, + "loss": 0.9504, + "step": 3671 + }, + { + "epoch": 0.3280988227935756, + "grad_norm": 0.46158286929130554, + "learning_rate": 7.845085495969635e-05, + "loss": 0.9069, + "step": 3672 + }, + { + "epoch": 0.3281881743248375, + "grad_norm": 0.5317728519439697, + "learning_rate": 7.843895412601296e-05, + "loss": 0.9455, + "step": 3673 + }, + { + "epoch": 0.3282775258560994, + "grad_norm": 0.4177683889865875, + "learning_rate": 7.842705091026403e-05, + "loss": 0.9303, + "step": 3674 + }, + { + "epoch": 0.3283668773873612, + "grad_norm": 0.4196977913379669, + "learning_rate": 7.841514531344655e-05, + "loss": 0.9551, + "step": 3675 + }, + { + "epoch": 0.3284562289186231, + "grad_norm": 0.48437947034835815, + "learning_rate": 7.840323733655778e-05, + "loss": 0.8823, + "step": 3676 + }, + { + "epoch": 0.32854558044988497, + "grad_norm": 0.38968583941459656, + "learning_rate": 7.839132698059515e-05, + "loss": 0.9749, + "step": 3677 + }, + { + "epoch": 0.32863493198114685, + "grad_norm": 0.41708287596702576, + "learning_rate": 7.837941424655624e-05, + "loss": 0.9639, + "step": 3678 + }, + { + "epoch": 0.3287242835124087, + "grad_norm": 0.43757346272468567, + "learning_rate": 7.836749913543888e-05, + "loss": 0.8771, + "step": 3679 + }, + { + "epoch": 0.32881363504367056, + "grad_norm": 0.513954758644104, + "learning_rate": 7.835558164824108e-05, + "loss": 0.9956, + "step": 3680 + }, + { + "epoch": 0.32890298657493244, + "grad_norm": 0.3900381624698639, + "learning_rate": 7.834366178596109e-05, + "loss": 1.0153, + "step": 3681 + }, + { + "epoch": 0.32899233810619427, + "grad_norm": 0.45596617460250854, + "learning_rate": 7.83317395495973e-05, + "loss": 1.1101, + "step": 3682 + }, + { + "epoch": 0.32908168963745615, + "grad_norm": 0.4868778586387634, + "learning_rate": 7.831981494014833e-05, + "loss": 1.0163, + "step": 3683 + }, + { + "epoch": 0.32917104116871804, + "grad_norm": 0.40496212244033813, + "learning_rate": 7.830788795861296e-05, + "loss": 0.9321, + "step": 3684 + }, + { + "epoch": 0.3292603926999799, + "grad_norm": 0.42820194363594055, + "learning_rate": 7.829595860599026e-05, + "loss": 1.0163, + "step": 3685 + }, + { + "epoch": 0.32934974423124175, + "grad_norm": 0.42636728286743164, + "learning_rate": 7.828402688327941e-05, + "loss": 0.9583, + "step": 3686 + }, + { + "epoch": 0.32943909576250363, + "grad_norm": 0.39956799149513245, + "learning_rate": 7.827209279147982e-05, + "loss": 1.0322, + "step": 3687 + }, + { + "epoch": 0.3295284472937655, + "grad_norm": 0.4089909791946411, + "learning_rate": 7.826015633159112e-05, + "loss": 1.0177, + "step": 3688 + }, + { + "epoch": 0.32961779882502734, + "grad_norm": 0.4677819013595581, + "learning_rate": 7.824821750461308e-05, + "loss": 0.9698, + "step": 3689 + }, + { + "epoch": 0.3297071503562892, + "grad_norm": 0.4185117781162262, + "learning_rate": 7.823627631154571e-05, + "loss": 1.001, + "step": 3690 + }, + { + "epoch": 0.3297965018875511, + "grad_norm": 0.3975558280944824, + "learning_rate": 7.822433275338923e-05, + "loss": 0.9953, + "step": 3691 + }, + { + "epoch": 0.329885853418813, + "grad_norm": 0.3640524744987488, + "learning_rate": 7.821238683114404e-05, + "loss": 1.0064, + "step": 3692 + }, + { + "epoch": 0.3299752049500748, + "grad_norm": 0.4032001197338104, + "learning_rate": 7.820043854581071e-05, + "loss": 1.01, + "step": 3693 + }, + { + "epoch": 0.3300645564813367, + "grad_norm": 0.45397523045539856, + "learning_rate": 7.818848789839008e-05, + "loss": 0.9609, + "step": 3694 + }, + { + "epoch": 0.3301539080125986, + "grad_norm": 0.3662114143371582, + "learning_rate": 7.81765348898831e-05, + "loss": 1.0157, + "step": 3695 + }, + { + "epoch": 0.3302432595438604, + "grad_norm": 0.38710111379623413, + "learning_rate": 7.816457952129099e-05, + "loss": 0.9811, + "step": 3696 + }, + { + "epoch": 0.3303326110751223, + "grad_norm": 0.40280595421791077, + "learning_rate": 7.815262179361514e-05, + "loss": 0.9899, + "step": 3697 + }, + { + "epoch": 0.3304219626063842, + "grad_norm": 0.4115069806575775, + "learning_rate": 7.814066170785714e-05, + "loss": 0.9829, + "step": 3698 + }, + { + "epoch": 0.330511314137646, + "grad_norm": 0.39041948318481445, + "learning_rate": 7.812869926501874e-05, + "loss": 0.9653, + "step": 3699 + }, + { + "epoch": 0.3306006656689079, + "grad_norm": 0.40543365478515625, + "learning_rate": 7.811673446610195e-05, + "loss": 1.001, + "step": 3700 + }, + { + "epoch": 0.3306900172001698, + "grad_norm": 0.36765506863594055, + "learning_rate": 7.810476731210896e-05, + "loss": 1.0908, + "step": 3701 + }, + { + "epoch": 0.33077936873143166, + "grad_norm": 0.45462504029273987, + "learning_rate": 7.809279780404212e-05, + "loss": 1.0355, + "step": 3702 + }, + { + "epoch": 0.3308687202626935, + "grad_norm": 0.5238141417503357, + "learning_rate": 7.808082594290402e-05, + "loss": 0.9617, + "step": 3703 + }, + { + "epoch": 0.33095807179395537, + "grad_norm": 0.4108685851097107, + "learning_rate": 7.806885172969742e-05, + "loss": 0.9806, + "step": 3704 + }, + { + "epoch": 0.33104742332521725, + "grad_norm": 0.320278525352478, + "learning_rate": 7.80568751654253e-05, + "loss": 1.0341, + "step": 3705 + }, + { + "epoch": 0.3311367748564791, + "grad_norm": 0.3879075050354004, + "learning_rate": 7.804489625109083e-05, + "loss": 0.9905, + "step": 3706 + }, + { + "epoch": 0.33122612638774096, + "grad_norm": 0.4919257164001465, + "learning_rate": 7.803291498769735e-05, + "loss": 0.9408, + "step": 3707 + }, + { + "epoch": 0.33131547791900284, + "grad_norm": 0.4680798053741455, + "learning_rate": 7.802093137624844e-05, + "loss": 0.8917, + "step": 3708 + }, + { + "epoch": 0.3314048294502647, + "grad_norm": 0.35378363728523254, + "learning_rate": 7.800894541774783e-05, + "loss": 1.0099, + "step": 3709 + }, + { + "epoch": 0.33149418098152655, + "grad_norm": 0.3704555928707123, + "learning_rate": 7.79969571131995e-05, + "loss": 0.9949, + "step": 3710 + }, + { + "epoch": 0.33158353251278844, + "grad_norm": 0.39882001280784607, + "learning_rate": 7.798496646360758e-05, + "loss": 0.9924, + "step": 3711 + }, + { + "epoch": 0.3316728840440503, + "grad_norm": 0.4005809724330902, + "learning_rate": 7.797297346997643e-05, + "loss": 0.9349, + "step": 3712 + }, + { + "epoch": 0.33176223557531215, + "grad_norm": 0.4036948084831238, + "learning_rate": 7.796097813331059e-05, + "loss": 0.9319, + "step": 3713 + }, + { + "epoch": 0.33185158710657403, + "grad_norm": 0.46617838740348816, + "learning_rate": 7.794898045461476e-05, + "loss": 0.904, + "step": 3714 + }, + { + "epoch": 0.3319409386378359, + "grad_norm": 0.38140738010406494, + "learning_rate": 7.79369804348939e-05, + "loss": 0.9657, + "step": 3715 + }, + { + "epoch": 0.3320302901690978, + "grad_norm": 0.46043315529823303, + "learning_rate": 7.792497807515317e-05, + "loss": 1.045, + "step": 3716 + }, + { + "epoch": 0.3321196417003596, + "grad_norm": 0.4285655617713928, + "learning_rate": 7.791297337639784e-05, + "loss": 1.0517, + "step": 3717 + }, + { + "epoch": 0.3322089932316215, + "grad_norm": 0.43057459592819214, + "learning_rate": 7.790096633963348e-05, + "loss": 0.9226, + "step": 3718 + }, + { + "epoch": 0.3322983447628834, + "grad_norm": 0.5528749823570251, + "learning_rate": 7.788895696586577e-05, + "loss": 0.9763, + "step": 3719 + }, + { + "epoch": 0.3323876962941452, + "grad_norm": 0.49909940361976624, + "learning_rate": 7.787694525610066e-05, + "loss": 1.0276, + "step": 3720 + }, + { + "epoch": 0.3324770478254071, + "grad_norm": 0.458188533782959, + "learning_rate": 7.786493121134423e-05, + "loss": 0.9415, + "step": 3721 + }, + { + "epoch": 0.332566399356669, + "grad_norm": 0.4523871839046478, + "learning_rate": 7.785291483260278e-05, + "loss": 0.9684, + "step": 3722 + }, + { + "epoch": 0.33265575088793087, + "grad_norm": 0.4228318929672241, + "learning_rate": 7.784089612088283e-05, + "loss": 1.042, + "step": 3723 + }, + { + "epoch": 0.3327451024191927, + "grad_norm": 0.3901155889034271, + "learning_rate": 7.782887507719108e-05, + "loss": 1.0081, + "step": 3724 + }, + { + "epoch": 0.3328344539504546, + "grad_norm": 0.39497750997543335, + "learning_rate": 7.781685170253439e-05, + "loss": 1.0522, + "step": 3725 + }, + { + "epoch": 0.33292380548171646, + "grad_norm": 0.43867218494415283, + "learning_rate": 7.780482599791987e-05, + "loss": 0.9623, + "step": 3726 + }, + { + "epoch": 0.3330131570129783, + "grad_norm": 0.43389105796813965, + "learning_rate": 7.779279796435479e-05, + "loss": 1.0464, + "step": 3727 + }, + { + "epoch": 0.3331025085442402, + "grad_norm": 0.4637124538421631, + "learning_rate": 7.778076760284665e-05, + "loss": 0.9743, + "step": 3728 + }, + { + "epoch": 0.33319186007550206, + "grad_norm": 0.48631882667541504, + "learning_rate": 7.776873491440307e-05, + "loss": 1.0403, + "step": 3729 + }, + { + "epoch": 0.33328121160676394, + "grad_norm": 0.4544772505760193, + "learning_rate": 7.775669990003197e-05, + "loss": 1.0384, + "step": 3730 + }, + { + "epoch": 0.33337056313802577, + "grad_norm": 0.41751065850257874, + "learning_rate": 7.774466256074137e-05, + "loss": 0.9482, + "step": 3731 + }, + { + "epoch": 0.33345991466928765, + "grad_norm": 0.42404597997665405, + "learning_rate": 7.773262289753956e-05, + "loss": 0.9436, + "step": 3732 + }, + { + "epoch": 0.33354926620054953, + "grad_norm": 0.400020033121109, + "learning_rate": 7.772058091143497e-05, + "loss": 0.9523, + "step": 3733 + }, + { + "epoch": 0.33363861773181136, + "grad_norm": 0.4418000876903534, + "learning_rate": 7.770853660343625e-05, + "loss": 1.032, + "step": 3734 + }, + { + "epoch": 0.33372796926307324, + "grad_norm": 0.48235222697257996, + "learning_rate": 7.769648997455223e-05, + "loss": 0.9437, + "step": 3735 + }, + { + "epoch": 0.3338173207943351, + "grad_norm": 0.40464332699775696, + "learning_rate": 7.768444102579196e-05, + "loss": 0.9569, + "step": 3736 + }, + { + "epoch": 0.33390667232559695, + "grad_norm": 0.5996637940406799, + "learning_rate": 7.767238975816465e-05, + "loss": 0.9467, + "step": 3737 + }, + { + "epoch": 0.33399602385685884, + "grad_norm": 0.38040077686309814, + "learning_rate": 7.766033617267975e-05, + "loss": 0.9954, + "step": 3738 + }, + { + "epoch": 0.3340853753881207, + "grad_norm": 0.4809688925743103, + "learning_rate": 7.764828027034685e-05, + "loss": 0.8698, + "step": 3739 + }, + { + "epoch": 0.3341747269193826, + "grad_norm": 0.4978516399860382, + "learning_rate": 7.763622205217576e-05, + "loss": 0.9686, + "step": 3740 + }, + { + "epoch": 0.33426407845064443, + "grad_norm": 0.41253742575645447, + "learning_rate": 7.762416151917648e-05, + "loss": 0.9883, + "step": 3741 + }, + { + "epoch": 0.3343534299819063, + "grad_norm": 0.35575124621391296, + "learning_rate": 7.761209867235924e-05, + "loss": 1.0379, + "step": 3742 + }, + { + "epoch": 0.3344427815131682, + "grad_norm": 0.3860713541507721, + "learning_rate": 7.760003351273442e-05, + "loss": 0.9995, + "step": 3743 + }, + { + "epoch": 0.33453213304443, + "grad_norm": 0.3986268639564514, + "learning_rate": 7.758796604131258e-05, + "loss": 0.986, + "step": 3744 + }, + { + "epoch": 0.3346214845756919, + "grad_norm": 0.4240453243255615, + "learning_rate": 7.757589625910452e-05, + "loss": 0.9722, + "step": 3745 + }, + { + "epoch": 0.3347108361069538, + "grad_norm": 0.5454365015029907, + "learning_rate": 7.75638241671212e-05, + "loss": 0.8707, + "step": 3746 + }, + { + "epoch": 0.3348001876382157, + "grad_norm": 0.4972113370895386, + "learning_rate": 7.755174976637381e-05, + "loss": 0.978, + "step": 3747 + }, + { + "epoch": 0.3348895391694775, + "grad_norm": 0.4331071972846985, + "learning_rate": 7.753967305787371e-05, + "loss": 0.9539, + "step": 3748 + }, + { + "epoch": 0.3349788907007394, + "grad_norm": 0.5387925505638123, + "learning_rate": 7.752759404263242e-05, + "loss": 0.8865, + "step": 3749 + }, + { + "epoch": 0.33506824223200127, + "grad_norm": 0.4231729805469513, + "learning_rate": 7.751551272166171e-05, + "loss": 0.9758, + "step": 3750 + }, + { + "epoch": 0.3351575937632631, + "grad_norm": 0.4942473769187927, + "learning_rate": 7.750342909597352e-05, + "loss": 0.9943, + "step": 3751 + }, + { + "epoch": 0.335246945294525, + "grad_norm": 0.4464167356491089, + "learning_rate": 7.749134316657997e-05, + "loss": 1.0107, + "step": 3752 + }, + { + "epoch": 0.33533629682578686, + "grad_norm": 0.4313907027244568, + "learning_rate": 7.747925493449342e-05, + "loss": 1.0031, + "step": 3753 + }, + { + "epoch": 0.33542564835704874, + "grad_norm": 0.42571744322776794, + "learning_rate": 7.746716440072632e-05, + "loss": 1.003, + "step": 3754 + }, + { + "epoch": 0.33551499988831057, + "grad_norm": 0.3863215446472168, + "learning_rate": 7.745507156629145e-05, + "loss": 1.0386, + "step": 3755 + }, + { + "epoch": 0.33560435141957246, + "grad_norm": 0.4166812002658844, + "learning_rate": 7.744297643220168e-05, + "loss": 0.9372, + "step": 3756 + }, + { + "epoch": 0.33569370295083434, + "grad_norm": 0.4798368215560913, + "learning_rate": 7.74308789994701e-05, + "loss": 0.9966, + "step": 3757 + }, + { + "epoch": 0.33578305448209617, + "grad_norm": 0.3768024146556854, + "learning_rate": 7.741877926911003e-05, + "loss": 0.9845, + "step": 3758 + }, + { + "epoch": 0.33587240601335805, + "grad_norm": 0.46540406346321106, + "learning_rate": 7.740667724213493e-05, + "loss": 0.9618, + "step": 3759 + }, + { + "epoch": 0.33596175754461993, + "grad_norm": 0.5055358409881592, + "learning_rate": 7.739457291955847e-05, + "loss": 1.0283, + "step": 3760 + }, + { + "epoch": 0.3360511090758818, + "grad_norm": 0.5068111419677734, + "learning_rate": 7.738246630239452e-05, + "loss": 0.9803, + "step": 3761 + }, + { + "epoch": 0.33614046060714364, + "grad_norm": 0.43414464592933655, + "learning_rate": 7.737035739165715e-05, + "loss": 1.0182, + "step": 3762 + }, + { + "epoch": 0.3362298121384055, + "grad_norm": 0.47884586453437805, + "learning_rate": 7.73582461883606e-05, + "loss": 1.0077, + "step": 3763 + }, + { + "epoch": 0.3363191636696674, + "grad_norm": 0.3981468677520752, + "learning_rate": 7.734613269351931e-05, + "loss": 1.0856, + "step": 3764 + }, + { + "epoch": 0.33640851520092924, + "grad_norm": 0.4908856153488159, + "learning_rate": 7.733401690814793e-05, + "loss": 0.8595, + "step": 3765 + }, + { + "epoch": 0.3364978667321911, + "grad_norm": 0.3917779326438904, + "learning_rate": 7.732189883326125e-05, + "loss": 0.9844, + "step": 3766 + }, + { + "epoch": 0.336587218263453, + "grad_norm": 0.40213218331336975, + "learning_rate": 7.730977846987433e-05, + "loss": 0.968, + "step": 3767 + }, + { + "epoch": 0.33667656979471483, + "grad_norm": 0.3646174967288971, + "learning_rate": 7.729765581900235e-05, + "loss": 1.0258, + "step": 3768 + }, + { + "epoch": 0.3367659213259767, + "grad_norm": 0.41250017285346985, + "learning_rate": 7.728553088166075e-05, + "loss": 0.9605, + "step": 3769 + }, + { + "epoch": 0.3368552728572386, + "grad_norm": 0.384792685508728, + "learning_rate": 7.727340365886506e-05, + "loss": 1.0067, + "step": 3770 + }, + { + "epoch": 0.3369446243885005, + "grad_norm": 0.5328003168106079, + "learning_rate": 7.726127415163113e-05, + "loss": 0.9602, + "step": 3771 + }, + { + "epoch": 0.3370339759197623, + "grad_norm": 0.4145788550376892, + "learning_rate": 7.724914236097489e-05, + "loss": 1.0511, + "step": 3772 + }, + { + "epoch": 0.3371233274510242, + "grad_norm": 0.37632668018341064, + "learning_rate": 7.723700828791252e-05, + "loss": 0.972, + "step": 3773 + }, + { + "epoch": 0.3372126789822861, + "grad_norm": 0.37736862897872925, + "learning_rate": 7.722487193346039e-05, + "loss": 0.99, + "step": 3774 + }, + { + "epoch": 0.3373020305135479, + "grad_norm": 0.4212090075016022, + "learning_rate": 7.721273329863504e-05, + "loss": 0.9789, + "step": 3775 + }, + { + "epoch": 0.3373913820448098, + "grad_norm": 0.5166639089584351, + "learning_rate": 7.72005923844532e-05, + "loss": 0.9856, + "step": 3776 + }, + { + "epoch": 0.33748073357607167, + "grad_norm": 0.46971988677978516, + "learning_rate": 7.71884491919318e-05, + "loss": 1.048, + "step": 3777 + }, + { + "epoch": 0.33757008510733355, + "grad_norm": 0.4689841568470001, + "learning_rate": 7.7176303722088e-05, + "loss": 0.9496, + "step": 3778 + }, + { + "epoch": 0.3376594366385954, + "grad_norm": 0.5772603154182434, + "learning_rate": 7.716415597593907e-05, + "loss": 0.8967, + "step": 3779 + }, + { + "epoch": 0.33774878816985726, + "grad_norm": 0.4483826160430908, + "learning_rate": 7.715200595450253e-05, + "loss": 1.0697, + "step": 3780 + }, + { + "epoch": 0.33783813970111914, + "grad_norm": 0.4816476106643677, + "learning_rate": 7.713985365879606e-05, + "loss": 0.912, + "step": 3781 + }, + { + "epoch": 0.33792749123238097, + "grad_norm": 0.38734710216522217, + "learning_rate": 7.712769908983757e-05, + "loss": 0.9535, + "step": 3782 + }, + { + "epoch": 0.33801684276364286, + "grad_norm": 0.4487532079219818, + "learning_rate": 7.711554224864511e-05, + "loss": 0.9674, + "step": 3783 + }, + { + "epoch": 0.33810619429490474, + "grad_norm": 0.4268551170825958, + "learning_rate": 7.710338313623697e-05, + "loss": 0.9257, + "step": 3784 + }, + { + "epoch": 0.3381955458261666, + "grad_norm": 0.44561639428138733, + "learning_rate": 7.709122175363158e-05, + "loss": 0.9981, + "step": 3785 + }, + { + "epoch": 0.33828489735742845, + "grad_norm": 0.44499921798706055, + "learning_rate": 7.707905810184762e-05, + "loss": 0.9353, + "step": 3786 + }, + { + "epoch": 0.33837424888869033, + "grad_norm": 0.4151676595211029, + "learning_rate": 7.706689218190386e-05, + "loss": 0.9364, + "step": 3787 + }, + { + "epoch": 0.3384636004199522, + "grad_norm": 0.4505821764469147, + "learning_rate": 7.705472399481939e-05, + "loss": 0.9725, + "step": 3788 + }, + { + "epoch": 0.33855295195121404, + "grad_norm": 0.410693883895874, + "learning_rate": 7.704255354161341e-05, + "loss": 1.0559, + "step": 3789 + }, + { + "epoch": 0.3386423034824759, + "grad_norm": 0.38344308733940125, + "learning_rate": 7.703038082330532e-05, + "loss": 0.9554, + "step": 3790 + }, + { + "epoch": 0.3387316550137378, + "grad_norm": 0.3579278588294983, + "learning_rate": 7.70182058409147e-05, + "loss": 0.9861, + "step": 3791 + }, + { + "epoch": 0.3388210065449997, + "grad_norm": 0.4280867278575897, + "learning_rate": 7.700602859546134e-05, + "loss": 0.991, + "step": 3792 + }, + { + "epoch": 0.3389103580762615, + "grad_norm": 0.4199950695037842, + "learning_rate": 7.699384908796523e-05, + "loss": 0.9886, + "step": 3793 + }, + { + "epoch": 0.3389997096075234, + "grad_norm": 0.43950459361076355, + "learning_rate": 7.698166731944654e-05, + "loss": 0.9472, + "step": 3794 + }, + { + "epoch": 0.3390890611387853, + "grad_norm": 0.43579909205436707, + "learning_rate": 7.696948329092559e-05, + "loss": 0.955, + "step": 3795 + }, + { + "epoch": 0.3391784126700471, + "grad_norm": 0.4635678231716156, + "learning_rate": 7.695729700342294e-05, + "loss": 0.9009, + "step": 3796 + }, + { + "epoch": 0.339267764201309, + "grad_norm": 0.4732118248939514, + "learning_rate": 7.694510845795933e-05, + "loss": 0.9975, + "step": 3797 + }, + { + "epoch": 0.3393571157325709, + "grad_norm": 0.4001818299293518, + "learning_rate": 7.693291765555567e-05, + "loss": 0.9729, + "step": 3798 + }, + { + "epoch": 0.3394464672638327, + "grad_norm": 0.39679357409477234, + "learning_rate": 7.692072459723307e-05, + "loss": 1.0314, + "step": 3799 + }, + { + "epoch": 0.3395358187950946, + "grad_norm": 0.40091925859451294, + "learning_rate": 7.690852928401285e-05, + "loss": 0.9526, + "step": 3800 + }, + { + "epoch": 0.3396251703263565, + "grad_norm": 0.46458113193511963, + "learning_rate": 7.689633171691644e-05, + "loss": 0.9608, + "step": 3801 + }, + { + "epoch": 0.33971452185761836, + "grad_norm": 0.5359793305397034, + "learning_rate": 7.688413189696559e-05, + "loss": 0.9501, + "step": 3802 + }, + { + "epoch": 0.3398038733888802, + "grad_norm": 0.43518051505088806, + "learning_rate": 7.68719298251821e-05, + "loss": 0.9698, + "step": 3803 + }, + { + "epoch": 0.33989322492014207, + "grad_norm": 0.42377930879592896, + "learning_rate": 7.685972550258809e-05, + "loss": 0.974, + "step": 3804 + }, + { + "epoch": 0.33998257645140395, + "grad_norm": 0.40546149015426636, + "learning_rate": 7.684751893020574e-05, + "loss": 0.9878, + "step": 3805 + }, + { + "epoch": 0.3400719279826658, + "grad_norm": 0.43093767762184143, + "learning_rate": 7.683531010905748e-05, + "loss": 0.9468, + "step": 3806 + }, + { + "epoch": 0.34016127951392766, + "grad_norm": 0.4087904095649719, + "learning_rate": 7.682309904016601e-05, + "loss": 0.9218, + "step": 3807 + }, + { + "epoch": 0.34025063104518954, + "grad_norm": 0.4545688033103943, + "learning_rate": 7.681088572455405e-05, + "loss": 0.9562, + "step": 3808 + }, + { + "epoch": 0.3403399825764514, + "grad_norm": 0.4550299644470215, + "learning_rate": 7.679867016324465e-05, + "loss": 0.9438, + "step": 3809 + }, + { + "epoch": 0.34042933410771326, + "grad_norm": 0.4195230305194855, + "learning_rate": 7.678645235726094e-05, + "loss": 0.9424, + "step": 3810 + }, + { + "epoch": 0.34051868563897514, + "grad_norm": 0.5254753232002258, + "learning_rate": 7.677423230762632e-05, + "loss": 0.9047, + "step": 3811 + }, + { + "epoch": 0.340608037170237, + "grad_norm": 0.5194946527481079, + "learning_rate": 7.676201001536439e-05, + "loss": 0.9891, + "step": 3812 + }, + { + "epoch": 0.34069738870149885, + "grad_norm": 0.4409390985965729, + "learning_rate": 7.674978548149882e-05, + "loss": 1.0141, + "step": 3813 + }, + { + "epoch": 0.34078674023276073, + "grad_norm": 0.4274226725101471, + "learning_rate": 7.67375587070536e-05, + "loss": 1.0241, + "step": 3814 + }, + { + "epoch": 0.3408760917640226, + "grad_norm": 0.41632789373397827, + "learning_rate": 7.672532969305284e-05, + "loss": 1.006, + "step": 3815 + }, + { + "epoch": 0.3409654432952845, + "grad_norm": 0.41912585496902466, + "learning_rate": 7.671309844052084e-05, + "loss": 1.036, + "step": 3816 + }, + { + "epoch": 0.3410547948265463, + "grad_norm": 0.44865861535072327, + "learning_rate": 7.67008649504821e-05, + "loss": 0.9867, + "step": 3817 + }, + { + "epoch": 0.3411441463578082, + "grad_norm": 0.4446162283420563, + "learning_rate": 7.668862922396131e-05, + "loss": 0.9555, + "step": 3818 + }, + { + "epoch": 0.3412334978890701, + "grad_norm": 0.468001127243042, + "learning_rate": 7.667639126198337e-05, + "loss": 1.0173, + "step": 3819 + }, + { + "epoch": 0.3413228494203319, + "grad_norm": 0.38588184118270874, + "learning_rate": 7.666415106557327e-05, + "loss": 1.0348, + "step": 3820 + }, + { + "epoch": 0.3414122009515938, + "grad_norm": 0.4341317415237427, + "learning_rate": 7.665190863575633e-05, + "loss": 1.0196, + "step": 3821 + }, + { + "epoch": 0.3415015524828557, + "grad_norm": 0.4301292598247528, + "learning_rate": 7.663966397355793e-05, + "loss": 0.9469, + "step": 3822 + }, + { + "epoch": 0.34159090401411757, + "grad_norm": 0.49630647897720337, + "learning_rate": 7.662741708000374e-05, + "loss": 0.9003, + "step": 3823 + }, + { + "epoch": 0.3416802555453794, + "grad_norm": 0.46921518445014954, + "learning_rate": 7.661516795611951e-05, + "loss": 0.9352, + "step": 3824 + }, + { + "epoch": 0.3417696070766413, + "grad_norm": 0.43845120072364807, + "learning_rate": 7.66029166029313e-05, + "loss": 0.9933, + "step": 3825 + }, + { + "epoch": 0.34185895860790316, + "grad_norm": 0.3999157249927521, + "learning_rate": 7.659066302146524e-05, + "loss": 1.0288, + "step": 3826 + }, + { + "epoch": 0.341948310139165, + "grad_norm": 0.38439029455184937, + "learning_rate": 7.657840721274772e-05, + "loss": 1.0599, + "step": 3827 + }, + { + "epoch": 0.3420376616704269, + "grad_norm": 0.4651052951812744, + "learning_rate": 7.656614917780527e-05, + "loss": 1.0204, + "step": 3828 + }, + { + "epoch": 0.34212701320168876, + "grad_norm": 0.41157326102256775, + "learning_rate": 7.655388891766468e-05, + "loss": 0.9886, + "step": 3829 + }, + { + "epoch": 0.3422163647329506, + "grad_norm": 0.4303523302078247, + "learning_rate": 7.654162643335283e-05, + "loss": 0.9888, + "step": 3830 + }, + { + "epoch": 0.34230571626421247, + "grad_norm": 0.3977055549621582, + "learning_rate": 7.652936172589686e-05, + "loss": 1.0203, + "step": 3831 + }, + { + "epoch": 0.34239506779547435, + "grad_norm": 0.44146913290023804, + "learning_rate": 7.651709479632406e-05, + "loss": 1.0111, + "step": 3832 + }, + { + "epoch": 0.34248441932673623, + "grad_norm": 0.532141923904419, + "learning_rate": 7.650482564566193e-05, + "loss": 0.9072, + "step": 3833 + }, + { + "epoch": 0.34257377085799806, + "grad_norm": 0.49662163853645325, + "learning_rate": 7.649255427493812e-05, + "loss": 1.0138, + "step": 3834 + }, + { + "epoch": 0.34266312238925994, + "grad_norm": 0.4919273555278778, + "learning_rate": 7.64802806851805e-05, + "loss": 0.9425, + "step": 3835 + }, + { + "epoch": 0.3427524739205218, + "grad_norm": 0.4210042655467987, + "learning_rate": 7.646800487741711e-05, + "loss": 0.9302, + "step": 3836 + }, + { + "epoch": 0.34284182545178365, + "grad_norm": 0.45006346702575684, + "learning_rate": 7.645572685267619e-05, + "loss": 0.9837, + "step": 3837 + }, + { + "epoch": 0.34293117698304554, + "grad_norm": 0.5554122924804688, + "learning_rate": 7.644344661198615e-05, + "loss": 0.9685, + "step": 3838 + }, + { + "epoch": 0.3430205285143074, + "grad_norm": 0.4323464035987854, + "learning_rate": 7.643116415637559e-05, + "loss": 1.0146, + "step": 3839 + }, + { + "epoch": 0.3431098800455693, + "grad_norm": 0.5363249182701111, + "learning_rate": 7.64188794868733e-05, + "loss": 0.9732, + "step": 3840 + }, + { + "epoch": 0.34319923157683113, + "grad_norm": 0.4187726378440857, + "learning_rate": 7.640659260450823e-05, + "loss": 0.9552, + "step": 3841 + }, + { + "epoch": 0.343288583108093, + "grad_norm": 0.5245155692100525, + "learning_rate": 7.639430351030958e-05, + "loss": 0.8949, + "step": 3842 + }, + { + "epoch": 0.3433779346393549, + "grad_norm": 0.4653611481189728, + "learning_rate": 7.638201220530665e-05, + "loss": 0.9063, + "step": 3843 + }, + { + "epoch": 0.3434672861706167, + "grad_norm": 0.42201900482177734, + "learning_rate": 7.636971869052899e-05, + "loss": 0.9947, + "step": 3844 + }, + { + "epoch": 0.3435566377018786, + "grad_norm": 0.5015077590942383, + "learning_rate": 7.63574229670063e-05, + "loss": 0.9122, + "step": 3845 + }, + { + "epoch": 0.3436459892331405, + "grad_norm": 0.40728330612182617, + "learning_rate": 7.63451250357685e-05, + "loss": 0.9819, + "step": 3846 + }, + { + "epoch": 0.3437353407644024, + "grad_norm": 0.508651852607727, + "learning_rate": 7.633282489784564e-05, + "loss": 0.9947, + "step": 3847 + }, + { + "epoch": 0.3438246922956642, + "grad_norm": 0.47697994112968445, + "learning_rate": 7.632052255426803e-05, + "loss": 0.9991, + "step": 3848 + }, + { + "epoch": 0.3439140438269261, + "grad_norm": 0.566102147102356, + "learning_rate": 7.63082180060661e-05, + "loss": 0.9842, + "step": 3849 + }, + { + "epoch": 0.34400339535818797, + "grad_norm": 0.4584532380104065, + "learning_rate": 7.629591125427047e-05, + "loss": 0.9289, + "step": 3850 + }, + { + "epoch": 0.3440927468894498, + "grad_norm": 0.42579421401023865, + "learning_rate": 7.628360229991199e-05, + "loss": 0.9618, + "step": 3851 + }, + { + "epoch": 0.3441820984207117, + "grad_norm": 0.4023285508155823, + "learning_rate": 7.627129114402164e-05, + "loss": 1.0386, + "step": 3852 + }, + { + "epoch": 0.34427144995197356, + "grad_norm": 0.5076940655708313, + "learning_rate": 7.625897778763062e-05, + "loss": 0.9791, + "step": 3853 + }, + { + "epoch": 0.34436080148323545, + "grad_norm": 0.4704766273498535, + "learning_rate": 7.624666223177033e-05, + "loss": 0.9195, + "step": 3854 + }, + { + "epoch": 0.3444501530144973, + "grad_norm": 0.4094794690608978, + "learning_rate": 7.62343444774723e-05, + "loss": 0.9469, + "step": 3855 + }, + { + "epoch": 0.34453950454575916, + "grad_norm": 0.41486209630966187, + "learning_rate": 7.62220245257683e-05, + "loss": 1.0922, + "step": 3856 + }, + { + "epoch": 0.34462885607702104, + "grad_norm": 0.45725998282432556, + "learning_rate": 7.620970237769022e-05, + "loss": 0.9506, + "step": 3857 + }, + { + "epoch": 0.34471820760828287, + "grad_norm": 0.416765034198761, + "learning_rate": 7.619737803427019e-05, + "loss": 0.9818, + "step": 3858 + }, + { + "epoch": 0.34480755913954475, + "grad_norm": 0.3775465190410614, + "learning_rate": 7.618505149654051e-05, + "loss": 0.9946, + "step": 3859 + }, + { + "epoch": 0.34489691067080663, + "grad_norm": 0.43782955408096313, + "learning_rate": 7.617272276553366e-05, + "loss": 0.983, + "step": 3860 + }, + { + "epoch": 0.34498626220206846, + "grad_norm": 0.40604960918426514, + "learning_rate": 7.61603918422823e-05, + "loss": 1.008, + "step": 3861 + }, + { + "epoch": 0.34507561373333034, + "grad_norm": 0.4615291357040405, + "learning_rate": 7.614805872781926e-05, + "loss": 1.0312, + "step": 3862 + }, + { + "epoch": 0.3451649652645922, + "grad_norm": 0.41947072744369507, + "learning_rate": 7.613572342317758e-05, + "loss": 0.9949, + "step": 3863 + }, + { + "epoch": 0.3452543167958541, + "grad_norm": 0.4784375727176666, + "learning_rate": 7.612338592939049e-05, + "loss": 0.9847, + "step": 3864 + }, + { + "epoch": 0.34534366832711594, + "grad_norm": 0.38383907079696655, + "learning_rate": 7.611104624749137e-05, + "loss": 1.0202, + "step": 3865 + }, + { + "epoch": 0.3454330198583778, + "grad_norm": 0.4683041572570801, + "learning_rate": 7.609870437851381e-05, + "loss": 1.0224, + "step": 3866 + }, + { + "epoch": 0.3455223713896397, + "grad_norm": 0.35911422967910767, + "learning_rate": 7.608636032349155e-05, + "loss": 1.0381, + "step": 3867 + }, + { + "epoch": 0.34561172292090153, + "grad_norm": 0.4072626829147339, + "learning_rate": 7.607401408345855e-05, + "loss": 1.119, + "step": 3868 + }, + { + "epoch": 0.3457010744521634, + "grad_norm": 0.4566546082496643, + "learning_rate": 7.606166565944895e-05, + "loss": 0.9207, + "step": 3869 + }, + { + "epoch": 0.3457904259834253, + "grad_norm": 0.41566282510757446, + "learning_rate": 7.604931505249706e-05, + "loss": 0.9994, + "step": 3870 + }, + { + "epoch": 0.3458797775146872, + "grad_norm": 0.46840545535087585, + "learning_rate": 7.603696226363737e-05, + "loss": 1.0202, + "step": 3871 + }, + { + "epoch": 0.345969129045949, + "grad_norm": 0.5090840458869934, + "learning_rate": 7.602460729390455e-05, + "loss": 0.8464, + "step": 3872 + }, + { + "epoch": 0.3460584805772109, + "grad_norm": 0.39953356981277466, + "learning_rate": 7.601225014433346e-05, + "loss": 1.0418, + "step": 3873 + }, + { + "epoch": 0.3461478321084728, + "grad_norm": 0.48468056321144104, + "learning_rate": 7.599989081595915e-05, + "loss": 0.9746, + "step": 3874 + }, + { + "epoch": 0.3462371836397346, + "grad_norm": 0.45679304003715515, + "learning_rate": 7.598752930981686e-05, + "loss": 0.9614, + "step": 3875 + }, + { + "epoch": 0.3463265351709965, + "grad_norm": 0.39945322275161743, + "learning_rate": 7.597516562694197e-05, + "loss": 1.0286, + "step": 3876 + }, + { + "epoch": 0.34641588670225837, + "grad_norm": 0.37428709864616394, + "learning_rate": 7.59627997683701e-05, + "loss": 1.0062, + "step": 3877 + }, + { + "epoch": 0.34650523823352025, + "grad_norm": 0.36209729313850403, + "learning_rate": 7.595043173513698e-05, + "loss": 1.0071, + "step": 3878 + }, + { + "epoch": 0.3465945897647821, + "grad_norm": 0.3938021659851074, + "learning_rate": 7.593806152827861e-05, + "loss": 0.9865, + "step": 3879 + }, + { + "epoch": 0.34668394129604396, + "grad_norm": 0.3960947096347809, + "learning_rate": 7.592568914883112e-05, + "loss": 0.9646, + "step": 3880 + }, + { + "epoch": 0.34677329282730585, + "grad_norm": 0.4837634563446045, + "learning_rate": 7.591331459783078e-05, + "loss": 0.9538, + "step": 3881 + }, + { + "epoch": 0.3468626443585677, + "grad_norm": 0.44596779346466064, + "learning_rate": 7.590093787631414e-05, + "loss": 0.9657, + "step": 3882 + }, + { + "epoch": 0.34695199588982956, + "grad_norm": 0.39485910534858704, + "learning_rate": 7.588855898531787e-05, + "loss": 1.047, + "step": 3883 + }, + { + "epoch": 0.34704134742109144, + "grad_norm": 0.4514664113521576, + "learning_rate": 7.587617792587884e-05, + "loss": 1.0095, + "step": 3884 + }, + { + "epoch": 0.3471306989523533, + "grad_norm": 0.46183550357818604, + "learning_rate": 7.586379469903408e-05, + "loss": 0.8961, + "step": 3885 + }, + { + "epoch": 0.34722005048361515, + "grad_norm": 0.4102190136909485, + "learning_rate": 7.585140930582085e-05, + "loss": 0.8931, + "step": 3886 + }, + { + "epoch": 0.34730940201487703, + "grad_norm": 0.42231065034866333, + "learning_rate": 7.583902174727651e-05, + "loss": 0.9373, + "step": 3887 + }, + { + "epoch": 0.3473987535461389, + "grad_norm": 0.39157602190971375, + "learning_rate": 7.582663202443867e-05, + "loss": 1.0546, + "step": 3888 + }, + { + "epoch": 0.34748810507740074, + "grad_norm": 0.46377474069595337, + "learning_rate": 7.581424013834511e-05, + "loss": 0.928, + "step": 3889 + }, + { + "epoch": 0.3475774566086626, + "grad_norm": 0.3647017776966095, + "learning_rate": 7.580184609003378e-05, + "loss": 1.0289, + "step": 3890 + }, + { + "epoch": 0.3476668081399245, + "grad_norm": 0.46764108538627625, + "learning_rate": 7.578944988054281e-05, + "loss": 0.9646, + "step": 3891 + }, + { + "epoch": 0.34775615967118634, + "grad_norm": 0.37693339586257935, + "learning_rate": 7.577705151091053e-05, + "loss": 1.0112, + "step": 3892 + }, + { + "epoch": 0.3478455112024482, + "grad_norm": 0.39555221796035767, + "learning_rate": 7.576465098217542e-05, + "loss": 0.9854, + "step": 3893 + }, + { + "epoch": 0.3479348627337101, + "grad_norm": 0.5179402828216553, + "learning_rate": 7.575224829537615e-05, + "loss": 0.9444, + "step": 3894 + }, + { + "epoch": 0.348024214264972, + "grad_norm": 0.42474082112312317, + "learning_rate": 7.573984345155159e-05, + "loss": 1.0315, + "step": 3895 + }, + { + "epoch": 0.3481135657962338, + "grad_norm": 0.46355873346328735, + "learning_rate": 7.572743645174077e-05, + "loss": 0.9543, + "step": 3896 + }, + { + "epoch": 0.3482029173274957, + "grad_norm": 0.4335034489631653, + "learning_rate": 7.571502729698293e-05, + "loss": 0.9763, + "step": 3897 + }, + { + "epoch": 0.3482922688587576, + "grad_norm": 0.4546475112438202, + "learning_rate": 7.570261598831743e-05, + "loss": 0.9754, + "step": 3898 + }, + { + "epoch": 0.3483816203900194, + "grad_norm": 0.4445870518684387, + "learning_rate": 7.569020252678387e-05, + "loss": 0.9814, + "step": 3899 + }, + { + "epoch": 0.3484709719212813, + "grad_norm": 0.39527517557144165, + "learning_rate": 7.567778691342203e-05, + "loss": 1.0592, + "step": 3900 + }, + { + "epoch": 0.3485603234525432, + "grad_norm": 0.442655473947525, + "learning_rate": 7.566536914927181e-05, + "loss": 0.9154, + "step": 3901 + }, + { + "epoch": 0.34864967498380506, + "grad_norm": 0.40641334652900696, + "learning_rate": 7.565294923537336e-05, + "loss": 0.9864, + "step": 3902 + }, + { + "epoch": 0.3487390265150669, + "grad_norm": 0.42094290256500244, + "learning_rate": 7.564052717276696e-05, + "loss": 0.9796, + "step": 3903 + }, + { + "epoch": 0.34882837804632877, + "grad_norm": 0.4103735089302063, + "learning_rate": 7.56281029624931e-05, + "loss": 1.0549, + "step": 3904 + }, + { + "epoch": 0.34891772957759065, + "grad_norm": 0.5410390496253967, + "learning_rate": 7.561567660559246e-05, + "loss": 0.9967, + "step": 3905 + }, + { + "epoch": 0.3490070811088525, + "grad_norm": 0.4145830273628235, + "learning_rate": 7.560324810310586e-05, + "loss": 0.9267, + "step": 3906 + }, + { + "epoch": 0.34909643264011436, + "grad_norm": 0.47264495491981506, + "learning_rate": 7.559081745607431e-05, + "loss": 0.9579, + "step": 3907 + }, + { + "epoch": 0.34918578417137625, + "grad_norm": 0.5321443676948547, + "learning_rate": 7.557838466553902e-05, + "loss": 0.9407, + "step": 3908 + }, + { + "epoch": 0.34927513570263813, + "grad_norm": 0.465622216463089, + "learning_rate": 7.556594973254136e-05, + "loss": 0.9588, + "step": 3909 + }, + { + "epoch": 0.34936448723389996, + "grad_norm": 0.463840514421463, + "learning_rate": 7.555351265812292e-05, + "loss": 0.9034, + "step": 3910 + }, + { + "epoch": 0.34945383876516184, + "grad_norm": 0.4932054281234741, + "learning_rate": 7.55410734433254e-05, + "loss": 0.9503, + "step": 3911 + }, + { + "epoch": 0.3495431902964237, + "grad_norm": 0.3836749494075775, + "learning_rate": 7.552863208919073e-05, + "loss": 0.9432, + "step": 3912 + }, + { + "epoch": 0.34963254182768555, + "grad_norm": 0.43824368715286255, + "learning_rate": 7.551618859676101e-05, + "loss": 1.047, + "step": 3913 + }, + { + "epoch": 0.34972189335894743, + "grad_norm": 0.5028628706932068, + "learning_rate": 7.550374296707851e-05, + "loss": 0.9571, + "step": 3914 + }, + { + "epoch": 0.3498112448902093, + "grad_norm": 0.4159904718399048, + "learning_rate": 7.54912952011857e-05, + "loss": 0.9978, + "step": 3915 + }, + { + "epoch": 0.3499005964214712, + "grad_norm": 0.42895132303237915, + "learning_rate": 7.547884530012517e-05, + "loss": 0.9415, + "step": 3916 + }, + { + "epoch": 0.349989947952733, + "grad_norm": 0.4712909162044525, + "learning_rate": 7.546639326493978e-05, + "loss": 0.971, + "step": 3917 + }, + { + "epoch": 0.3500792994839949, + "grad_norm": 0.41875845193862915, + "learning_rate": 7.545393909667249e-05, + "loss": 1.0202, + "step": 3918 + }, + { + "epoch": 0.3501686510152568, + "grad_norm": 0.32883596420288086, + "learning_rate": 7.54414827963665e-05, + "loss": 1.0405, + "step": 3919 + }, + { + "epoch": 0.3502580025465186, + "grad_norm": 0.5156275033950806, + "learning_rate": 7.542902436506514e-05, + "loss": 0.8699, + "step": 3920 + }, + { + "epoch": 0.3503473540777805, + "grad_norm": 0.3887585997581482, + "learning_rate": 7.541656380381192e-05, + "loss": 0.9707, + "step": 3921 + }, + { + "epoch": 0.3504367056090424, + "grad_norm": 0.4152442216873169, + "learning_rate": 7.540410111365055e-05, + "loss": 0.9871, + "step": 3922 + }, + { + "epoch": 0.3505260571403042, + "grad_norm": 0.4276559054851532, + "learning_rate": 7.539163629562494e-05, + "loss": 0.9426, + "step": 3923 + }, + { + "epoch": 0.3506154086715661, + "grad_norm": 0.5117713212966919, + "learning_rate": 7.537916935077914e-05, + "loss": 0.9238, + "step": 3924 + }, + { + "epoch": 0.350704760202828, + "grad_norm": 0.4770560562610626, + "learning_rate": 7.536670028015737e-05, + "loss": 0.9661, + "step": 3925 + }, + { + "epoch": 0.35079411173408986, + "grad_norm": 0.49051031470298767, + "learning_rate": 7.535422908480408e-05, + "loss": 1.0228, + "step": 3926 + }, + { + "epoch": 0.3508834632653517, + "grad_norm": 0.3898465037345886, + "learning_rate": 7.534175576576384e-05, + "loss": 1.01, + "step": 3927 + }, + { + "epoch": 0.3509728147966136, + "grad_norm": 0.48158106207847595, + "learning_rate": 7.532928032408142e-05, + "loss": 1.0054, + "step": 3928 + }, + { + "epoch": 0.35106216632787546, + "grad_norm": 0.43918222188949585, + "learning_rate": 7.53168027608018e-05, + "loss": 0.9429, + "step": 3929 + }, + { + "epoch": 0.3511515178591373, + "grad_norm": 0.4521641433238983, + "learning_rate": 7.530432307697007e-05, + "loss": 0.8965, + "step": 3930 + }, + { + "epoch": 0.35124086939039917, + "grad_norm": 0.4660329520702362, + "learning_rate": 7.529184127363158e-05, + "loss": 0.9271, + "step": 3931 + }, + { + "epoch": 0.35133022092166105, + "grad_norm": 0.4383411705493927, + "learning_rate": 7.527935735183177e-05, + "loss": 1.0393, + "step": 3932 + }, + { + "epoch": 0.35141957245292293, + "grad_norm": 0.36236387491226196, + "learning_rate": 7.526687131261634e-05, + "loss": 1.0835, + "step": 3933 + }, + { + "epoch": 0.35150892398418476, + "grad_norm": 0.3911649286746979, + "learning_rate": 7.52543831570311e-05, + "loss": 1.0008, + "step": 3934 + }, + { + "epoch": 0.35159827551544665, + "grad_norm": 0.43021535873413086, + "learning_rate": 7.524189288612209e-05, + "loss": 1.0463, + "step": 3935 + }, + { + "epoch": 0.35168762704670853, + "grad_norm": 0.5328584909439087, + "learning_rate": 7.522940050093547e-05, + "loss": 1.0456, + "step": 3936 + }, + { + "epoch": 0.35177697857797036, + "grad_norm": 0.41291365027427673, + "learning_rate": 7.521690600251766e-05, + "loss": 0.9843, + "step": 3937 + }, + { + "epoch": 0.35186633010923224, + "grad_norm": 0.44431254267692566, + "learning_rate": 7.520440939191515e-05, + "loss": 0.9654, + "step": 3938 + }, + { + "epoch": 0.3519556816404941, + "grad_norm": 0.40238481760025024, + "learning_rate": 7.519191067017472e-05, + "loss": 1.0108, + "step": 3939 + }, + { + "epoch": 0.352045033171756, + "grad_norm": 0.41174423694610596, + "learning_rate": 7.517940983834323e-05, + "loss": 1.0082, + "step": 3940 + }, + { + "epoch": 0.35213438470301783, + "grad_norm": 0.44105100631713867, + "learning_rate": 7.516690689746779e-05, + "loss": 0.9419, + "step": 3941 + }, + { + "epoch": 0.3522237362342797, + "grad_norm": 0.4259895980358124, + "learning_rate": 7.515440184859561e-05, + "loss": 1.0009, + "step": 3942 + }, + { + "epoch": 0.3523130877655416, + "grad_norm": 0.45526599884033203, + "learning_rate": 7.514189469277418e-05, + "loss": 1.019, + "step": 3943 + }, + { + "epoch": 0.3524024392968034, + "grad_norm": 0.4207375943660736, + "learning_rate": 7.512938543105105e-05, + "loss": 0.9647, + "step": 3944 + }, + { + "epoch": 0.3524917908280653, + "grad_norm": 0.4691575765609741, + "learning_rate": 7.511687406447406e-05, + "loss": 0.984, + "step": 3945 + }, + { + "epoch": 0.3525811423593272, + "grad_norm": 0.4423462152481079, + "learning_rate": 7.51043605940911e-05, + "loss": 0.9842, + "step": 3946 + }, + { + "epoch": 0.3526704938905891, + "grad_norm": 0.44663453102111816, + "learning_rate": 7.509184502095038e-05, + "loss": 0.9933, + "step": 3947 + }, + { + "epoch": 0.3527598454218509, + "grad_norm": 0.4556397795677185, + "learning_rate": 7.507932734610017e-05, + "loss": 0.9362, + "step": 3948 + }, + { + "epoch": 0.3528491969531128, + "grad_norm": 0.3916218876838684, + "learning_rate": 7.506680757058896e-05, + "loss": 1.0302, + "step": 3949 + }, + { + "epoch": 0.35293854848437467, + "grad_norm": 0.39909112453460693, + "learning_rate": 7.505428569546542e-05, + "loss": 1.0345, + "step": 3950 + }, + { + "epoch": 0.3530279000156365, + "grad_norm": 0.4075121283531189, + "learning_rate": 7.504176172177841e-05, + "loss": 0.9996, + "step": 3951 + }, + { + "epoch": 0.3531172515468984, + "grad_norm": 0.46926334500312805, + "learning_rate": 7.502923565057692e-05, + "loss": 0.9457, + "step": 3952 + }, + { + "epoch": 0.35320660307816026, + "grad_norm": 0.4146152436733246, + "learning_rate": 7.501670748291016e-05, + "loss": 1.0144, + "step": 3953 + }, + { + "epoch": 0.3532959546094221, + "grad_norm": 0.4583793878555298, + "learning_rate": 7.500417721982748e-05, + "loss": 0.9683, + "step": 3954 + }, + { + "epoch": 0.353385306140684, + "grad_norm": 0.42835697531700134, + "learning_rate": 7.499164486237844e-05, + "loss": 0.9714, + "step": 3955 + }, + { + "epoch": 0.35347465767194586, + "grad_norm": 0.4047943949699402, + "learning_rate": 7.497911041161274e-05, + "loss": 1.0431, + "step": 3956 + }, + { + "epoch": 0.35356400920320774, + "grad_norm": 0.3960307836532593, + "learning_rate": 7.496657386858029e-05, + "loss": 0.9681, + "step": 3957 + }, + { + "epoch": 0.35365336073446957, + "grad_norm": 0.37818729877471924, + "learning_rate": 7.495403523433116e-05, + "loss": 1.0108, + "step": 3958 + }, + { + "epoch": 0.35374271226573145, + "grad_norm": 0.44863516092300415, + "learning_rate": 7.494149450991557e-05, + "loss": 0.9547, + "step": 3959 + }, + { + "epoch": 0.35383206379699333, + "grad_norm": 0.38477665185928345, + "learning_rate": 7.492895169638397e-05, + "loss": 1.0091, + "step": 3960 + }, + { + "epoch": 0.35392141532825516, + "grad_norm": 0.45500656962394714, + "learning_rate": 7.491640679478696e-05, + "loss": 0.9557, + "step": 3961 + }, + { + "epoch": 0.35401076685951705, + "grad_norm": 0.3944813013076782, + "learning_rate": 7.490385980617527e-05, + "loss": 1.0622, + "step": 3962 + }, + { + "epoch": 0.35410011839077893, + "grad_norm": 0.3850362300872803, + "learning_rate": 7.489131073159987e-05, + "loss": 0.96, + "step": 3963 + }, + { + "epoch": 0.3541894699220408, + "grad_norm": 0.402127742767334, + "learning_rate": 7.487875957211188e-05, + "loss": 0.8966, + "step": 3964 + }, + { + "epoch": 0.35427882145330264, + "grad_norm": 0.39359238743782043, + "learning_rate": 7.486620632876257e-05, + "loss": 1.0262, + "step": 3965 + }, + { + "epoch": 0.3543681729845645, + "grad_norm": 0.4493246078491211, + "learning_rate": 7.485365100260345e-05, + "loss": 0.9622, + "step": 3966 + }, + { + "epoch": 0.3544575245158264, + "grad_norm": 0.38160157203674316, + "learning_rate": 7.484109359468612e-05, + "loss": 0.9997, + "step": 3967 + }, + { + "epoch": 0.35454687604708823, + "grad_norm": 0.5229949355125427, + "learning_rate": 7.482853410606242e-05, + "loss": 0.8508, + "step": 3968 + }, + { + "epoch": 0.3546362275783501, + "grad_norm": 0.41521862149238586, + "learning_rate": 7.481597253778434e-05, + "loss": 1.0109, + "step": 3969 + }, + { + "epoch": 0.354725579109612, + "grad_norm": 0.4313722252845764, + "learning_rate": 7.480340889090403e-05, + "loss": 0.9793, + "step": 3970 + }, + { + "epoch": 0.3548149306408739, + "grad_norm": 0.47400006651878357, + "learning_rate": 7.479084316647385e-05, + "loss": 1.0458, + "step": 3971 + }, + { + "epoch": 0.3549042821721357, + "grad_norm": 0.44894692301750183, + "learning_rate": 7.477827536554629e-05, + "loss": 0.9617, + "step": 3972 + }, + { + "epoch": 0.3549936337033976, + "grad_norm": 0.40421560406684875, + "learning_rate": 7.476570548917406e-05, + "loss": 1.0115, + "step": 3973 + }, + { + "epoch": 0.3550829852346595, + "grad_norm": 0.4186607897281647, + "learning_rate": 7.475313353841e-05, + "loss": 1.0278, + "step": 3974 + }, + { + "epoch": 0.3551723367659213, + "grad_norm": 0.36971282958984375, + "learning_rate": 7.474055951430717e-05, + "loss": 0.9399, + "step": 3975 + }, + { + "epoch": 0.3552616882971832, + "grad_norm": 0.4785158634185791, + "learning_rate": 7.472798341791877e-05, + "loss": 0.9303, + "step": 3976 + }, + { + "epoch": 0.35535103982844507, + "grad_norm": 0.38244813680648804, + "learning_rate": 7.471540525029817e-05, + "loss": 0.9782, + "step": 3977 + }, + { + "epoch": 0.35544039135970695, + "grad_norm": 0.5407021641731262, + "learning_rate": 7.470282501249893e-05, + "loss": 0.948, + "step": 3978 + }, + { + "epoch": 0.3555297428909688, + "grad_norm": 0.4248984158039093, + "learning_rate": 7.469024270557477e-05, + "loss": 1.0419, + "step": 3979 + }, + { + "epoch": 0.35561909442223066, + "grad_norm": 0.38939395546913147, + "learning_rate": 7.467765833057964e-05, + "loss": 1.0081, + "step": 3980 + }, + { + "epoch": 0.35570844595349255, + "grad_norm": 0.45824161171913147, + "learning_rate": 7.466507188856755e-05, + "loss": 0.9602, + "step": 3981 + }, + { + "epoch": 0.3557977974847544, + "grad_norm": 0.45186036825180054, + "learning_rate": 7.46524833805928e-05, + "loss": 0.9459, + "step": 3982 + }, + { + "epoch": 0.35588714901601626, + "grad_norm": 0.4205573499202728, + "learning_rate": 7.463989280770978e-05, + "loss": 1.0347, + "step": 3983 + }, + { + "epoch": 0.35597650054727814, + "grad_norm": 0.5025462508201599, + "learning_rate": 7.462730017097308e-05, + "loss": 0.9495, + "step": 3984 + }, + { + "epoch": 0.35606585207853997, + "grad_norm": 0.45681893825531006, + "learning_rate": 7.46147054714375e-05, + "loss": 0.9172, + "step": 3985 + }, + { + "epoch": 0.35615520360980185, + "grad_norm": 0.43965548276901245, + "learning_rate": 7.460210871015796e-05, + "loss": 1.0745, + "step": 3986 + }, + { + "epoch": 0.35624455514106373, + "grad_norm": 0.4385167062282562, + "learning_rate": 7.458950988818957e-05, + "loss": 1.0192, + "step": 3987 + }, + { + "epoch": 0.3563339066723256, + "grad_norm": 0.4278493821620941, + "learning_rate": 7.457690900658762e-05, + "loss": 0.9996, + "step": 3988 + }, + { + "epoch": 0.35642325820358745, + "grad_norm": 0.45512884855270386, + "learning_rate": 7.456430606640757e-05, + "loss": 0.9825, + "step": 3989 + }, + { + "epoch": 0.35651260973484933, + "grad_norm": 0.4201413094997406, + "learning_rate": 7.455170106870505e-05, + "loss": 0.949, + "step": 3990 + }, + { + "epoch": 0.3566019612661112, + "grad_norm": 0.39953503012657166, + "learning_rate": 7.453909401453589e-05, + "loss": 0.9882, + "step": 3991 + }, + { + "epoch": 0.35669131279737304, + "grad_norm": 0.4012105464935303, + "learning_rate": 7.452648490495602e-05, + "loss": 0.9762, + "step": 3992 + }, + { + "epoch": 0.3567806643286349, + "grad_norm": 0.39870861172676086, + "learning_rate": 7.451387374102159e-05, + "loss": 0.9714, + "step": 3993 + }, + { + "epoch": 0.3568700158598968, + "grad_norm": 0.3774680197238922, + "learning_rate": 7.450126052378894e-05, + "loss": 0.9913, + "step": 3994 + }, + { + "epoch": 0.3569593673911587, + "grad_norm": 0.4141833186149597, + "learning_rate": 7.448864525431457e-05, + "loss": 0.9495, + "step": 3995 + }, + { + "epoch": 0.3570487189224205, + "grad_norm": 0.44811514019966125, + "learning_rate": 7.447602793365514e-05, + "loss": 1.0848, + "step": 3996 + }, + { + "epoch": 0.3571380704536824, + "grad_norm": 0.44921422004699707, + "learning_rate": 7.446340856286744e-05, + "loss": 1.0281, + "step": 3997 + }, + { + "epoch": 0.3572274219849443, + "grad_norm": 0.4648860991001129, + "learning_rate": 7.445078714300855e-05, + "loss": 1.0197, + "step": 3998 + }, + { + "epoch": 0.3573167735162061, + "grad_norm": 0.4424198865890503, + "learning_rate": 7.443816367513559e-05, + "loss": 1.0069, + "step": 3999 + }, + { + "epoch": 0.357406125047468, + "grad_norm": 0.5149994492530823, + "learning_rate": 7.442553816030592e-05, + "loss": 0.9001, + "step": 4000 + }, + { + "epoch": 0.3574954765787299, + "grad_norm": 0.3967181444168091, + "learning_rate": 7.441291059957708e-05, + "loss": 1.0558, + "step": 4001 + }, + { + "epoch": 0.35758482810999176, + "grad_norm": 0.4318191707134247, + "learning_rate": 7.440028099400677e-05, + "loss": 0.9383, + "step": 4002 + }, + { + "epoch": 0.3576741796412536, + "grad_norm": 0.4305358827114105, + "learning_rate": 7.438764934465283e-05, + "loss": 1.0499, + "step": 4003 + }, + { + "epoch": 0.35776353117251547, + "grad_norm": 0.38480517268180847, + "learning_rate": 7.437501565257329e-05, + "loss": 0.9881, + "step": 4004 + }, + { + "epoch": 0.35785288270377735, + "grad_norm": 0.5233545303344727, + "learning_rate": 7.436237991882637e-05, + "loss": 0.969, + "step": 4005 + }, + { + "epoch": 0.3579422342350392, + "grad_norm": 0.5495209693908691, + "learning_rate": 7.434974214447047e-05, + "loss": 0.9978, + "step": 4006 + }, + { + "epoch": 0.35803158576630106, + "grad_norm": 0.46904364228248596, + "learning_rate": 7.43371023305641e-05, + "loss": 0.9741, + "step": 4007 + }, + { + "epoch": 0.35812093729756295, + "grad_norm": 0.4199860990047455, + "learning_rate": 7.432446047816599e-05, + "loss": 0.9811, + "step": 4008 + }, + { + "epoch": 0.35821028882882483, + "grad_norm": 0.3669317364692688, + "learning_rate": 7.431181658833504e-05, + "loss": 1.07, + "step": 4009 + }, + { + "epoch": 0.35829964036008666, + "grad_norm": 0.41901111602783203, + "learning_rate": 7.42991706621303e-05, + "loss": 0.896, + "step": 4010 + }, + { + "epoch": 0.35838899189134854, + "grad_norm": 0.47777092456817627, + "learning_rate": 7.428652270061102e-05, + "loss": 1.0107, + "step": 4011 + }, + { + "epoch": 0.3584783434226104, + "grad_norm": 0.4377864897251129, + "learning_rate": 7.427387270483659e-05, + "loss": 0.9397, + "step": 4012 + }, + { + "epoch": 0.35856769495387225, + "grad_norm": 0.45756855607032776, + "learning_rate": 7.426122067586656e-05, + "loss": 0.9688, + "step": 4013 + }, + { + "epoch": 0.35865704648513413, + "grad_norm": 0.4156295359134674, + "learning_rate": 7.424856661476071e-05, + "loss": 1.0963, + "step": 4014 + }, + { + "epoch": 0.358746398016396, + "grad_norm": 0.43841129541397095, + "learning_rate": 7.423591052257893e-05, + "loss": 0.9589, + "step": 4015 + }, + { + "epoch": 0.3588357495476579, + "grad_norm": 0.35517293214797974, + "learning_rate": 7.42232524003813e-05, + "loss": 1.0089, + "step": 4016 + }, + { + "epoch": 0.35892510107891973, + "grad_norm": 0.5077205300331116, + "learning_rate": 7.42105922492281e-05, + "loss": 0.9804, + "step": 4017 + }, + { + "epoch": 0.3590144526101816, + "grad_norm": 0.480881929397583, + "learning_rate": 7.419793007017972e-05, + "loss": 1.0214, + "step": 4018 + }, + { + "epoch": 0.3591038041414435, + "grad_norm": 0.4301677942276001, + "learning_rate": 7.418526586429676e-05, + "loss": 1.0346, + "step": 4019 + }, + { + "epoch": 0.3591931556727053, + "grad_norm": 0.4578787684440613, + "learning_rate": 7.417259963263999e-05, + "loss": 0.9741, + "step": 4020 + }, + { + "epoch": 0.3592825072039672, + "grad_norm": 0.5144799947738647, + "learning_rate": 7.415993137627036e-05, + "loss": 0.9486, + "step": 4021 + }, + { + "epoch": 0.3593718587352291, + "grad_norm": 0.37972408533096313, + "learning_rate": 7.414726109624892e-05, + "loss": 1.0567, + "step": 4022 + }, + { + "epoch": 0.3594612102664909, + "grad_norm": 0.40619876980781555, + "learning_rate": 7.413458879363698e-05, + "loss": 0.9922, + "step": 4023 + }, + { + "epoch": 0.3595505617977528, + "grad_norm": 0.5421142578125, + "learning_rate": 7.412191446949598e-05, + "loss": 0.9754, + "step": 4024 + }, + { + "epoch": 0.3596399133290147, + "grad_norm": 0.46210673451423645, + "learning_rate": 7.410923812488752e-05, + "loss": 1.0145, + "step": 4025 + }, + { + "epoch": 0.35972926486027657, + "grad_norm": 0.45515942573547363, + "learning_rate": 7.409655976087339e-05, + "loss": 0.9441, + "step": 4026 + }, + { + "epoch": 0.3598186163915384, + "grad_norm": 0.40571707487106323, + "learning_rate": 7.408387937851551e-05, + "loss": 0.9343, + "step": 4027 + }, + { + "epoch": 0.3599079679228003, + "grad_norm": 0.5472614765167236, + "learning_rate": 7.407119697887602e-05, + "loss": 0.964, + "step": 4028 + }, + { + "epoch": 0.35999731945406216, + "grad_norm": 0.4565272629261017, + "learning_rate": 7.405851256301722e-05, + "loss": 0.9862, + "step": 4029 + }, + { + "epoch": 0.360086670985324, + "grad_norm": 0.5199412703514099, + "learning_rate": 7.404582613200153e-05, + "loss": 0.9162, + "step": 4030 + }, + { + "epoch": 0.36017602251658587, + "grad_norm": 0.38596537709236145, + "learning_rate": 7.403313768689159e-05, + "loss": 1.0374, + "step": 4031 + }, + { + "epoch": 0.36026537404784775, + "grad_norm": 0.4285348653793335, + "learning_rate": 7.402044722875021e-05, + "loss": 0.9533, + "step": 4032 + }, + { + "epoch": 0.36035472557910964, + "grad_norm": 0.4435160756111145, + "learning_rate": 7.400775475864032e-05, + "loss": 0.9475, + "step": 4033 + }, + { + "epoch": 0.36044407711037146, + "grad_norm": 0.40904703736305237, + "learning_rate": 7.399506027762507e-05, + "loss": 0.9486, + "step": 4034 + }, + { + "epoch": 0.36053342864163335, + "grad_norm": 0.42833709716796875, + "learning_rate": 7.398236378676776e-05, + "loss": 0.9862, + "step": 4035 + }, + { + "epoch": 0.36062278017289523, + "grad_norm": 0.44428741931915283, + "learning_rate": 7.396966528713184e-05, + "loss": 0.9491, + "step": 4036 + }, + { + "epoch": 0.36071213170415706, + "grad_norm": 0.37404051423072815, + "learning_rate": 7.395696477978096e-05, + "loss": 1.0652, + "step": 4037 + }, + { + "epoch": 0.36080148323541894, + "grad_norm": 0.5555472373962402, + "learning_rate": 7.394426226577891e-05, + "loss": 0.8922, + "step": 4038 + }, + { + "epoch": 0.3608908347666808, + "grad_norm": 0.40428173542022705, + "learning_rate": 7.393155774618967e-05, + "loss": 0.9652, + "step": 4039 + }, + { + "epoch": 0.3609801862979427, + "grad_norm": 0.5697705149650574, + "learning_rate": 7.391885122207738e-05, + "loss": 0.9272, + "step": 4040 + }, + { + "epoch": 0.36106953782920453, + "grad_norm": 0.38609790802001953, + "learning_rate": 7.390614269450634e-05, + "loss": 0.9702, + "step": 4041 + }, + { + "epoch": 0.3611588893604664, + "grad_norm": 0.401506245136261, + "learning_rate": 7.389343216454103e-05, + "loss": 1.0075, + "step": 4042 + }, + { + "epoch": 0.3612482408917283, + "grad_norm": 0.43169543147087097, + "learning_rate": 7.388071963324609e-05, + "loss": 0.9761, + "step": 4043 + }, + { + "epoch": 0.36133759242299013, + "grad_norm": 0.4852740168571472, + "learning_rate": 7.386800510168632e-05, + "loss": 0.9143, + "step": 4044 + }, + { + "epoch": 0.361426943954252, + "grad_norm": 0.4311829209327698, + "learning_rate": 7.385528857092672e-05, + "loss": 0.9728, + "step": 4045 + }, + { + "epoch": 0.3615162954855139, + "grad_norm": 0.4241611063480377, + "learning_rate": 7.384257004203242e-05, + "loss": 0.9631, + "step": 4046 + }, + { + "epoch": 0.3616056470167758, + "grad_norm": 0.435428649187088, + "learning_rate": 7.382984951606875e-05, + "loss": 0.9813, + "step": 4047 + }, + { + "epoch": 0.3616949985480376, + "grad_norm": 0.4283429980278015, + "learning_rate": 7.381712699410116e-05, + "loss": 0.9696, + "step": 4048 + }, + { + "epoch": 0.3617843500792995, + "grad_norm": 0.43169790506362915, + "learning_rate": 7.380440247719532e-05, + "loss": 0.968, + "step": 4049 + }, + { + "epoch": 0.36187370161056137, + "grad_norm": 0.44879093766212463, + "learning_rate": 7.379167596641702e-05, + "loss": 0.9826, + "step": 4050 + }, + { + "epoch": 0.3619630531418232, + "grad_norm": 0.3845369219779968, + "learning_rate": 7.377894746283227e-05, + "loss": 0.9373, + "step": 4051 + }, + { + "epoch": 0.3620524046730851, + "grad_norm": 0.42470452189445496, + "learning_rate": 7.37662169675072e-05, + "loss": 1.0286, + "step": 4052 + }, + { + "epoch": 0.36214175620434697, + "grad_norm": 0.5062727332115173, + "learning_rate": 7.375348448150814e-05, + "loss": 0.9409, + "step": 4053 + }, + { + "epoch": 0.3622311077356088, + "grad_norm": 0.41713082790374756, + "learning_rate": 7.374075000590155e-05, + "loss": 0.9857, + "step": 4054 + }, + { + "epoch": 0.3623204592668707, + "grad_norm": 0.5149965286254883, + "learning_rate": 7.372801354175409e-05, + "loss": 0.8709, + "step": 4055 + }, + { + "epoch": 0.36240981079813256, + "grad_norm": 0.49608248472213745, + "learning_rate": 7.371527509013257e-05, + "loss": 0.9298, + "step": 4056 + }, + { + "epoch": 0.36249916232939444, + "grad_norm": 0.5133218169212341, + "learning_rate": 7.370253465210398e-05, + "loss": 0.9234, + "step": 4057 + }, + { + "epoch": 0.36258851386065627, + "grad_norm": 0.4780612587928772, + "learning_rate": 7.368979222873547e-05, + "loss": 0.9725, + "step": 4058 + }, + { + "epoch": 0.36267786539191815, + "grad_norm": 0.5128650665283203, + "learning_rate": 7.367704782109433e-05, + "loss": 0.9635, + "step": 4059 + }, + { + "epoch": 0.36276721692318004, + "grad_norm": 0.41586834192276, + "learning_rate": 7.366430143024805e-05, + "loss": 0.9352, + "step": 4060 + }, + { + "epoch": 0.36285656845444186, + "grad_norm": 0.3711674213409424, + "learning_rate": 7.36515530572643e-05, + "loss": 1.0388, + "step": 4061 + }, + { + "epoch": 0.36294591998570375, + "grad_norm": 0.5341949462890625, + "learning_rate": 7.363880270321087e-05, + "loss": 0.9257, + "step": 4062 + }, + { + "epoch": 0.36303527151696563, + "grad_norm": 0.43725699186325073, + "learning_rate": 7.362605036915574e-05, + "loss": 0.9485, + "step": 4063 + }, + { + "epoch": 0.3631246230482275, + "grad_norm": 0.47559094429016113, + "learning_rate": 7.361329605616705e-05, + "loss": 1.0375, + "step": 4064 + }, + { + "epoch": 0.36321397457948934, + "grad_norm": 0.3746141493320465, + "learning_rate": 7.360053976531312e-05, + "loss": 0.9983, + "step": 4065 + }, + { + "epoch": 0.3633033261107512, + "grad_norm": 0.4194895625114441, + "learning_rate": 7.358778149766244e-05, + "loss": 0.9532, + "step": 4066 + }, + { + "epoch": 0.3633926776420131, + "grad_norm": 0.42524465918540955, + "learning_rate": 7.35750212542836e-05, + "loss": 1.0122, + "step": 4067 + }, + { + "epoch": 0.36348202917327493, + "grad_norm": 0.5586007237434387, + "learning_rate": 7.356225903624545e-05, + "loss": 0.9938, + "step": 4068 + }, + { + "epoch": 0.3635713807045368, + "grad_norm": 0.36692580580711365, + "learning_rate": 7.354949484461697e-05, + "loss": 1.0279, + "step": 4069 + }, + { + "epoch": 0.3636607322357987, + "grad_norm": 0.40066802501678467, + "learning_rate": 7.353672868046725e-05, + "loss": 1.0642, + "step": 4070 + }, + { + "epoch": 0.3637500837670606, + "grad_norm": 0.4100087285041809, + "learning_rate": 7.352396054486562e-05, + "loss": 1.0454, + "step": 4071 + }, + { + "epoch": 0.3638394352983224, + "grad_norm": 0.39286187291145325, + "learning_rate": 7.351119043888158e-05, + "loss": 1.0109, + "step": 4072 + }, + { + "epoch": 0.3639287868295843, + "grad_norm": 0.39239948987960815, + "learning_rate": 7.349841836358468e-05, + "loss": 1.0396, + "step": 4073 + }, + { + "epoch": 0.3640181383608462, + "grad_norm": 0.3697894513607025, + "learning_rate": 7.34856443200448e-05, + "loss": 1.02, + "step": 4074 + }, + { + "epoch": 0.364107489892108, + "grad_norm": 0.44793811440467834, + "learning_rate": 7.347286830933187e-05, + "loss": 0.9778, + "step": 4075 + }, + { + "epoch": 0.3641968414233699, + "grad_norm": 0.46395543217658997, + "learning_rate": 7.3460090332516e-05, + "loss": 0.9936, + "step": 4076 + }, + { + "epoch": 0.36428619295463177, + "grad_norm": 0.4393361806869507, + "learning_rate": 7.344731039066752e-05, + "loss": 0.9751, + "step": 4077 + }, + { + "epoch": 0.36437554448589365, + "grad_norm": 0.3656950294971466, + "learning_rate": 7.343452848485683e-05, + "loss": 1.0026, + "step": 4078 + }, + { + "epoch": 0.3644648960171555, + "grad_norm": 0.43100160360336304, + "learning_rate": 7.342174461615461e-05, + "loss": 0.9219, + "step": 4079 + }, + { + "epoch": 0.36455424754841737, + "grad_norm": 0.37089818716049194, + "learning_rate": 7.340895878563162e-05, + "loss": 1.0019, + "step": 4080 + }, + { + "epoch": 0.36464359907967925, + "grad_norm": 0.4475706219673157, + "learning_rate": 7.339617099435881e-05, + "loss": 0.9677, + "step": 4081 + }, + { + "epoch": 0.3647329506109411, + "grad_norm": 0.3892630636692047, + "learning_rate": 7.338338124340728e-05, + "loss": 1.0284, + "step": 4082 + }, + { + "epoch": 0.36482230214220296, + "grad_norm": 0.4408927857875824, + "learning_rate": 7.337058953384834e-05, + "loss": 0.9299, + "step": 4083 + }, + { + "epoch": 0.36491165367346484, + "grad_norm": 0.5656498670578003, + "learning_rate": 7.335779586675341e-05, + "loss": 0.9921, + "step": 4084 + }, + { + "epoch": 0.36500100520472667, + "grad_norm": 0.40620046854019165, + "learning_rate": 7.334500024319409e-05, + "loss": 1.0244, + "step": 4085 + }, + { + "epoch": 0.36509035673598855, + "grad_norm": 0.5039829611778259, + "learning_rate": 7.333220266424217e-05, + "loss": 0.944, + "step": 4086 + }, + { + "epoch": 0.36517970826725044, + "grad_norm": 0.4727097451686859, + "learning_rate": 7.331940313096957e-05, + "loss": 0.8922, + "step": 4087 + }, + { + "epoch": 0.3652690597985123, + "grad_norm": 0.44061312079429626, + "learning_rate": 7.33066016444484e-05, + "loss": 0.9798, + "step": 4088 + }, + { + "epoch": 0.36535841132977415, + "grad_norm": 0.45610326528549194, + "learning_rate": 7.329379820575089e-05, + "loss": 1.004, + "step": 4089 + }, + { + "epoch": 0.36544776286103603, + "grad_norm": 0.4043094515800476, + "learning_rate": 7.32809928159495e-05, + "loss": 1.0491, + "step": 4090 + }, + { + "epoch": 0.3655371143922979, + "grad_norm": 0.4471263885498047, + "learning_rate": 7.32681854761168e-05, + "loss": 0.9727, + "step": 4091 + }, + { + "epoch": 0.36562646592355974, + "grad_norm": 0.5790581703186035, + "learning_rate": 7.325537618732557e-05, + "loss": 0.9889, + "step": 4092 + }, + { + "epoch": 0.3657158174548216, + "grad_norm": 0.4110223650932312, + "learning_rate": 7.324256495064867e-05, + "loss": 1.0602, + "step": 4093 + }, + { + "epoch": 0.3658051689860835, + "grad_norm": 0.5571956634521484, + "learning_rate": 7.322975176715921e-05, + "loss": 1.0389, + "step": 4094 + }, + { + "epoch": 0.3658945205173454, + "grad_norm": 0.3689397871494293, + "learning_rate": 7.321693663793044e-05, + "loss": 0.9932, + "step": 4095 + }, + { + "epoch": 0.3659838720486072, + "grad_norm": 0.37520068883895874, + "learning_rate": 7.320411956403573e-05, + "loss": 1.0419, + "step": 4096 + }, + { + "epoch": 0.3660732235798691, + "grad_norm": 0.5426008105278015, + "learning_rate": 7.319130054654869e-05, + "loss": 0.9726, + "step": 4097 + }, + { + "epoch": 0.366162575111131, + "grad_norm": 0.41847026348114014, + "learning_rate": 7.317847958654303e-05, + "loss": 0.9663, + "step": 4098 + }, + { + "epoch": 0.3662519266423928, + "grad_norm": 0.39576879143714905, + "learning_rate": 7.316565668509262e-05, + "loss": 0.9269, + "step": 4099 + }, + { + "epoch": 0.3663412781736547, + "grad_norm": 0.46187660098075867, + "learning_rate": 7.315283184327156e-05, + "loss": 1.0216, + "step": 4100 + }, + { + "epoch": 0.3664306297049166, + "grad_norm": 0.4226139187812805, + "learning_rate": 7.314000506215402e-05, + "loss": 0.9553, + "step": 4101 + }, + { + "epoch": 0.36651998123617846, + "grad_norm": 0.41681933403015137, + "learning_rate": 7.312717634281441e-05, + "loss": 1.0133, + "step": 4102 + }, + { + "epoch": 0.3666093327674403, + "grad_norm": 0.43406426906585693, + "learning_rate": 7.311434568632725e-05, + "loss": 1.0148, + "step": 4103 + }, + { + "epoch": 0.36669868429870217, + "grad_norm": 0.6190657615661621, + "learning_rate": 7.310151309376728e-05, + "loss": 0.8836, + "step": 4104 + }, + { + "epoch": 0.36678803582996405, + "grad_norm": 0.43368059396743774, + "learning_rate": 7.308867856620933e-05, + "loss": 0.9831, + "step": 4105 + }, + { + "epoch": 0.3668773873612259, + "grad_norm": 0.41560328006744385, + "learning_rate": 7.307584210472844e-05, + "loss": 1.0052, + "step": 4106 + }, + { + "epoch": 0.36696673889248776, + "grad_norm": 0.45990949869155884, + "learning_rate": 7.306300371039983e-05, + "loss": 0.9839, + "step": 4107 + }, + { + "epoch": 0.36705609042374965, + "grad_norm": 0.4657072126865387, + "learning_rate": 7.30501633842988e-05, + "loss": 0.9505, + "step": 4108 + }, + { + "epoch": 0.36714544195501153, + "grad_norm": 0.39380112290382385, + "learning_rate": 7.303732112750089e-05, + "loss": 1.0012, + "step": 4109 + }, + { + "epoch": 0.36723479348627336, + "grad_norm": 0.4520123600959778, + "learning_rate": 7.302447694108177e-05, + "loss": 0.9982, + "step": 4110 + }, + { + "epoch": 0.36732414501753524, + "grad_norm": 0.5289416909217834, + "learning_rate": 7.301163082611729e-05, + "loss": 0.9575, + "step": 4111 + }, + { + "epoch": 0.3674134965487971, + "grad_norm": 0.5018066167831421, + "learning_rate": 7.299878278368345e-05, + "loss": 0.8618, + "step": 4112 + }, + { + "epoch": 0.36750284808005895, + "grad_norm": 0.3886435031890869, + "learning_rate": 7.29859328148564e-05, + "loss": 0.9727, + "step": 4113 + }, + { + "epoch": 0.36759219961132084, + "grad_norm": 0.45843201875686646, + "learning_rate": 7.297308092071245e-05, + "loss": 0.9968, + "step": 4114 + }, + { + "epoch": 0.3676815511425827, + "grad_norm": 0.4753422141075134, + "learning_rate": 7.296022710232812e-05, + "loss": 0.9434, + "step": 4115 + }, + { + "epoch": 0.36777090267384455, + "grad_norm": 0.4317519962787628, + "learning_rate": 7.294737136078001e-05, + "loss": 0.9388, + "step": 4116 + }, + { + "epoch": 0.36786025420510643, + "grad_norm": 0.388031005859375, + "learning_rate": 7.293451369714495e-05, + "loss": 1.079, + "step": 4117 + }, + { + "epoch": 0.3679496057363683, + "grad_norm": 0.42603445053100586, + "learning_rate": 7.292165411249993e-05, + "loss": 0.9291, + "step": 4118 + }, + { + "epoch": 0.3680389572676302, + "grad_norm": 0.4375905990600586, + "learning_rate": 7.290879260792203e-05, + "loss": 0.9836, + "step": 4119 + }, + { + "epoch": 0.368128308798892, + "grad_norm": 0.40802040696144104, + "learning_rate": 7.289592918448856e-05, + "loss": 0.9789, + "step": 4120 + }, + { + "epoch": 0.3682176603301539, + "grad_norm": 0.3883795142173767, + "learning_rate": 7.288306384327696e-05, + "loss": 1.0521, + "step": 4121 + }, + { + "epoch": 0.3683070118614158, + "grad_norm": 0.42812490463256836, + "learning_rate": 7.287019658536486e-05, + "loss": 1.0214, + "step": 4122 + }, + { + "epoch": 0.3683963633926776, + "grad_norm": 0.4556853771209717, + "learning_rate": 7.285732741183003e-05, + "loss": 0.9778, + "step": 4123 + }, + { + "epoch": 0.3684857149239395, + "grad_norm": 0.43323996663093567, + "learning_rate": 7.284445632375035e-05, + "loss": 0.9807, + "step": 4124 + }, + { + "epoch": 0.3685750664552014, + "grad_norm": 0.4459212124347687, + "learning_rate": 7.283158332220397e-05, + "loss": 0.9184, + "step": 4125 + }, + { + "epoch": 0.36866441798646327, + "grad_norm": 0.43171730637550354, + "learning_rate": 7.281870840826912e-05, + "loss": 0.9214, + "step": 4126 + }, + { + "epoch": 0.3687537695177251, + "grad_norm": 0.4195806086063385, + "learning_rate": 7.280583158302421e-05, + "loss": 0.9684, + "step": 4127 + }, + { + "epoch": 0.368843121048987, + "grad_norm": 0.4562622606754303, + "learning_rate": 7.279295284754782e-05, + "loss": 0.9569, + "step": 4128 + }, + { + "epoch": 0.36893247258024886, + "grad_norm": 0.5500267148017883, + "learning_rate": 7.278007220291866e-05, + "loss": 1.0506, + "step": 4129 + }, + { + "epoch": 0.3690218241115107, + "grad_norm": 0.5012460947036743, + "learning_rate": 7.276718965021563e-05, + "loss": 0.9673, + "step": 4130 + }, + { + "epoch": 0.36911117564277257, + "grad_norm": 0.46749043464660645, + "learning_rate": 7.27543051905178e-05, + "loss": 0.9777, + "step": 4131 + }, + { + "epoch": 0.36920052717403445, + "grad_norm": 0.49202150106430054, + "learning_rate": 7.274141882490435e-05, + "loss": 0.9335, + "step": 4132 + }, + { + "epoch": 0.36928987870529634, + "grad_norm": 0.47457700967788696, + "learning_rate": 7.27285305544547e-05, + "loss": 0.9617, + "step": 4133 + }, + { + "epoch": 0.36937923023655816, + "grad_norm": 0.46560418605804443, + "learning_rate": 7.271564038024831e-05, + "loss": 1.0336, + "step": 4134 + }, + { + "epoch": 0.36946858176782005, + "grad_norm": 0.41487690806388855, + "learning_rate": 7.270274830336493e-05, + "loss": 0.9539, + "step": 4135 + }, + { + "epoch": 0.36955793329908193, + "grad_norm": 0.4482000470161438, + "learning_rate": 7.268985432488438e-05, + "loss": 0.9841, + "step": 4136 + }, + { + "epoch": 0.36964728483034376, + "grad_norm": 0.44116854667663574, + "learning_rate": 7.267695844588668e-05, + "loss": 0.9894, + "step": 4137 + }, + { + "epoch": 0.36973663636160564, + "grad_norm": 0.3913993537425995, + "learning_rate": 7.266406066745199e-05, + "loss": 0.9203, + "step": 4138 + }, + { + "epoch": 0.3698259878928675, + "grad_norm": 0.43621063232421875, + "learning_rate": 7.265116099066063e-05, + "loss": 1.0129, + "step": 4139 + }, + { + "epoch": 0.3699153394241294, + "grad_norm": 0.5264103412628174, + "learning_rate": 7.26382594165931e-05, + "loss": 0.9673, + "step": 4140 + }, + { + "epoch": 0.37000469095539124, + "grad_norm": 0.393413782119751, + "learning_rate": 7.262535594633002e-05, + "loss": 1.0037, + "step": 4141 + }, + { + "epoch": 0.3700940424866531, + "grad_norm": 0.5121841430664062, + "learning_rate": 7.261245058095223e-05, + "loss": 0.8443, + "step": 4142 + }, + { + "epoch": 0.370183394017915, + "grad_norm": 0.4309183359146118, + "learning_rate": 7.259954332154066e-05, + "loss": 1.0241, + "step": 4143 + }, + { + "epoch": 0.37027274554917683, + "grad_norm": 0.3869653046131134, + "learning_rate": 7.258663416917645e-05, + "loss": 1.0111, + "step": 4144 + }, + { + "epoch": 0.3703620970804387, + "grad_norm": 0.5161389112472534, + "learning_rate": 7.257372312494088e-05, + "loss": 1.0608, + "step": 4145 + }, + { + "epoch": 0.3704514486117006, + "grad_norm": 0.48384639620780945, + "learning_rate": 7.256081018991536e-05, + "loss": 0.9671, + "step": 4146 + }, + { + "epoch": 0.3705408001429624, + "grad_norm": 0.4092333912849426, + "learning_rate": 7.254789536518151e-05, + "loss": 1.0206, + "step": 4147 + }, + { + "epoch": 0.3706301516742243, + "grad_norm": 0.4120895266532898, + "learning_rate": 7.25349786518211e-05, + "loss": 1.0183, + "step": 4148 + }, + { + "epoch": 0.3707195032054862, + "grad_norm": 0.47029179334640503, + "learning_rate": 7.2522060050916e-05, + "loss": 0.9786, + "step": 4149 + }, + { + "epoch": 0.3708088547367481, + "grad_norm": 0.38169971108436584, + "learning_rate": 7.25091395635483e-05, + "loss": 0.9691, + "step": 4150 + }, + { + "epoch": 0.3708982062680099, + "grad_norm": 0.48329901695251465, + "learning_rate": 7.249621719080025e-05, + "loss": 0.9664, + "step": 4151 + }, + { + "epoch": 0.3709875577992718, + "grad_norm": 0.4072484076023102, + "learning_rate": 7.248329293375422e-05, + "loss": 1.0322, + "step": 4152 + }, + { + "epoch": 0.37107690933053367, + "grad_norm": 0.4380795955657959, + "learning_rate": 7.247036679349274e-05, + "loss": 0.9546, + "step": 4153 + }, + { + "epoch": 0.3711662608617955, + "grad_norm": 0.515524685382843, + "learning_rate": 7.245743877109852e-05, + "loss": 0.9816, + "step": 4154 + }, + { + "epoch": 0.3712556123930574, + "grad_norm": 0.42287132143974304, + "learning_rate": 7.244450886765443e-05, + "loss": 0.9431, + "step": 4155 + }, + { + "epoch": 0.37134496392431926, + "grad_norm": 0.4340779483318329, + "learning_rate": 7.243157708424348e-05, + "loss": 0.9488, + "step": 4156 + }, + { + "epoch": 0.37143431545558114, + "grad_norm": 0.524341344833374, + "learning_rate": 7.241864342194886e-05, + "loss": 0.9756, + "step": 4157 + }, + { + "epoch": 0.37152366698684297, + "grad_norm": 0.6122943758964539, + "learning_rate": 7.240570788185388e-05, + "loss": 0.9382, + "step": 4158 + }, + { + "epoch": 0.37161301851810485, + "grad_norm": 0.5012384653091431, + "learning_rate": 7.239277046504202e-05, + "loss": 0.9809, + "step": 4159 + }, + { + "epoch": 0.37170237004936674, + "grad_norm": 0.3928990364074707, + "learning_rate": 7.237983117259696e-05, + "loss": 0.9839, + "step": 4160 + }, + { + "epoch": 0.37179172158062856, + "grad_norm": 0.4313662350177765, + "learning_rate": 7.236689000560248e-05, + "loss": 1.0517, + "step": 4161 + }, + { + "epoch": 0.37188107311189045, + "grad_norm": 0.3812869191169739, + "learning_rate": 7.235394696514255e-05, + "loss": 1.0109, + "step": 4162 + }, + { + "epoch": 0.37197042464315233, + "grad_norm": 0.3870796263217926, + "learning_rate": 7.23410020523013e-05, + "loss": 1.0234, + "step": 4163 + }, + { + "epoch": 0.3720597761744142, + "grad_norm": 0.4438193142414093, + "learning_rate": 7.232805526816297e-05, + "loss": 0.9647, + "step": 4164 + }, + { + "epoch": 0.37214912770567604, + "grad_norm": 0.375704288482666, + "learning_rate": 7.231510661381202e-05, + "loss": 0.9969, + "step": 4165 + }, + { + "epoch": 0.3722384792369379, + "grad_norm": 0.38540956377983093, + "learning_rate": 7.230215609033301e-05, + "loss": 1.0308, + "step": 4166 + }, + { + "epoch": 0.3723278307681998, + "grad_norm": 0.3983640670776367, + "learning_rate": 7.228920369881073e-05, + "loss": 0.9823, + "step": 4167 + }, + { + "epoch": 0.37241718229946164, + "grad_norm": 0.4259108901023865, + "learning_rate": 7.227624944033006e-05, + "loss": 0.9739, + "step": 4168 + }, + { + "epoch": 0.3725065338307235, + "grad_norm": 0.4795960783958435, + "learning_rate": 7.226329331597604e-05, + "loss": 0.9372, + "step": 4169 + }, + { + "epoch": 0.3725958853619854, + "grad_norm": 0.4027024209499359, + "learning_rate": 7.225033532683388e-05, + "loss": 0.9707, + "step": 4170 + }, + { + "epoch": 0.3726852368932473, + "grad_norm": 0.42690253257751465, + "learning_rate": 7.223737547398898e-05, + "loss": 0.973, + "step": 4171 + }, + { + "epoch": 0.3727745884245091, + "grad_norm": 0.42597901821136475, + "learning_rate": 7.222441375852685e-05, + "loss": 0.9487, + "step": 4172 + }, + { + "epoch": 0.372863939955771, + "grad_norm": 0.43113502860069275, + "learning_rate": 7.221145018153317e-05, + "loss": 1.0139, + "step": 4173 + }, + { + "epoch": 0.3729532914870329, + "grad_norm": 0.4594292938709259, + "learning_rate": 7.219848474409378e-05, + "loss": 1.0409, + "step": 4174 + }, + { + "epoch": 0.3730426430182947, + "grad_norm": 0.39709535241127014, + "learning_rate": 7.218551744729468e-05, + "loss": 0.9839, + "step": 4175 + }, + { + "epoch": 0.3731319945495566, + "grad_norm": 0.38931846618652344, + "learning_rate": 7.217254829222201e-05, + "loss": 0.9697, + "step": 4176 + }, + { + "epoch": 0.3732213460808185, + "grad_norm": 0.3806284964084625, + "learning_rate": 7.215957727996207e-05, + "loss": 1.0805, + "step": 4177 + }, + { + "epoch": 0.3733106976120803, + "grad_norm": 0.4208787977695465, + "learning_rate": 7.214660441160134e-05, + "loss": 0.9579, + "step": 4178 + }, + { + "epoch": 0.3734000491433422, + "grad_norm": 0.4032423198223114, + "learning_rate": 7.213362968822643e-05, + "loss": 0.985, + "step": 4179 + }, + { + "epoch": 0.37348940067460407, + "grad_norm": 0.4312584400177002, + "learning_rate": 7.212065311092409e-05, + "loss": 0.9404, + "step": 4180 + }, + { + "epoch": 0.37357875220586595, + "grad_norm": 0.39697664976119995, + "learning_rate": 7.210767468078126e-05, + "loss": 0.9926, + "step": 4181 + }, + { + "epoch": 0.3736681037371278, + "grad_norm": 0.38140472769737244, + "learning_rate": 7.209469439888504e-05, + "loss": 0.9736, + "step": 4182 + }, + { + "epoch": 0.37375745526838966, + "grad_norm": 0.41029414534568787, + "learning_rate": 7.208171226632264e-05, + "loss": 1.0136, + "step": 4183 + }, + { + "epoch": 0.37384680679965154, + "grad_norm": 0.44529950618743896, + "learning_rate": 7.206872828418146e-05, + "loss": 0.9878, + "step": 4184 + }, + { + "epoch": 0.37393615833091337, + "grad_norm": 0.4840013086795807, + "learning_rate": 7.205574245354907e-05, + "loss": 0.9829, + "step": 4185 + }, + { + "epoch": 0.37402550986217525, + "grad_norm": 0.41455405950546265, + "learning_rate": 7.204275477551314e-05, + "loss": 0.9755, + "step": 4186 + }, + { + "epoch": 0.37411486139343714, + "grad_norm": 0.5246483087539673, + "learning_rate": 7.202976525116154e-05, + "loss": 0.9764, + "step": 4187 + }, + { + "epoch": 0.374204212924699, + "grad_norm": 0.484429270029068, + "learning_rate": 7.201677388158228e-05, + "loss": 0.9466, + "step": 4188 + }, + { + "epoch": 0.37429356445596085, + "grad_norm": 0.39618611335754395, + "learning_rate": 7.200378066786352e-05, + "loss": 1.0202, + "step": 4189 + }, + { + "epoch": 0.37438291598722273, + "grad_norm": 0.4614309072494507, + "learning_rate": 7.199078561109359e-05, + "loss": 0.9347, + "step": 4190 + }, + { + "epoch": 0.3744722675184846, + "grad_norm": 0.46450915932655334, + "learning_rate": 7.197778871236096e-05, + "loss": 0.9394, + "step": 4191 + }, + { + "epoch": 0.37456161904974644, + "grad_norm": 0.37364086508750916, + "learning_rate": 7.196478997275426e-05, + "loss": 1.0168, + "step": 4192 + }, + { + "epoch": 0.3746509705810083, + "grad_norm": 0.4108889400959015, + "learning_rate": 7.195178939336228e-05, + "loss": 0.9738, + "step": 4193 + }, + { + "epoch": 0.3747403221122702, + "grad_norm": 0.43093374371528625, + "learning_rate": 7.193878697527394e-05, + "loss": 0.9225, + "step": 4194 + }, + { + "epoch": 0.3748296736435321, + "grad_norm": 0.5023407340049744, + "learning_rate": 7.192578271957833e-05, + "loss": 0.9292, + "step": 4195 + }, + { + "epoch": 0.3749190251747939, + "grad_norm": 0.42425450682640076, + "learning_rate": 7.191277662736473e-05, + "loss": 0.9801, + "step": 4196 + }, + { + "epoch": 0.3750083767060558, + "grad_norm": 0.46366748213768005, + "learning_rate": 7.189976869972248e-05, + "loss": 0.9504, + "step": 4197 + }, + { + "epoch": 0.3750977282373177, + "grad_norm": 0.4702743589878082, + "learning_rate": 7.18867589377412e-05, + "loss": 0.9922, + "step": 4198 + }, + { + "epoch": 0.3751870797685795, + "grad_norm": 0.4722293019294739, + "learning_rate": 7.187374734251054e-05, + "loss": 1.0015, + "step": 4199 + }, + { + "epoch": 0.3752764312998414, + "grad_norm": 0.4281826615333557, + "learning_rate": 7.186073391512039e-05, + "loss": 0.9522, + "step": 4200 + }, + { + "epoch": 0.3753657828311033, + "grad_norm": 0.3947566747665405, + "learning_rate": 7.184771865666076e-05, + "loss": 1.0434, + "step": 4201 + }, + { + "epoch": 0.37545513436236516, + "grad_norm": 0.43979522585868835, + "learning_rate": 7.18347015682218e-05, + "loss": 0.9935, + "step": 4202 + }, + { + "epoch": 0.375544485893627, + "grad_norm": 0.41721320152282715, + "learning_rate": 7.182168265089386e-05, + "loss": 0.9861, + "step": 4203 + }, + { + "epoch": 0.3756338374248889, + "grad_norm": 0.46523186564445496, + "learning_rate": 7.18086619057674e-05, + "loss": 0.9951, + "step": 4204 + }, + { + "epoch": 0.37572318895615076, + "grad_norm": 0.43534207344055176, + "learning_rate": 7.179563933393303e-05, + "loss": 0.9288, + "step": 4205 + }, + { + "epoch": 0.3758125404874126, + "grad_norm": 0.4193892776966095, + "learning_rate": 7.178261493648154e-05, + "loss": 1.0575, + "step": 4206 + }, + { + "epoch": 0.37590189201867447, + "grad_norm": 0.48996466398239136, + "learning_rate": 7.176958871450386e-05, + "loss": 0.9262, + "step": 4207 + }, + { + "epoch": 0.37599124354993635, + "grad_norm": 0.4287664294242859, + "learning_rate": 7.175656066909109e-05, + "loss": 1.031, + "step": 4208 + }, + { + "epoch": 0.3760805950811982, + "grad_norm": 0.4364438056945801, + "learning_rate": 7.174353080133445e-05, + "loss": 1.0015, + "step": 4209 + }, + { + "epoch": 0.37616994661246006, + "grad_norm": 0.4443535804748535, + "learning_rate": 7.173049911232533e-05, + "loss": 0.9699, + "step": 4210 + }, + { + "epoch": 0.37625929814372194, + "grad_norm": 0.4485486149787903, + "learning_rate": 7.171746560315529e-05, + "loss": 0.996, + "step": 4211 + }, + { + "epoch": 0.3763486496749838, + "grad_norm": 0.43986693024635315, + "learning_rate": 7.1704430274916e-05, + "loss": 0.9912, + "step": 4212 + }, + { + "epoch": 0.37643800120624565, + "grad_norm": 0.42477118968963623, + "learning_rate": 7.169139312869933e-05, + "loss": 1.0281, + "step": 4213 + }, + { + "epoch": 0.37652735273750754, + "grad_norm": 0.35374677181243896, + "learning_rate": 7.167835416559727e-05, + "loss": 1.0282, + "step": 4214 + }, + { + "epoch": 0.3766167042687694, + "grad_norm": 0.5305443406105042, + "learning_rate": 7.166531338670198e-05, + "loss": 0.8789, + "step": 4215 + }, + { + "epoch": 0.37670605580003125, + "grad_norm": 0.4482033848762512, + "learning_rate": 7.165227079310575e-05, + "loss": 0.9928, + "step": 4216 + }, + { + "epoch": 0.37679540733129313, + "grad_norm": 0.4617825150489807, + "learning_rate": 7.163922638590104e-05, + "loss": 0.9478, + "step": 4217 + }, + { + "epoch": 0.376884758862555, + "grad_norm": 0.4732501804828644, + "learning_rate": 7.162618016618047e-05, + "loss": 0.9714, + "step": 4218 + }, + { + "epoch": 0.3769741103938169, + "grad_norm": 0.40239885449409485, + "learning_rate": 7.161313213503679e-05, + "loss": 0.9664, + "step": 4219 + }, + { + "epoch": 0.3770634619250787, + "grad_norm": 0.4415636658668518, + "learning_rate": 7.160008229356292e-05, + "loss": 1.0054, + "step": 4220 + }, + { + "epoch": 0.3771528134563406, + "grad_norm": 0.47628524899482727, + "learning_rate": 7.158703064285192e-05, + "loss": 0.9021, + "step": 4221 + }, + { + "epoch": 0.3772421649876025, + "grad_norm": 0.3998652398586273, + "learning_rate": 7.157397718399698e-05, + "loss": 1.046, + "step": 4222 + }, + { + "epoch": 0.3773315165188643, + "grad_norm": 0.3887675702571869, + "learning_rate": 7.156092191809152e-05, + "loss": 1.0419, + "step": 4223 + }, + { + "epoch": 0.3774208680501262, + "grad_norm": 0.3833785951137543, + "learning_rate": 7.154786484622901e-05, + "loss": 0.9874, + "step": 4224 + }, + { + "epoch": 0.3775102195813881, + "grad_norm": 0.4666437804698944, + "learning_rate": 7.153480596950314e-05, + "loss": 0.967, + "step": 4225 + }, + { + "epoch": 0.37759957111264997, + "grad_norm": 0.37928861379623413, + "learning_rate": 7.152174528900772e-05, + "loss": 0.9859, + "step": 4226 + }, + { + "epoch": 0.3776889226439118, + "grad_norm": 0.5285788178443909, + "learning_rate": 7.150868280583674e-05, + "loss": 0.9279, + "step": 4227 + }, + { + "epoch": 0.3777782741751737, + "grad_norm": 0.4263918697834015, + "learning_rate": 7.149561852108429e-05, + "loss": 0.9441, + "step": 4228 + }, + { + "epoch": 0.37786762570643556, + "grad_norm": 0.40622586011886597, + "learning_rate": 7.148255243584467e-05, + "loss": 0.9801, + "step": 4229 + }, + { + "epoch": 0.3779569772376974, + "grad_norm": 0.3894538879394531, + "learning_rate": 7.146948455121231e-05, + "loss": 1.0106, + "step": 4230 + }, + { + "epoch": 0.37804632876895927, + "grad_norm": 0.4317224621772766, + "learning_rate": 7.145641486828175e-05, + "loss": 0.941, + "step": 4231 + }, + { + "epoch": 0.37813568030022116, + "grad_norm": 0.4164870083332062, + "learning_rate": 7.144334338814774e-05, + "loss": 0.9747, + "step": 4232 + }, + { + "epoch": 0.37822503183148304, + "grad_norm": 0.5142220258712769, + "learning_rate": 7.143027011190515e-05, + "loss": 0.9172, + "step": 4233 + }, + { + "epoch": 0.37831438336274487, + "grad_norm": 0.3964780569076538, + "learning_rate": 7.141719504064902e-05, + "loss": 0.963, + "step": 4234 + }, + { + "epoch": 0.37840373489400675, + "grad_norm": 0.532341480255127, + "learning_rate": 7.140411817547452e-05, + "loss": 0.977, + "step": 4235 + }, + { + "epoch": 0.37849308642526863, + "grad_norm": 0.39835068583488464, + "learning_rate": 7.139103951747695e-05, + "loss": 0.9591, + "step": 4236 + }, + { + "epoch": 0.37858243795653046, + "grad_norm": 0.41592469811439514, + "learning_rate": 7.137795906775182e-05, + "loss": 0.9929, + "step": 4237 + }, + { + "epoch": 0.37867178948779234, + "grad_norm": 0.43503326177597046, + "learning_rate": 7.136487682739472e-05, + "loss": 0.9984, + "step": 4238 + }, + { + "epoch": 0.3787611410190542, + "grad_norm": 0.531616747379303, + "learning_rate": 7.135179279750149e-05, + "loss": 0.9221, + "step": 4239 + }, + { + "epoch": 0.37885049255031605, + "grad_norm": 0.4244007170200348, + "learning_rate": 7.1338706979168e-05, + "loss": 0.9679, + "step": 4240 + }, + { + "epoch": 0.37893984408157794, + "grad_norm": 0.4253380000591278, + "learning_rate": 7.132561937349035e-05, + "loss": 0.9562, + "step": 4241 + }, + { + "epoch": 0.3790291956128398, + "grad_norm": 0.4492589235305786, + "learning_rate": 7.131252998156476e-05, + "loss": 0.9848, + "step": 4242 + }, + { + "epoch": 0.3791185471441017, + "grad_norm": 0.41047170758247375, + "learning_rate": 7.129943880448762e-05, + "loss": 1.0768, + "step": 4243 + }, + { + "epoch": 0.37920789867536353, + "grad_norm": 0.43270984292030334, + "learning_rate": 7.128634584335545e-05, + "loss": 0.9937, + "step": 4244 + }, + { + "epoch": 0.3792972502066254, + "grad_norm": 0.45858004689216614, + "learning_rate": 7.127325109926491e-05, + "loss": 0.9881, + "step": 4245 + }, + { + "epoch": 0.3793866017378873, + "grad_norm": 0.4487616717815399, + "learning_rate": 7.126015457331281e-05, + "loss": 1.0136, + "step": 4246 + }, + { + "epoch": 0.3794759532691491, + "grad_norm": 0.41824638843536377, + "learning_rate": 7.124705626659616e-05, + "loss": 0.9809, + "step": 4247 + }, + { + "epoch": 0.379565304800411, + "grad_norm": 0.4919416606426239, + "learning_rate": 7.123395618021208e-05, + "loss": 0.9974, + "step": 4248 + }, + { + "epoch": 0.3796546563316729, + "grad_norm": 0.4910551905632019, + "learning_rate": 7.122085431525785e-05, + "loss": 1.0215, + "step": 4249 + }, + { + "epoch": 0.3797440078629348, + "grad_norm": 0.46813416481018066, + "learning_rate": 7.120775067283085e-05, + "loss": 0.951, + "step": 4250 + }, + { + "epoch": 0.3798333593941966, + "grad_norm": 0.4559837579727173, + "learning_rate": 7.119464525402867e-05, + "loss": 0.9815, + "step": 4251 + }, + { + "epoch": 0.3799227109254585, + "grad_norm": 0.38314858078956604, + "learning_rate": 7.118153805994902e-05, + "loss": 1.0214, + "step": 4252 + }, + { + "epoch": 0.38001206245672037, + "grad_norm": 0.4613505005836487, + "learning_rate": 7.11684290916898e-05, + "loss": 0.9771, + "step": 4253 + }, + { + "epoch": 0.3801014139879822, + "grad_norm": 0.43904268741607666, + "learning_rate": 7.1155318350349e-05, + "loss": 0.9689, + "step": 4254 + }, + { + "epoch": 0.3801907655192441, + "grad_norm": 0.4177344739437103, + "learning_rate": 7.11422058370248e-05, + "loss": 0.9386, + "step": 4255 + }, + { + "epoch": 0.38028011705050596, + "grad_norm": 0.44413110613822937, + "learning_rate": 7.112909155281549e-05, + "loss": 0.9454, + "step": 4256 + }, + { + "epoch": 0.38036946858176784, + "grad_norm": 0.4826868772506714, + "learning_rate": 7.111597549881953e-05, + "loss": 0.8906, + "step": 4257 + }, + { + "epoch": 0.38045882011302967, + "grad_norm": 0.517841100692749, + "learning_rate": 7.110285767613555e-05, + "loss": 0.9733, + "step": 4258 + }, + { + "epoch": 0.38054817164429156, + "grad_norm": 0.4552189111709595, + "learning_rate": 7.108973808586231e-05, + "loss": 0.969, + "step": 4259 + }, + { + "epoch": 0.38063752317555344, + "grad_norm": 0.40824398398399353, + "learning_rate": 7.107661672909871e-05, + "loss": 0.9928, + "step": 4260 + }, + { + "epoch": 0.38072687470681527, + "grad_norm": 0.45659610629081726, + "learning_rate": 7.106349360694379e-05, + "loss": 0.9898, + "step": 4261 + }, + { + "epoch": 0.38081622623807715, + "grad_norm": 0.39422810077667236, + "learning_rate": 7.105036872049675e-05, + "loss": 1.005, + "step": 4262 + }, + { + "epoch": 0.38090557776933903, + "grad_norm": 0.47162938117980957, + "learning_rate": 7.103724207085696e-05, + "loss": 0.995, + "step": 4263 + }, + { + "epoch": 0.3809949293006009, + "grad_norm": 0.45578041672706604, + "learning_rate": 7.10241136591239e-05, + "loss": 0.9659, + "step": 4264 + }, + { + "epoch": 0.38108428083186274, + "grad_norm": 0.35482344031333923, + "learning_rate": 7.101098348639722e-05, + "loss": 1.0144, + "step": 4265 + }, + { + "epoch": 0.3811736323631246, + "grad_norm": 0.4190642237663269, + "learning_rate": 7.099785155377672e-05, + "loss": 0.9442, + "step": 4266 + }, + { + "epoch": 0.3812629838943865, + "grad_norm": 0.4424975514411926, + "learning_rate": 7.098471786236231e-05, + "loss": 0.9564, + "step": 4267 + }, + { + "epoch": 0.38135233542564834, + "grad_norm": 0.4281081259250641, + "learning_rate": 7.097158241325411e-05, + "loss": 0.9608, + "step": 4268 + }, + { + "epoch": 0.3814416869569102, + "grad_norm": 0.46084991097450256, + "learning_rate": 7.095844520755235e-05, + "loss": 0.9865, + "step": 4269 + }, + { + "epoch": 0.3815310384881721, + "grad_norm": 0.3826462924480438, + "learning_rate": 7.094530624635741e-05, + "loss": 0.978, + "step": 4270 + }, + { + "epoch": 0.38162039001943393, + "grad_norm": 0.4165399372577667, + "learning_rate": 7.093216553076977e-05, + "loss": 0.9959, + "step": 4271 + }, + { + "epoch": 0.3817097415506958, + "grad_norm": 0.4049665331840515, + "learning_rate": 7.091902306189018e-05, + "loss": 1.0591, + "step": 4272 + }, + { + "epoch": 0.3817990930819577, + "grad_norm": 0.3788042664527893, + "learning_rate": 7.090587884081943e-05, + "loss": 1.0281, + "step": 4273 + }, + { + "epoch": 0.3818884446132196, + "grad_norm": 0.436847060918808, + "learning_rate": 7.089273286865849e-05, + "loss": 0.9862, + "step": 4274 + }, + { + "epoch": 0.3819777961444814, + "grad_norm": 0.4377835988998413, + "learning_rate": 7.087958514650846e-05, + "loss": 1.0327, + "step": 4275 + }, + { + "epoch": 0.3820671476757433, + "grad_norm": 0.43927526473999023, + "learning_rate": 7.08664356754706e-05, + "loss": 0.9563, + "step": 4276 + }, + { + "epoch": 0.3821564992070052, + "grad_norm": 0.4470997750759125, + "learning_rate": 7.085328445664636e-05, + "loss": 0.9565, + "step": 4277 + }, + { + "epoch": 0.382245850738267, + "grad_norm": 0.42849406599998474, + "learning_rate": 7.084013149113724e-05, + "loss": 0.9785, + "step": 4278 + }, + { + "epoch": 0.3823352022695289, + "grad_norm": 0.37449339032173157, + "learning_rate": 7.0826976780045e-05, + "loss": 0.9791, + "step": 4279 + }, + { + "epoch": 0.38242455380079077, + "grad_norm": 0.39009079337120056, + "learning_rate": 7.081382032447142e-05, + "loss": 0.9007, + "step": 4280 + }, + { + "epoch": 0.38251390533205265, + "grad_norm": 0.37804755568504333, + "learning_rate": 7.080066212551854e-05, + "loss": 1.0259, + "step": 4281 + }, + { + "epoch": 0.3826032568633145, + "grad_norm": 0.4352472722530365, + "learning_rate": 7.07875021842885e-05, + "loss": 0.9648, + "step": 4282 + }, + { + "epoch": 0.38269260839457636, + "grad_norm": 0.38885149359703064, + "learning_rate": 7.077434050188356e-05, + "loss": 1.0278, + "step": 4283 + }, + { + "epoch": 0.38278195992583824, + "grad_norm": 0.3463941514492035, + "learning_rate": 7.076117707940617e-05, + "loss": 1.0115, + "step": 4284 + }, + { + "epoch": 0.38287131145710007, + "grad_norm": 0.4116084575653076, + "learning_rate": 7.07480119179589e-05, + "loss": 0.9362, + "step": 4285 + }, + { + "epoch": 0.38296066298836196, + "grad_norm": 0.44609686732292175, + "learning_rate": 7.073484501864447e-05, + "loss": 0.9525, + "step": 4286 + }, + { + "epoch": 0.38305001451962384, + "grad_norm": 0.40134397149086, + "learning_rate": 7.072167638256576e-05, + "loss": 1.0264, + "step": 4287 + }, + { + "epoch": 0.3831393660508857, + "grad_norm": 0.38840845227241516, + "learning_rate": 7.070850601082577e-05, + "loss": 0.9817, + "step": 4288 + }, + { + "epoch": 0.38322871758214755, + "grad_norm": 0.43877261877059937, + "learning_rate": 7.069533390452766e-05, + "loss": 0.9695, + "step": 4289 + }, + { + "epoch": 0.38331806911340943, + "grad_norm": 0.42478203773498535, + "learning_rate": 7.068216006477476e-05, + "loss": 0.9468, + "step": 4290 + }, + { + "epoch": 0.3834074206446713, + "grad_norm": 0.5718162059783936, + "learning_rate": 7.066898449267047e-05, + "loss": 0.851, + "step": 4291 + }, + { + "epoch": 0.38349677217593314, + "grad_norm": 0.5737486481666565, + "learning_rate": 7.065580718931843e-05, + "loss": 0.852, + "step": 4292 + }, + { + "epoch": 0.383586123707195, + "grad_norm": 0.4374921917915344, + "learning_rate": 7.064262815582238e-05, + "loss": 0.8747, + "step": 4293 + }, + { + "epoch": 0.3836754752384569, + "grad_norm": 0.47579288482666016, + "learning_rate": 7.062944739328616e-05, + "loss": 1.02, + "step": 4294 + }, + { + "epoch": 0.3837648267697188, + "grad_norm": 0.4117971658706665, + "learning_rate": 7.061626490281384e-05, + "loss": 0.9578, + "step": 4295 + }, + { + "epoch": 0.3838541783009806, + "grad_norm": 0.38990846276283264, + "learning_rate": 7.060308068550959e-05, + "loss": 0.9143, + "step": 4296 + }, + { + "epoch": 0.3839435298322425, + "grad_norm": 0.45375731587409973, + "learning_rate": 7.05898947424777e-05, + "loss": 0.9272, + "step": 4297 + }, + { + "epoch": 0.3840328813635044, + "grad_norm": 0.496006041765213, + "learning_rate": 7.057670707482264e-05, + "loss": 0.8945, + "step": 4298 + }, + { + "epoch": 0.3841222328947662, + "grad_norm": 0.4508313536643982, + "learning_rate": 7.056351768364905e-05, + "loss": 1.0054, + "step": 4299 + }, + { + "epoch": 0.3842115844260281, + "grad_norm": 0.406969279050827, + "learning_rate": 7.055032657006166e-05, + "loss": 0.967, + "step": 4300 + }, + { + "epoch": 0.38430093595729, + "grad_norm": 0.432956725358963, + "learning_rate": 7.053713373516538e-05, + "loss": 0.9331, + "step": 4301 + }, + { + "epoch": 0.38439028748855186, + "grad_norm": 0.5145943760871887, + "learning_rate": 7.052393918006522e-05, + "loss": 0.9295, + "step": 4302 + }, + { + "epoch": 0.3844796390198137, + "grad_norm": 0.40598639845848083, + "learning_rate": 7.05107429058664e-05, + "loss": 0.99, + "step": 4303 + }, + { + "epoch": 0.3845689905510756, + "grad_norm": 0.42250970005989075, + "learning_rate": 7.049754491367421e-05, + "loss": 0.9821, + "step": 4304 + }, + { + "epoch": 0.38465834208233746, + "grad_norm": 0.48077642917633057, + "learning_rate": 7.048434520459418e-05, + "loss": 0.9702, + "step": 4305 + }, + { + "epoch": 0.3847476936135993, + "grad_norm": 0.41635996103286743, + "learning_rate": 7.047114377973188e-05, + "loss": 0.986, + "step": 4306 + }, + { + "epoch": 0.38483704514486117, + "grad_norm": 0.5170190930366516, + "learning_rate": 7.045794064019306e-05, + "loss": 0.9175, + "step": 4307 + }, + { + "epoch": 0.38492639667612305, + "grad_norm": 0.41324326395988464, + "learning_rate": 7.044473578708366e-05, + "loss": 1.0153, + "step": 4308 + }, + { + "epoch": 0.3850157482073849, + "grad_norm": 0.46481555700302124, + "learning_rate": 7.043152922150972e-05, + "loss": 0.9365, + "step": 4309 + }, + { + "epoch": 0.38510509973864676, + "grad_norm": 0.5050130486488342, + "learning_rate": 7.041832094457742e-05, + "loss": 1.0131, + "step": 4310 + }, + { + "epoch": 0.38519445126990864, + "grad_norm": 0.3722282648086548, + "learning_rate": 7.04051109573931e-05, + "loss": 0.9874, + "step": 4311 + }, + { + "epoch": 0.3852838028011705, + "grad_norm": 0.4793398082256317, + "learning_rate": 7.039189926106324e-05, + "loss": 1.0063, + "step": 4312 + }, + { + "epoch": 0.38537315433243235, + "grad_norm": 0.48876821994781494, + "learning_rate": 7.037868585669443e-05, + "loss": 0.9032, + "step": 4313 + }, + { + "epoch": 0.38546250586369424, + "grad_norm": 0.43197745084762573, + "learning_rate": 7.036547074539347e-05, + "loss": 1.0087, + "step": 4314 + }, + { + "epoch": 0.3855518573949561, + "grad_norm": 0.3676876127719879, + "learning_rate": 7.035225392826727e-05, + "loss": 1.0239, + "step": 4315 + }, + { + "epoch": 0.38564120892621795, + "grad_norm": 0.39950066804885864, + "learning_rate": 7.033903540642283e-05, + "loss": 0.9731, + "step": 4316 + }, + { + "epoch": 0.38573056045747983, + "grad_norm": 0.44849759340286255, + "learning_rate": 7.032581518096741e-05, + "loss": 0.9701, + "step": 4317 + }, + { + "epoch": 0.3858199119887417, + "grad_norm": 0.4369259476661682, + "learning_rate": 7.03125932530083e-05, + "loss": 1.032, + "step": 4318 + }, + { + "epoch": 0.3859092635200036, + "grad_norm": 0.40139544010162354, + "learning_rate": 7.029936962365297e-05, + "loss": 1.0624, + "step": 4319 + }, + { + "epoch": 0.3859986150512654, + "grad_norm": 0.4485412836074829, + "learning_rate": 7.028614429400908e-05, + "loss": 1.0387, + "step": 4320 + }, + { + "epoch": 0.3860879665825273, + "grad_norm": 0.44548967480659485, + "learning_rate": 7.027291726518436e-05, + "loss": 1.0029, + "step": 4321 + }, + { + "epoch": 0.3861773181137892, + "grad_norm": 0.40692946314811707, + "learning_rate": 7.025968853828674e-05, + "loss": 0.959, + "step": 4322 + }, + { + "epoch": 0.386266669645051, + "grad_norm": 0.4305645525455475, + "learning_rate": 7.024645811442423e-05, + "loss": 0.9873, + "step": 4323 + }, + { + "epoch": 0.3863560211763129, + "grad_norm": 0.5114787817001343, + "learning_rate": 7.023322599470505e-05, + "loss": 0.8899, + "step": 4324 + }, + { + "epoch": 0.3864453727075748, + "grad_norm": 0.41181930899620056, + "learning_rate": 7.021999218023753e-05, + "loss": 0.9755, + "step": 4325 + }, + { + "epoch": 0.38653472423883667, + "grad_norm": 0.3804915249347687, + "learning_rate": 7.020675667213014e-05, + "loss": 0.9616, + "step": 4326 + }, + { + "epoch": 0.3866240757700985, + "grad_norm": 0.41884955763816833, + "learning_rate": 7.019351947149148e-05, + "loss": 0.9835, + "step": 4327 + }, + { + "epoch": 0.3867134273013604, + "grad_norm": 0.44132474064826965, + "learning_rate": 7.018028057943032e-05, + "loss": 1.0083, + "step": 4328 + }, + { + "epoch": 0.38680277883262226, + "grad_norm": 0.3745604157447815, + "learning_rate": 7.016703999705556e-05, + "loss": 0.986, + "step": 4329 + }, + { + "epoch": 0.3868921303638841, + "grad_norm": 0.39664706587791443, + "learning_rate": 7.015379772547623e-05, + "loss": 0.9742, + "step": 4330 + }, + { + "epoch": 0.386981481895146, + "grad_norm": 0.510838508605957, + "learning_rate": 7.014055376580151e-05, + "loss": 1.0018, + "step": 4331 + }, + { + "epoch": 0.38707083342640786, + "grad_norm": 0.5416032075881958, + "learning_rate": 7.012730811914074e-05, + "loss": 0.9375, + "step": 4332 + }, + { + "epoch": 0.38716018495766974, + "grad_norm": 0.45843154191970825, + "learning_rate": 7.011406078660336e-05, + "loss": 0.9032, + "step": 4333 + }, + { + "epoch": 0.38724953648893157, + "grad_norm": 0.4215903878211975, + "learning_rate": 7.0100811769299e-05, + "loss": 0.9499, + "step": 4334 + }, + { + "epoch": 0.38733888802019345, + "grad_norm": 0.4728352725505829, + "learning_rate": 7.008756106833739e-05, + "loss": 0.9126, + "step": 4335 + }, + { + "epoch": 0.38742823955145533, + "grad_norm": 0.4127133786678314, + "learning_rate": 7.007430868482842e-05, + "loss": 1.0198, + "step": 4336 + }, + { + "epoch": 0.38751759108271716, + "grad_norm": 0.4411311745643616, + "learning_rate": 7.006105461988212e-05, + "loss": 0.9796, + "step": 4337 + }, + { + "epoch": 0.38760694261397904, + "grad_norm": 0.4794873297214508, + "learning_rate": 7.004779887460867e-05, + "loss": 0.9494, + "step": 4338 + }, + { + "epoch": 0.3876962941452409, + "grad_norm": 0.5127260088920593, + "learning_rate": 7.003454145011836e-05, + "loss": 0.9547, + "step": 4339 + }, + { + "epoch": 0.38778564567650275, + "grad_norm": 0.3873635530471802, + "learning_rate": 7.002128234752166e-05, + "loss": 1.028, + "step": 4340 + }, + { + "epoch": 0.38787499720776464, + "grad_norm": 0.4938105344772339, + "learning_rate": 7.000802156792915e-05, + "loss": 0.9775, + "step": 4341 + }, + { + "epoch": 0.3879643487390265, + "grad_norm": 0.43845218420028687, + "learning_rate": 6.999475911245156e-05, + "loss": 0.945, + "step": 4342 + }, + { + "epoch": 0.3880537002702884, + "grad_norm": 0.4035814106464386, + "learning_rate": 6.998149498219977e-05, + "loss": 1.0355, + "step": 4343 + }, + { + "epoch": 0.38814305180155023, + "grad_norm": 0.4540978670120239, + "learning_rate": 6.996822917828477e-05, + "loss": 0.9927, + "step": 4344 + }, + { + "epoch": 0.3882324033328121, + "grad_norm": 0.41308870911598206, + "learning_rate": 6.995496170181775e-05, + "loss": 0.963, + "step": 4345 + }, + { + "epoch": 0.388321754864074, + "grad_norm": 0.40299081802368164, + "learning_rate": 6.994169255390999e-05, + "loss": 0.9774, + "step": 4346 + }, + { + "epoch": 0.3884111063953358, + "grad_norm": 0.5228738784790039, + "learning_rate": 6.99284217356729e-05, + "loss": 0.9757, + "step": 4347 + }, + { + "epoch": 0.3885004579265977, + "grad_norm": 0.4321485757827759, + "learning_rate": 6.991514924821807e-05, + "loss": 0.9918, + "step": 4348 + }, + { + "epoch": 0.3885898094578596, + "grad_norm": 0.4189693331718445, + "learning_rate": 6.990187509265721e-05, + "loss": 0.9971, + "step": 4349 + }, + { + "epoch": 0.3886791609891215, + "grad_norm": 0.5382431745529175, + "learning_rate": 6.988859927010219e-05, + "loss": 1.0251, + "step": 4350 + }, + { + "epoch": 0.3887685125203833, + "grad_norm": 0.47325825691223145, + "learning_rate": 6.987532178166496e-05, + "loss": 0.9422, + "step": 4351 + }, + { + "epoch": 0.3888578640516452, + "grad_norm": 0.441567987203598, + "learning_rate": 6.986204262845768e-05, + "loss": 0.9758, + "step": 4352 + }, + { + "epoch": 0.38894721558290707, + "grad_norm": 0.44492974877357483, + "learning_rate": 6.984876181159261e-05, + "loss": 0.9484, + "step": 4353 + }, + { + "epoch": 0.3890365671141689, + "grad_norm": 0.4059096872806549, + "learning_rate": 6.983547933218218e-05, + "loss": 1.0436, + "step": 4354 + }, + { + "epoch": 0.3891259186454308, + "grad_norm": 0.4348817765712738, + "learning_rate": 6.982219519133892e-05, + "loss": 0.9641, + "step": 4355 + }, + { + "epoch": 0.38921527017669266, + "grad_norm": 0.4612578749656677, + "learning_rate": 6.980890939017551e-05, + "loss": 0.9583, + "step": 4356 + }, + { + "epoch": 0.38930462170795455, + "grad_norm": 0.4391173720359802, + "learning_rate": 6.979562192980481e-05, + "loss": 0.9592, + "step": 4357 + }, + { + "epoch": 0.3893939732392164, + "grad_norm": 0.45531633496284485, + "learning_rate": 6.978233281133976e-05, + "loss": 0.9537, + "step": 4358 + }, + { + "epoch": 0.38948332477047826, + "grad_norm": 0.39362677931785583, + "learning_rate": 6.976904203589346e-05, + "loss": 1.0051, + "step": 4359 + }, + { + "epoch": 0.38957267630174014, + "grad_norm": 0.5158856511116028, + "learning_rate": 6.975574960457919e-05, + "loss": 1.0024, + "step": 4360 + }, + { + "epoch": 0.38966202783300197, + "grad_norm": 0.5970863699913025, + "learning_rate": 6.974245551851029e-05, + "loss": 0.9359, + "step": 4361 + }, + { + "epoch": 0.38975137936426385, + "grad_norm": 0.40095895528793335, + "learning_rate": 6.972915977880031e-05, + "loss": 1.0245, + "step": 4362 + }, + { + "epoch": 0.38984073089552573, + "grad_norm": 0.46378135681152344, + "learning_rate": 6.971586238656291e-05, + "loss": 0.9918, + "step": 4363 + }, + { + "epoch": 0.3899300824267876, + "grad_norm": 0.5374437570571899, + "learning_rate": 6.970256334291187e-05, + "loss": 1.0324, + "step": 4364 + }, + { + "epoch": 0.39001943395804944, + "grad_norm": 0.5034531950950623, + "learning_rate": 6.968926264896114e-05, + "loss": 0.8832, + "step": 4365 + }, + { + "epoch": 0.3901087854893113, + "grad_norm": 0.3909834921360016, + "learning_rate": 6.967596030582478e-05, + "loss": 0.9664, + "step": 4366 + }, + { + "epoch": 0.3901981370205732, + "grad_norm": 0.4428032338619232, + "learning_rate": 6.966265631461703e-05, + "loss": 0.9607, + "step": 4367 + }, + { + "epoch": 0.39028748855183504, + "grad_norm": 0.6216595768928528, + "learning_rate": 6.96493506764522e-05, + "loss": 0.9773, + "step": 4368 + }, + { + "epoch": 0.3903768400830969, + "grad_norm": 0.44318845868110657, + "learning_rate": 6.963604339244481e-05, + "loss": 0.9698, + "step": 4369 + }, + { + "epoch": 0.3904661916143588, + "grad_norm": 0.4492654800415039, + "learning_rate": 6.96227344637095e-05, + "loss": 0.9126, + "step": 4370 + }, + { + "epoch": 0.39055554314562063, + "grad_norm": 0.46567076444625854, + "learning_rate": 6.960942389136101e-05, + "loss": 0.9868, + "step": 4371 + }, + { + "epoch": 0.3906448946768825, + "grad_norm": 0.4394150674343109, + "learning_rate": 6.959611167651423e-05, + "loss": 0.9878, + "step": 4372 + }, + { + "epoch": 0.3907342462081444, + "grad_norm": 0.44331878423690796, + "learning_rate": 6.958279782028423e-05, + "loss": 0.9312, + "step": 4373 + }, + { + "epoch": 0.3908235977394063, + "grad_norm": 0.4242664575576782, + "learning_rate": 6.956948232378617e-05, + "loss": 1.0962, + "step": 4374 + }, + { + "epoch": 0.3909129492706681, + "grad_norm": 0.44841963052749634, + "learning_rate": 6.955616518813538e-05, + "loss": 1.0338, + "step": 4375 + }, + { + "epoch": 0.39100230080193, + "grad_norm": 0.40618273615837097, + "learning_rate": 6.95428464144473e-05, + "loss": 0.9697, + "step": 4376 + }, + { + "epoch": 0.3910916523331919, + "grad_norm": 0.41985857486724854, + "learning_rate": 6.952952600383752e-05, + "loss": 0.9683, + "step": 4377 + }, + { + "epoch": 0.3911810038644537, + "grad_norm": 0.5133029222488403, + "learning_rate": 6.951620395742176e-05, + "loss": 0.953, + "step": 4378 + }, + { + "epoch": 0.3912703553957156, + "grad_norm": 0.42223209142684937, + "learning_rate": 6.950288027631588e-05, + "loss": 1.063, + "step": 4379 + }, + { + "epoch": 0.39135970692697747, + "grad_norm": 0.4248124659061432, + "learning_rate": 6.948955496163593e-05, + "loss": 1.0083, + "step": 4380 + }, + { + "epoch": 0.39144905845823935, + "grad_norm": 0.389886736869812, + "learning_rate": 6.947622801449799e-05, + "loss": 1.0032, + "step": 4381 + }, + { + "epoch": 0.3915384099895012, + "grad_norm": 0.4102005660533905, + "learning_rate": 6.946289943601833e-05, + "loss": 0.9879, + "step": 4382 + }, + { + "epoch": 0.39162776152076306, + "grad_norm": 0.4157712757587433, + "learning_rate": 6.94495692273134e-05, + "loss": 0.9817, + "step": 4383 + }, + { + "epoch": 0.39171711305202495, + "grad_norm": 0.42993542551994324, + "learning_rate": 6.943623738949973e-05, + "loss": 1.0497, + "step": 4384 + }, + { + "epoch": 0.3918064645832868, + "grad_norm": 0.46929094195365906, + "learning_rate": 6.942290392369401e-05, + "loss": 0.945, + "step": 4385 + }, + { + "epoch": 0.39189581611454866, + "grad_norm": 0.41299015283584595, + "learning_rate": 6.940956883101304e-05, + "loss": 0.9762, + "step": 4386 + }, + { + "epoch": 0.39198516764581054, + "grad_norm": 0.4164472222328186, + "learning_rate": 6.93962321125738e-05, + "loss": 0.8936, + "step": 4387 + }, + { + "epoch": 0.3920745191770724, + "grad_norm": 0.5273587107658386, + "learning_rate": 6.938289376949336e-05, + "loss": 0.9785, + "step": 4388 + }, + { + "epoch": 0.39216387070833425, + "grad_norm": 0.5239537954330444, + "learning_rate": 6.936955380288897e-05, + "loss": 0.9617, + "step": 4389 + }, + { + "epoch": 0.39225322223959613, + "grad_norm": 0.3496021628379822, + "learning_rate": 6.935621221387797e-05, + "loss": 1.0419, + "step": 4390 + }, + { + "epoch": 0.392342573770858, + "grad_norm": 0.5209729075431824, + "learning_rate": 6.934286900357789e-05, + "loss": 0.9904, + "step": 4391 + }, + { + "epoch": 0.39243192530211984, + "grad_norm": 0.4754767119884491, + "learning_rate": 6.932952417310634e-05, + "loss": 0.9543, + "step": 4392 + }, + { + "epoch": 0.3925212768333817, + "grad_norm": 0.4421217739582062, + "learning_rate": 6.931617772358112e-05, + "loss": 0.9329, + "step": 4393 + }, + { + "epoch": 0.3926106283646436, + "grad_norm": 0.40273427963256836, + "learning_rate": 6.93028296561201e-05, + "loss": 0.9904, + "step": 4394 + }, + { + "epoch": 0.3926999798959055, + "grad_norm": 0.4427558183670044, + "learning_rate": 6.928947997184134e-05, + "loss": 0.9429, + "step": 4395 + }, + { + "epoch": 0.3927893314271673, + "grad_norm": 0.46975377202033997, + "learning_rate": 6.927612867186305e-05, + "loss": 0.9122, + "step": 4396 + }, + { + "epoch": 0.3928786829584292, + "grad_norm": 0.3829523026943207, + "learning_rate": 6.926277575730349e-05, + "loss": 0.9974, + "step": 4397 + }, + { + "epoch": 0.3929680344896911, + "grad_norm": 0.5063918828964233, + "learning_rate": 6.924942122928115e-05, + "loss": 0.9954, + "step": 4398 + }, + { + "epoch": 0.3930573860209529, + "grad_norm": 0.4212724566459656, + "learning_rate": 6.923606508891459e-05, + "loss": 1.0462, + "step": 4399 + }, + { + "epoch": 0.3931467375522148, + "grad_norm": 0.44579607248306274, + "learning_rate": 6.922270733732254e-05, + "loss": 0.9423, + "step": 4400 + }, + { + "epoch": 0.3932360890834767, + "grad_norm": 0.45305055379867554, + "learning_rate": 6.920934797562385e-05, + "loss": 0.9498, + "step": 4401 + }, + { + "epoch": 0.3933254406147385, + "grad_norm": 0.39803916215896606, + "learning_rate": 6.91959870049375e-05, + "loss": 0.9772, + "step": 4402 + }, + { + "epoch": 0.3934147921460004, + "grad_norm": 0.5116259455680847, + "learning_rate": 6.918262442638263e-05, + "loss": 0.9202, + "step": 4403 + }, + { + "epoch": 0.3935041436772623, + "grad_norm": 0.39163100719451904, + "learning_rate": 6.91692602410785e-05, + "loss": 0.9933, + "step": 4404 + }, + { + "epoch": 0.39359349520852416, + "grad_norm": 0.386976420879364, + "learning_rate": 6.915589445014448e-05, + "loss": 1.006, + "step": 4405 + }, + { + "epoch": 0.393682846739786, + "grad_norm": 0.4204871356487274, + "learning_rate": 6.914252705470013e-05, + "loss": 0.9525, + "step": 4406 + }, + { + "epoch": 0.39377219827104787, + "grad_norm": 0.5166035294532776, + "learning_rate": 6.912915805586509e-05, + "loss": 0.8921, + "step": 4407 + }, + { + "epoch": 0.39386154980230975, + "grad_norm": 0.4778018891811371, + "learning_rate": 6.911578745475915e-05, + "loss": 0.9687, + "step": 4408 + }, + { + "epoch": 0.3939509013335716, + "grad_norm": 0.4855932295322418, + "learning_rate": 6.910241525250225e-05, + "loss": 0.9592, + "step": 4409 + }, + { + "epoch": 0.39404025286483346, + "grad_norm": 0.4324698746204376, + "learning_rate": 6.908904145021447e-05, + "loss": 0.9615, + "step": 4410 + }, + { + "epoch": 0.39412960439609535, + "grad_norm": 0.41898903250694275, + "learning_rate": 6.907566604901599e-05, + "loss": 0.9127, + "step": 4411 + }, + { + "epoch": 0.39421895592735723, + "grad_norm": 0.44606366753578186, + "learning_rate": 6.906228905002714e-05, + "loss": 0.9715, + "step": 4412 + }, + { + "epoch": 0.39430830745861906, + "grad_norm": 0.4419662356376648, + "learning_rate": 6.90489104543684e-05, + "loss": 0.9972, + "step": 4413 + }, + { + "epoch": 0.39439765898988094, + "grad_norm": 0.43655163049697876, + "learning_rate": 6.903553026316036e-05, + "loss": 0.9968, + "step": 4414 + }, + { + "epoch": 0.3944870105211428, + "grad_norm": 0.41895565390586853, + "learning_rate": 6.902214847752375e-05, + "loss": 0.9499, + "step": 4415 + }, + { + "epoch": 0.39457636205240465, + "grad_norm": 0.403507798910141, + "learning_rate": 6.900876509857945e-05, + "loss": 0.9934, + "step": 4416 + }, + { + "epoch": 0.39466571358366653, + "grad_norm": 0.4834029972553253, + "learning_rate": 6.899538012744847e-05, + "loss": 0.9242, + "step": 4417 + }, + { + "epoch": 0.3947550651149284, + "grad_norm": 0.46136564016342163, + "learning_rate": 6.89819935652519e-05, + "loss": 0.9524, + "step": 4418 + }, + { + "epoch": 0.3948444166461903, + "grad_norm": 0.4441271424293518, + "learning_rate": 6.896860541311105e-05, + "loss": 0.9916, + "step": 4419 + }, + { + "epoch": 0.3949337681774521, + "grad_norm": 0.5411950349807739, + "learning_rate": 6.89552156721473e-05, + "loss": 0.8352, + "step": 4420 + }, + { + "epoch": 0.395023119708714, + "grad_norm": 0.43596935272216797, + "learning_rate": 6.89418243434822e-05, + "loss": 0.9776, + "step": 4421 + }, + { + "epoch": 0.3951124712399759, + "grad_norm": 0.4356800317764282, + "learning_rate": 6.892843142823739e-05, + "loss": 0.9779, + "step": 4422 + }, + { + "epoch": 0.3952018227712377, + "grad_norm": 0.4590302109718323, + "learning_rate": 6.891503692753469e-05, + "loss": 1.0105, + "step": 4423 + }, + { + "epoch": 0.3952911743024996, + "grad_norm": 0.3544255793094635, + "learning_rate": 6.890164084249602e-05, + "loss": 1.0239, + "step": 4424 + }, + { + "epoch": 0.3953805258337615, + "grad_norm": 0.4187402129173279, + "learning_rate": 6.888824317424347e-05, + "loss": 1.0746, + "step": 4425 + }, + { + "epoch": 0.39546987736502337, + "grad_norm": 0.5059707760810852, + "learning_rate": 6.887484392389922e-05, + "loss": 0.7944, + "step": 4426 + }, + { + "epoch": 0.3955592288962852, + "grad_norm": 0.4193587601184845, + "learning_rate": 6.88614430925856e-05, + "loss": 1.0126, + "step": 4427 + }, + { + "epoch": 0.3956485804275471, + "grad_norm": 0.4689008891582489, + "learning_rate": 6.884804068142505e-05, + "loss": 0.9036, + "step": 4428 + }, + { + "epoch": 0.39573793195880896, + "grad_norm": 0.4160483479499817, + "learning_rate": 6.88346366915402e-05, + "loss": 0.9379, + "step": 4429 + }, + { + "epoch": 0.3958272834900708, + "grad_norm": 0.4379422068595886, + "learning_rate": 6.882123112405376e-05, + "loss": 0.9573, + "step": 4430 + }, + { + "epoch": 0.3959166350213327, + "grad_norm": 0.4520152509212494, + "learning_rate": 6.880782398008862e-05, + "loss": 0.9565, + "step": 4431 + }, + { + "epoch": 0.39600598655259456, + "grad_norm": 0.4579842686653137, + "learning_rate": 6.879441526076771e-05, + "loss": 0.9489, + "step": 4432 + }, + { + "epoch": 0.3960953380838564, + "grad_norm": 0.49286845326423645, + "learning_rate": 6.878100496721423e-05, + "loss": 0.9866, + "step": 4433 + }, + { + "epoch": 0.39618468961511827, + "grad_norm": 0.39000403881073, + "learning_rate": 6.876759310055135e-05, + "loss": 1.0161, + "step": 4434 + }, + { + "epoch": 0.39627404114638015, + "grad_norm": 0.3937504291534424, + "learning_rate": 6.875417966190251e-05, + "loss": 0.975, + "step": 4435 + }, + { + "epoch": 0.39636339267764203, + "grad_norm": 0.504807710647583, + "learning_rate": 6.874076465239123e-05, + "loss": 0.8956, + "step": 4436 + }, + { + "epoch": 0.39645274420890386, + "grad_norm": 0.38329121470451355, + "learning_rate": 6.872734807314115e-05, + "loss": 1.0308, + "step": 4437 + }, + { + "epoch": 0.39654209574016575, + "grad_norm": 0.4224548637866974, + "learning_rate": 6.871392992527603e-05, + "loss": 0.9944, + "step": 4438 + }, + { + "epoch": 0.39663144727142763, + "grad_norm": 0.44873788952827454, + "learning_rate": 6.870051020991981e-05, + "loss": 1.0696, + "step": 4439 + }, + { + "epoch": 0.39672079880268946, + "grad_norm": 0.49590742588043213, + "learning_rate": 6.868708892819653e-05, + "loss": 0.9732, + "step": 4440 + }, + { + "epoch": 0.39681015033395134, + "grad_norm": 0.42665350437164307, + "learning_rate": 6.867366608123038e-05, + "loss": 0.9137, + "step": 4441 + }, + { + "epoch": 0.3968995018652132, + "grad_norm": 0.3908413052558899, + "learning_rate": 6.866024167014562e-05, + "loss": 1.0205, + "step": 4442 + }, + { + "epoch": 0.3969888533964751, + "grad_norm": 0.4211508631706238, + "learning_rate": 6.864681569606673e-05, + "loss": 1.0103, + "step": 4443 + }, + { + "epoch": 0.39707820492773693, + "grad_norm": 0.4399520754814148, + "learning_rate": 6.863338816011826e-05, + "loss": 1.0127, + "step": 4444 + }, + { + "epoch": 0.3971675564589988, + "grad_norm": 0.37878918647766113, + "learning_rate": 6.861995906342491e-05, + "loss": 0.9763, + "step": 4445 + }, + { + "epoch": 0.3972569079902607, + "grad_norm": 0.47828400135040283, + "learning_rate": 6.860652840711154e-05, + "loss": 0.955, + "step": 4446 + }, + { + "epoch": 0.3973462595215225, + "grad_norm": 0.45073986053466797, + "learning_rate": 6.859309619230305e-05, + "loss": 0.9025, + "step": 4447 + }, + { + "epoch": 0.3974356110527844, + "grad_norm": 0.38527220487594604, + "learning_rate": 6.85796624201246e-05, + "loss": 0.9831, + "step": 4448 + }, + { + "epoch": 0.3975249625840463, + "grad_norm": 0.4932858347892761, + "learning_rate": 6.856622709170134e-05, + "loss": 0.9358, + "step": 4449 + }, + { + "epoch": 0.3976143141153082, + "grad_norm": 0.5208497047424316, + "learning_rate": 6.855279020815868e-05, + "loss": 0.9976, + "step": 4450 + }, + { + "epoch": 0.39770366564657, + "grad_norm": 0.40091952681541443, + "learning_rate": 6.853935177062209e-05, + "loss": 0.9934, + "step": 4451 + }, + { + "epoch": 0.3977930171778319, + "grad_norm": 0.4903776943683624, + "learning_rate": 6.852591178021716e-05, + "loss": 0.9286, + "step": 4452 + }, + { + "epoch": 0.39788236870909377, + "grad_norm": 0.45064613223075867, + "learning_rate": 6.851247023806964e-05, + "loss": 0.9525, + "step": 4453 + }, + { + "epoch": 0.3979717202403556, + "grad_norm": 0.49910131096839905, + "learning_rate": 6.849902714530542e-05, + "loss": 0.971, + "step": 4454 + }, + { + "epoch": 0.3980610717716175, + "grad_norm": 0.4719197452068329, + "learning_rate": 6.84855825030505e-05, + "loss": 0.9095, + "step": 4455 + }, + { + "epoch": 0.39815042330287936, + "grad_norm": 0.4351549744606018, + "learning_rate": 6.847213631243099e-05, + "loss": 0.8641, + "step": 4456 + }, + { + "epoch": 0.39823977483414125, + "grad_norm": 0.37931522727012634, + "learning_rate": 6.845868857457316e-05, + "loss": 0.9773, + "step": 4457 + }, + { + "epoch": 0.3983291263654031, + "grad_norm": 0.49004626274108887, + "learning_rate": 6.844523929060343e-05, + "loss": 1.012, + "step": 4458 + }, + { + "epoch": 0.39841847789666496, + "grad_norm": 0.36821407079696655, + "learning_rate": 6.843178846164826e-05, + "loss": 1.0009, + "step": 4459 + }, + { + "epoch": 0.39850782942792684, + "grad_norm": 0.42328307032585144, + "learning_rate": 6.841833608883437e-05, + "loss": 0.9568, + "step": 4460 + }, + { + "epoch": 0.39859718095918867, + "grad_norm": 0.4346807897090912, + "learning_rate": 6.840488217328848e-05, + "loss": 1.0407, + "step": 4461 + }, + { + "epoch": 0.39868653249045055, + "grad_norm": 0.35460466146469116, + "learning_rate": 6.839142671613757e-05, + "loss": 1.0353, + "step": 4462 + }, + { + "epoch": 0.39877588402171243, + "grad_norm": 0.43105101585388184, + "learning_rate": 6.837796971850859e-05, + "loss": 1.0479, + "step": 4463 + }, + { + "epoch": 0.39886523555297426, + "grad_norm": 0.43495675921440125, + "learning_rate": 6.836451118152877e-05, + "loss": 0.9813, + "step": 4464 + }, + { + "epoch": 0.39895458708423615, + "grad_norm": 0.4951534867286682, + "learning_rate": 6.835105110632539e-05, + "loss": 0.9718, + "step": 4465 + }, + { + "epoch": 0.39904393861549803, + "grad_norm": 0.48780739307403564, + "learning_rate": 6.833758949402587e-05, + "loss": 0.9653, + "step": 4466 + }, + { + "epoch": 0.3991332901467599, + "grad_norm": 0.3795831799507141, + "learning_rate": 6.832412634575774e-05, + "loss": 1.0119, + "step": 4467 + }, + { + "epoch": 0.39922264167802174, + "grad_norm": 0.36188191175460815, + "learning_rate": 6.831066166264874e-05, + "loss": 0.979, + "step": 4468 + }, + { + "epoch": 0.3993119932092836, + "grad_norm": 0.4243720769882202, + "learning_rate": 6.829719544582665e-05, + "loss": 1.0764, + "step": 4469 + }, + { + "epoch": 0.3994013447405455, + "grad_norm": 0.3986402750015259, + "learning_rate": 6.828372769641938e-05, + "loss": 1.0687, + "step": 4470 + }, + { + "epoch": 0.39949069627180733, + "grad_norm": 0.48252764344215393, + "learning_rate": 6.827025841555504e-05, + "loss": 0.9346, + "step": 4471 + }, + { + "epoch": 0.3995800478030692, + "grad_norm": 0.41776660084724426, + "learning_rate": 6.825678760436182e-05, + "loss": 0.9995, + "step": 4472 + }, + { + "epoch": 0.3996693993343311, + "grad_norm": 0.48631688952445984, + "learning_rate": 6.824331526396801e-05, + "loss": 1.0179, + "step": 4473 + }, + { + "epoch": 0.399758750865593, + "grad_norm": 0.39176318049430847, + "learning_rate": 6.82298413955021e-05, + "loss": 1.0147, + "step": 4474 + }, + { + "epoch": 0.3998481023968548, + "grad_norm": 0.43048596382141113, + "learning_rate": 6.821636600009266e-05, + "loss": 0.9092, + "step": 4475 + }, + { + "epoch": 0.3999374539281167, + "grad_norm": 0.445742130279541, + "learning_rate": 6.82028890788684e-05, + "loss": 0.9472, + "step": 4476 + }, + { + "epoch": 0.4000268054593786, + "grad_norm": 0.41645416617393494, + "learning_rate": 6.818941063295815e-05, + "loss": 0.9294, + "step": 4477 + }, + { + "epoch": 0.4001161569906404, + "grad_norm": 0.5943310260772705, + "learning_rate": 6.817593066349086e-05, + "loss": 0.9001, + "step": 4478 + }, + { + "epoch": 0.4002055085219023, + "grad_norm": 0.39284154772758484, + "learning_rate": 6.816244917159564e-05, + "loss": 0.9755, + "step": 4479 + }, + { + "epoch": 0.40029486005316417, + "grad_norm": 0.41957008838653564, + "learning_rate": 6.814896615840171e-05, + "loss": 0.9493, + "step": 4480 + }, + { + "epoch": 0.40038421158442605, + "grad_norm": 0.4775201082229614, + "learning_rate": 6.81354816250384e-05, + "loss": 1.0124, + "step": 4481 + }, + { + "epoch": 0.4004735631156879, + "grad_norm": 0.4292440116405487, + "learning_rate": 6.812199557263522e-05, + "loss": 0.9147, + "step": 4482 + }, + { + "epoch": 0.40056291464694976, + "grad_norm": 0.4357702434062958, + "learning_rate": 6.810850800232172e-05, + "loss": 0.9812, + "step": 4483 + }, + { + "epoch": 0.40065226617821165, + "grad_norm": 0.525195837020874, + "learning_rate": 6.809501891522765e-05, + "loss": 0.9529, + "step": 4484 + }, + { + "epoch": 0.4007416177094735, + "grad_norm": 0.42463192343711853, + "learning_rate": 6.808152831248287e-05, + "loss": 0.9269, + "step": 4485 + }, + { + "epoch": 0.40083096924073536, + "grad_norm": 0.4291436970233917, + "learning_rate": 6.806803619521737e-05, + "loss": 1.0478, + "step": 4486 + }, + { + "epoch": 0.40092032077199724, + "grad_norm": 0.4439351260662079, + "learning_rate": 6.805454256456125e-05, + "loss": 0.9217, + "step": 4487 + }, + { + "epoch": 0.4010096723032591, + "grad_norm": 0.5323197245597839, + "learning_rate": 6.804104742164472e-05, + "loss": 0.977, + "step": 4488 + }, + { + "epoch": 0.40109902383452095, + "grad_norm": 0.40645474195480347, + "learning_rate": 6.802755076759819e-05, + "loss": 0.9972, + "step": 4489 + }, + { + "epoch": 0.40118837536578283, + "grad_norm": 0.4209262728691101, + "learning_rate": 6.80140526035521e-05, + "loss": 0.9412, + "step": 4490 + }, + { + "epoch": 0.4012777268970447, + "grad_norm": 0.504423201084137, + "learning_rate": 6.80005529306371e-05, + "loss": 0.9029, + "step": 4491 + }, + { + "epoch": 0.40136707842830655, + "grad_norm": 0.4125538170337677, + "learning_rate": 6.798705174998392e-05, + "loss": 0.9586, + "step": 4492 + }, + { + "epoch": 0.40145642995956843, + "grad_norm": 0.36721163988113403, + "learning_rate": 6.797354906272342e-05, + "loss": 1.0099, + "step": 4493 + }, + { + "epoch": 0.4015457814908303, + "grad_norm": 0.4413255453109741, + "learning_rate": 6.796004486998661e-05, + "loss": 0.9163, + "step": 4494 + }, + { + "epoch": 0.40163513302209214, + "grad_norm": 0.44812944531440735, + "learning_rate": 6.79465391729046e-05, + "loss": 0.967, + "step": 4495 + }, + { + "epoch": 0.401724484553354, + "grad_norm": 0.38190940022468567, + "learning_rate": 6.793303197260864e-05, + "loss": 0.9877, + "step": 4496 + }, + { + "epoch": 0.4018138360846159, + "grad_norm": 0.44277167320251465, + "learning_rate": 6.79195232702301e-05, + "loss": 0.956, + "step": 4497 + }, + { + "epoch": 0.4019031876158778, + "grad_norm": 0.41372618079185486, + "learning_rate": 6.790601306690048e-05, + "loss": 1.003, + "step": 4498 + }, + { + "epoch": 0.4019925391471396, + "grad_norm": 0.39554059505462646, + "learning_rate": 6.78925013637514e-05, + "loss": 1.0137, + "step": 4499 + }, + { + "epoch": 0.4020818906784015, + "grad_norm": 0.3876018226146698, + "learning_rate": 6.78789881619146e-05, + "loss": 0.964, + "step": 4500 + }, + { + "epoch": 0.4021712422096634, + "grad_norm": 0.5051389932632446, + "learning_rate": 6.786547346252198e-05, + "loss": 0.9447, + "step": 4501 + }, + { + "epoch": 0.4022605937409252, + "grad_norm": 0.49276506900787354, + "learning_rate": 6.785195726670552e-05, + "loss": 0.949, + "step": 4502 + }, + { + "epoch": 0.4023499452721871, + "grad_norm": 0.5032357573509216, + "learning_rate": 6.783843957559734e-05, + "loss": 0.9475, + "step": 4503 + }, + { + "epoch": 0.402439296803449, + "grad_norm": 0.42340540885925293, + "learning_rate": 6.782492039032971e-05, + "loss": 0.9155, + "step": 4504 + }, + { + "epoch": 0.40252864833471086, + "grad_norm": 0.4485189914703369, + "learning_rate": 6.7811399712035e-05, + "loss": 0.8825, + "step": 4505 + }, + { + "epoch": 0.4026179998659727, + "grad_norm": 0.4616818130016327, + "learning_rate": 6.779787754184571e-05, + "loss": 0.9936, + "step": 4506 + }, + { + "epoch": 0.40270735139723457, + "grad_norm": 0.44533130526542664, + "learning_rate": 6.778435388089446e-05, + "loss": 0.9267, + "step": 4507 + }, + { + "epoch": 0.40279670292849645, + "grad_norm": 0.4395417869091034, + "learning_rate": 6.777082873031401e-05, + "loss": 0.9568, + "step": 4508 + }, + { + "epoch": 0.4028860544597583, + "grad_norm": 0.44981324672698975, + "learning_rate": 6.775730209123722e-05, + "loss": 0.9014, + "step": 4509 + }, + { + "epoch": 0.40297540599102016, + "grad_norm": 0.4732804596424103, + "learning_rate": 6.77437739647971e-05, + "loss": 0.9897, + "step": 4510 + }, + { + "epoch": 0.40306475752228205, + "grad_norm": 0.4026157557964325, + "learning_rate": 6.773024435212678e-05, + "loss": 1.0569, + "step": 4511 + }, + { + "epoch": 0.40315410905354393, + "grad_norm": 0.39510583877563477, + "learning_rate": 6.771671325435952e-05, + "loss": 0.9722, + "step": 4512 + }, + { + "epoch": 0.40324346058480576, + "grad_norm": 0.4581720232963562, + "learning_rate": 6.770318067262866e-05, + "loss": 0.9051, + "step": 4513 + }, + { + "epoch": 0.40333281211606764, + "grad_norm": 0.5355126261711121, + "learning_rate": 6.768964660806772e-05, + "loss": 0.9467, + "step": 4514 + }, + { + "epoch": 0.4034221636473295, + "grad_norm": 0.38436293601989746, + "learning_rate": 6.767611106181031e-05, + "loss": 0.9994, + "step": 4515 + }, + { + "epoch": 0.40351151517859135, + "grad_norm": 0.4458340108394623, + "learning_rate": 6.766257403499019e-05, + "loss": 0.9685, + "step": 4516 + }, + { + "epoch": 0.40360086670985323, + "grad_norm": 0.5055366158485413, + "learning_rate": 6.764903552874125e-05, + "loss": 0.9154, + "step": 4517 + }, + { + "epoch": 0.4036902182411151, + "grad_norm": 0.4424796998500824, + "learning_rate": 6.763549554419743e-05, + "loss": 0.9438, + "step": 4518 + }, + { + "epoch": 0.403779569772377, + "grad_norm": 0.4608469605445862, + "learning_rate": 6.762195408249288e-05, + "loss": 0.9699, + "step": 4519 + }, + { + "epoch": 0.40386892130363883, + "grad_norm": 0.4548738896846771, + "learning_rate": 6.760841114476185e-05, + "loss": 1.0052, + "step": 4520 + }, + { + "epoch": 0.4039582728349007, + "grad_norm": 0.5669234395027161, + "learning_rate": 6.759486673213869e-05, + "loss": 1.0287, + "step": 4521 + }, + { + "epoch": 0.4040476243661626, + "grad_norm": 0.4517204165458679, + "learning_rate": 6.75813208457579e-05, + "loss": 0.909, + "step": 4522 + }, + { + "epoch": 0.4041369758974244, + "grad_norm": 0.5192089080810547, + "learning_rate": 6.756777348675407e-05, + "loss": 0.9845, + "step": 4523 + }, + { + "epoch": 0.4042263274286863, + "grad_norm": 0.4186631739139557, + "learning_rate": 6.755422465626196e-05, + "loss": 0.9981, + "step": 4524 + }, + { + "epoch": 0.4043156789599482, + "grad_norm": 0.4013117551803589, + "learning_rate": 6.754067435541642e-05, + "loss": 0.9479, + "step": 4525 + }, + { + "epoch": 0.40440503049121, + "grad_norm": 0.4642598330974579, + "learning_rate": 6.752712258535244e-05, + "loss": 0.9661, + "step": 4526 + }, + { + "epoch": 0.4044943820224719, + "grad_norm": 0.5033729672431946, + "learning_rate": 6.751356934720511e-05, + "loss": 0.8962, + "step": 4527 + }, + { + "epoch": 0.4045837335537338, + "grad_norm": 0.4341753125190735, + "learning_rate": 6.750001464210967e-05, + "loss": 0.976, + "step": 4528 + }, + { + "epoch": 0.40467308508499567, + "grad_norm": 0.5763057470321655, + "learning_rate": 6.748645847120146e-05, + "loss": 0.9349, + "step": 4529 + }, + { + "epoch": 0.4047624366162575, + "grad_norm": 0.5385793447494507, + "learning_rate": 6.747290083561596e-05, + "loss": 0.9334, + "step": 4530 + }, + { + "epoch": 0.4048517881475194, + "grad_norm": 0.42370885610580444, + "learning_rate": 6.745934173648876e-05, + "loss": 0.9816, + "step": 4531 + }, + { + "epoch": 0.40494113967878126, + "grad_norm": 0.48537033796310425, + "learning_rate": 6.744578117495562e-05, + "loss": 0.9148, + "step": 4532 + }, + { + "epoch": 0.4050304912100431, + "grad_norm": 0.46244072914123535, + "learning_rate": 6.743221915215232e-05, + "loss": 0.9388, + "step": 4533 + }, + { + "epoch": 0.40511984274130497, + "grad_norm": 0.40350237488746643, + "learning_rate": 6.741865566921484e-05, + "loss": 0.9161, + "step": 4534 + }, + { + "epoch": 0.40520919427256685, + "grad_norm": 0.45283472537994385, + "learning_rate": 6.740509072727931e-05, + "loss": 1.0257, + "step": 4535 + }, + { + "epoch": 0.40529854580382874, + "grad_norm": 0.41299960017204285, + "learning_rate": 6.73915243274819e-05, + "loss": 1.0064, + "step": 4536 + }, + { + "epoch": 0.40538789733509056, + "grad_norm": 0.4359651207923889, + "learning_rate": 6.737795647095893e-05, + "loss": 0.9863, + "step": 4537 + }, + { + "epoch": 0.40547724886635245, + "grad_norm": 0.477792888879776, + "learning_rate": 6.736438715884688e-05, + "loss": 0.957, + "step": 4538 + }, + { + "epoch": 0.40556660039761433, + "grad_norm": 0.4058784544467926, + "learning_rate": 6.735081639228232e-05, + "loss": 0.971, + "step": 4539 + }, + { + "epoch": 0.40565595192887616, + "grad_norm": 0.45812395215034485, + "learning_rate": 6.733724417240194e-05, + "loss": 0.9408, + "step": 4540 + }, + { + "epoch": 0.40574530346013804, + "grad_norm": 0.47528955340385437, + "learning_rate": 6.732367050034254e-05, + "loss": 0.9003, + "step": 4541 + }, + { + "epoch": 0.4058346549913999, + "grad_norm": 0.3834351897239685, + "learning_rate": 6.731009537724112e-05, + "loss": 0.9874, + "step": 4542 + }, + { + "epoch": 0.4059240065226618, + "grad_norm": 0.3695882260799408, + "learning_rate": 6.729651880423465e-05, + "loss": 1.0228, + "step": 4543 + }, + { + "epoch": 0.40601335805392363, + "grad_norm": 0.4406605064868927, + "learning_rate": 6.728294078246038e-05, + "loss": 1.0354, + "step": 4544 + }, + { + "epoch": 0.4061027095851855, + "grad_norm": 0.4965038001537323, + "learning_rate": 6.72693613130556e-05, + "loss": 0.912, + "step": 4545 + }, + { + "epoch": 0.4061920611164474, + "grad_norm": 0.38543984293937683, + "learning_rate": 6.725578039715774e-05, + "loss": 1.0156, + "step": 4546 + }, + { + "epoch": 0.40628141264770923, + "grad_norm": 0.49438318610191345, + "learning_rate": 6.724219803590433e-05, + "loss": 0.9627, + "step": 4547 + }, + { + "epoch": 0.4063707641789711, + "grad_norm": 0.3538610637187958, + "learning_rate": 6.722861423043305e-05, + "loss": 1.0335, + "step": 4548 + }, + { + "epoch": 0.406460115710233, + "grad_norm": 0.3865319490432739, + "learning_rate": 6.721502898188167e-05, + "loss": 1.0594, + "step": 4549 + }, + { + "epoch": 0.4065494672414949, + "grad_norm": 0.395279198884964, + "learning_rate": 6.720144229138813e-05, + "loss": 0.9866, + "step": 4550 + }, + { + "epoch": 0.4066388187727567, + "grad_norm": 0.5369691252708435, + "learning_rate": 6.718785416009044e-05, + "loss": 0.9146, + "step": 4551 + }, + { + "epoch": 0.4067281703040186, + "grad_norm": 0.43310627341270447, + "learning_rate": 6.717426458912675e-05, + "loss": 0.9597, + "step": 4552 + }, + { + "epoch": 0.40681752183528047, + "grad_norm": 0.4922178387641907, + "learning_rate": 6.716067357963535e-05, + "loss": 0.9746, + "step": 4553 + }, + { + "epoch": 0.4069068733665423, + "grad_norm": 0.43535903096199036, + "learning_rate": 6.714708113275461e-05, + "loss": 0.967, + "step": 4554 + }, + { + "epoch": 0.4069962248978042, + "grad_norm": 0.44184789061546326, + "learning_rate": 6.713348724962305e-05, + "loss": 0.8968, + "step": 4555 + }, + { + "epoch": 0.40708557642906606, + "grad_norm": 0.4985772371292114, + "learning_rate": 6.711989193137929e-05, + "loss": 0.905, + "step": 4556 + }, + { + "epoch": 0.4071749279603279, + "grad_norm": 0.40521472692489624, + "learning_rate": 6.710629517916211e-05, + "loss": 1.0015, + "step": 4557 + }, + { + "epoch": 0.4072642794915898, + "grad_norm": 0.3867556154727936, + "learning_rate": 6.709269699411038e-05, + "loss": 0.9735, + "step": 4558 + }, + { + "epoch": 0.40735363102285166, + "grad_norm": 0.46226397156715393, + "learning_rate": 6.707909737736306e-05, + "loss": 0.8992, + "step": 4559 + }, + { + "epoch": 0.40744298255411354, + "grad_norm": 0.43801769614219666, + "learning_rate": 6.70654963300593e-05, + "loss": 0.9856, + "step": 4560 + }, + { + "epoch": 0.40753233408537537, + "grad_norm": 0.45655280351638794, + "learning_rate": 6.70518938533383e-05, + "loss": 0.9502, + "step": 4561 + }, + { + "epoch": 0.40762168561663725, + "grad_norm": 0.4761228561401367, + "learning_rate": 6.703828994833944e-05, + "loss": 0.9812, + "step": 4562 + }, + { + "epoch": 0.40771103714789914, + "grad_norm": 0.5070672631263733, + "learning_rate": 6.702468461620218e-05, + "loss": 0.8713, + "step": 4563 + }, + { + "epoch": 0.40780038867916096, + "grad_norm": 0.3671000599861145, + "learning_rate": 6.701107785806612e-05, + "loss": 1.0342, + "step": 4564 + }, + { + "epoch": 0.40788974021042285, + "grad_norm": 0.4318379759788513, + "learning_rate": 6.699746967507095e-05, + "loss": 0.9301, + "step": 4565 + }, + { + "epoch": 0.40797909174168473, + "grad_norm": 0.45091480016708374, + "learning_rate": 6.698386006835653e-05, + "loss": 1.0128, + "step": 4566 + }, + { + "epoch": 0.4080684432729466, + "grad_norm": 0.45846810936927795, + "learning_rate": 6.697024903906279e-05, + "loss": 1.044, + "step": 4567 + }, + { + "epoch": 0.40815779480420844, + "grad_norm": 0.42733606696128845, + "learning_rate": 6.695663658832981e-05, + "loss": 0.9401, + "step": 4568 + }, + { + "epoch": 0.4082471463354703, + "grad_norm": 0.48478513956069946, + "learning_rate": 6.694302271729774e-05, + "loss": 0.9547, + "step": 4569 + }, + { + "epoch": 0.4083364978667322, + "grad_norm": 0.42427879571914673, + "learning_rate": 6.692940742710694e-05, + "loss": 0.9735, + "step": 4570 + }, + { + "epoch": 0.40842584939799403, + "grad_norm": 0.40262219309806824, + "learning_rate": 6.691579071889782e-05, + "loss": 0.9945, + "step": 4571 + }, + { + "epoch": 0.4085152009292559, + "grad_norm": 0.445208877325058, + "learning_rate": 6.690217259381091e-05, + "loss": 0.9534, + "step": 4572 + }, + { + "epoch": 0.4086045524605178, + "grad_norm": 0.40912604331970215, + "learning_rate": 6.688855305298688e-05, + "loss": 0.9062, + "step": 4573 + }, + { + "epoch": 0.4086939039917797, + "grad_norm": 0.4119530916213989, + "learning_rate": 6.687493209756653e-05, + "loss": 0.9902, + "step": 4574 + }, + { + "epoch": 0.4087832555230415, + "grad_norm": 0.43412020802497864, + "learning_rate": 6.686130972869072e-05, + "loss": 0.9655, + "step": 4575 + }, + { + "epoch": 0.4088726070543034, + "grad_norm": 0.4143422842025757, + "learning_rate": 6.68476859475005e-05, + "loss": 0.9909, + "step": 4576 + }, + { + "epoch": 0.4089619585855653, + "grad_norm": 0.3809065520763397, + "learning_rate": 6.6834060755137e-05, + "loss": 1.0173, + "step": 4577 + }, + { + "epoch": 0.4090513101168271, + "grad_norm": 0.5703213810920715, + "learning_rate": 6.682043415274147e-05, + "loss": 0.9383, + "step": 4578 + }, + { + "epoch": 0.409140661648089, + "grad_norm": 0.37264561653137207, + "learning_rate": 6.680680614145529e-05, + "loss": 0.9856, + "step": 4579 + }, + { + "epoch": 0.40923001317935087, + "grad_norm": 0.4544839859008789, + "learning_rate": 6.679317672241994e-05, + "loss": 0.9899, + "step": 4580 + }, + { + "epoch": 0.40931936471061275, + "grad_norm": 0.3862614929676056, + "learning_rate": 6.677954589677705e-05, + "loss": 1.0016, + "step": 4581 + }, + { + "epoch": 0.4094087162418746, + "grad_norm": 0.4736924171447754, + "learning_rate": 6.676591366566831e-05, + "loss": 0.9096, + "step": 4582 + }, + { + "epoch": 0.40949806777313646, + "grad_norm": 0.4209919273853302, + "learning_rate": 6.675228003023561e-05, + "loss": 0.9689, + "step": 4583 + }, + { + "epoch": 0.40958741930439835, + "grad_norm": 0.416951060295105, + "learning_rate": 6.673864499162089e-05, + "loss": 0.9733, + "step": 4584 + }, + { + "epoch": 0.4096767708356602, + "grad_norm": 0.44141101837158203, + "learning_rate": 6.672500855096623e-05, + "loss": 0.9765, + "step": 4585 + }, + { + "epoch": 0.40976612236692206, + "grad_norm": 0.4138414263725281, + "learning_rate": 6.671137070941383e-05, + "loss": 0.9467, + "step": 4586 + }, + { + "epoch": 0.40985547389818394, + "grad_norm": 0.42416608333587646, + "learning_rate": 6.669773146810599e-05, + "loss": 0.9659, + "step": 4587 + }, + { + "epoch": 0.40994482542944577, + "grad_norm": 0.4376296401023865, + "learning_rate": 6.668409082818517e-05, + "loss": 1.0398, + "step": 4588 + }, + { + "epoch": 0.41003417696070765, + "grad_norm": 0.4650912880897522, + "learning_rate": 6.66704487907939e-05, + "loss": 0.9026, + "step": 4589 + }, + { + "epoch": 0.41012352849196954, + "grad_norm": 0.436990350484848, + "learning_rate": 6.665680535707485e-05, + "loss": 1.0163, + "step": 4590 + }, + { + "epoch": 0.4102128800232314, + "grad_norm": 0.42195725440979004, + "learning_rate": 6.664316052817079e-05, + "loss": 0.9237, + "step": 4591 + }, + { + "epoch": 0.41030223155449325, + "grad_norm": 0.4974232017993927, + "learning_rate": 6.662951430522464e-05, + "loss": 0.9441, + "step": 4592 + }, + { + "epoch": 0.41039158308575513, + "grad_norm": 0.43734729290008545, + "learning_rate": 6.661586668937943e-05, + "loss": 0.9482, + "step": 4593 + }, + { + "epoch": 0.410480934617017, + "grad_norm": 0.45577242970466614, + "learning_rate": 6.660221768177824e-05, + "loss": 0.937, + "step": 4594 + }, + { + "epoch": 0.41057028614827884, + "grad_norm": 0.4594924747943878, + "learning_rate": 6.658856728356437e-05, + "loss": 0.9181, + "step": 4595 + }, + { + "epoch": 0.4106596376795407, + "grad_norm": 0.441518098115921, + "learning_rate": 6.657491549588115e-05, + "loss": 0.9669, + "step": 4596 + }, + { + "epoch": 0.4107489892108026, + "grad_norm": 0.4046085774898529, + "learning_rate": 6.65612623198721e-05, + "loss": 0.9289, + "step": 4597 + }, + { + "epoch": 0.4108383407420645, + "grad_norm": 0.4187487065792084, + "learning_rate": 6.65476077566808e-05, + "loss": 1.0489, + "step": 4598 + }, + { + "epoch": 0.4109276922733263, + "grad_norm": 0.4384293854236603, + "learning_rate": 6.653395180745095e-05, + "loss": 0.9635, + "step": 4599 + }, + { + "epoch": 0.4110170438045882, + "grad_norm": 0.38338857889175415, + "learning_rate": 6.652029447332641e-05, + "loss": 1.0088, + "step": 4600 + }, + { + "epoch": 0.4111063953358501, + "grad_norm": 0.5203673839569092, + "learning_rate": 6.650663575545111e-05, + "loss": 0.9381, + "step": 4601 + }, + { + "epoch": 0.4111957468671119, + "grad_norm": 0.43095913529396057, + "learning_rate": 6.649297565496911e-05, + "loss": 1.0205, + "step": 4602 + }, + { + "epoch": 0.4112850983983738, + "grad_norm": 0.43971747159957886, + "learning_rate": 6.647931417302462e-05, + "loss": 0.9816, + "step": 4603 + }, + { + "epoch": 0.4113744499296357, + "grad_norm": 0.3989725708961487, + "learning_rate": 6.64656513107619e-05, + "loss": 0.9747, + "step": 4604 + }, + { + "epoch": 0.41146380146089756, + "grad_norm": 0.47779417037963867, + "learning_rate": 6.645198706932536e-05, + "loss": 0.9923, + "step": 4605 + }, + { + "epoch": 0.4115531529921594, + "grad_norm": 0.4292563199996948, + "learning_rate": 6.643832144985955e-05, + "loss": 0.944, + "step": 4606 + }, + { + "epoch": 0.41164250452342127, + "grad_norm": 0.5014485120773315, + "learning_rate": 6.64246544535091e-05, + "loss": 0.9265, + "step": 4607 + }, + { + "epoch": 0.41173185605468315, + "grad_norm": 0.43778491020202637, + "learning_rate": 6.641098608141874e-05, + "loss": 0.9643, + "step": 4608 + }, + { + "epoch": 0.411821207585945, + "grad_norm": 0.6082847714424133, + "learning_rate": 6.63973163347334e-05, + "loss": 0.9869, + "step": 4609 + }, + { + "epoch": 0.41191055911720686, + "grad_norm": 0.4783382713794708, + "learning_rate": 6.638364521459802e-05, + "loss": 1.0311, + "step": 4610 + }, + { + "epoch": 0.41199991064846875, + "grad_norm": 0.5080249905586243, + "learning_rate": 6.636997272215772e-05, + "loss": 0.9857, + "step": 4611 + }, + { + "epoch": 0.41208926217973063, + "grad_norm": 0.3644493520259857, + "learning_rate": 6.63562988585577e-05, + "loss": 1.006, + "step": 4612 + }, + { + "epoch": 0.41217861371099246, + "grad_norm": 0.4645083248615265, + "learning_rate": 6.634262362494332e-05, + "loss": 0.9701, + "step": 4613 + }, + { + "epoch": 0.41226796524225434, + "grad_norm": 0.48680824041366577, + "learning_rate": 6.632894702246001e-05, + "loss": 0.9591, + "step": 4614 + }, + { + "epoch": 0.4123573167735162, + "grad_norm": 0.4588204622268677, + "learning_rate": 6.631526905225333e-05, + "loss": 0.926, + "step": 4615 + }, + { + "epoch": 0.41244666830477805, + "grad_norm": 0.43005460500717163, + "learning_rate": 6.630158971546896e-05, + "loss": 1.0193, + "step": 4616 + }, + { + "epoch": 0.41253601983603994, + "grad_norm": 0.4423251152038574, + "learning_rate": 6.628790901325267e-05, + "loss": 0.8798, + "step": 4617 + }, + { + "epoch": 0.4126253713673018, + "grad_norm": 0.5219715237617493, + "learning_rate": 6.627422694675042e-05, + "loss": 1.0016, + "step": 4618 + }, + { + "epoch": 0.4127147228985637, + "grad_norm": 0.4232001304626465, + "learning_rate": 6.626054351710817e-05, + "loss": 0.9672, + "step": 4619 + }, + { + "epoch": 0.41280407442982553, + "grad_norm": 0.47682496905326843, + "learning_rate": 6.624685872547207e-05, + "loss": 0.9314, + "step": 4620 + }, + { + "epoch": 0.4128934259610874, + "grad_norm": 0.46166396141052246, + "learning_rate": 6.623317257298837e-05, + "loss": 1.015, + "step": 4621 + }, + { + "epoch": 0.4129827774923493, + "grad_norm": 0.39371028542518616, + "learning_rate": 6.621948506080345e-05, + "loss": 0.9835, + "step": 4622 + }, + { + "epoch": 0.4130721290236111, + "grad_norm": 0.41618743538856506, + "learning_rate": 6.620579619006377e-05, + "loss": 0.9857, + "step": 4623 + }, + { + "epoch": 0.413161480554873, + "grad_norm": 0.4997164011001587, + "learning_rate": 6.619210596191592e-05, + "loss": 0.8842, + "step": 4624 + }, + { + "epoch": 0.4132508320861349, + "grad_norm": 0.45205265283584595, + "learning_rate": 6.61784143775066e-05, + "loss": 0.9845, + "step": 4625 + }, + { + "epoch": 0.4133401836173967, + "grad_norm": 0.49516791105270386, + "learning_rate": 6.616472143798261e-05, + "loss": 0.8691, + "step": 4626 + }, + { + "epoch": 0.4134295351486586, + "grad_norm": 0.4403949975967407, + "learning_rate": 6.615102714449089e-05, + "loss": 0.9634, + "step": 4627 + }, + { + "epoch": 0.4135188866799205, + "grad_norm": 0.43604418635368347, + "learning_rate": 6.613733149817852e-05, + "loss": 1.0116, + "step": 4628 + }, + { + "epoch": 0.41360823821118237, + "grad_norm": 0.4321603775024414, + "learning_rate": 6.612363450019261e-05, + "loss": 0.924, + "step": 4629 + }, + { + "epoch": 0.4136975897424442, + "grad_norm": 0.40170714259147644, + "learning_rate": 6.610993615168044e-05, + "loss": 1.0031, + "step": 4630 + }, + { + "epoch": 0.4137869412737061, + "grad_norm": 0.4581966996192932, + "learning_rate": 6.60962364537894e-05, + "loss": 0.9848, + "step": 4631 + }, + { + "epoch": 0.41387629280496796, + "grad_norm": 0.42870965600013733, + "learning_rate": 6.608253540766698e-05, + "loss": 1.0401, + "step": 4632 + }, + { + "epoch": 0.4139656443362298, + "grad_norm": 0.4049166738986969, + "learning_rate": 6.60688330144608e-05, + "loss": 0.9862, + "step": 4633 + }, + { + "epoch": 0.41405499586749167, + "grad_norm": 0.44598478078842163, + "learning_rate": 6.605512927531858e-05, + "loss": 1.0188, + "step": 4634 + }, + { + "epoch": 0.41414434739875355, + "grad_norm": 0.4548451602458954, + "learning_rate": 6.604142419138812e-05, + "loss": 0.9463, + "step": 4635 + }, + { + "epoch": 0.41423369893001544, + "grad_norm": 0.41547542810440063, + "learning_rate": 6.602771776381743e-05, + "loss": 1.0352, + "step": 4636 + }, + { + "epoch": 0.41432305046127726, + "grad_norm": 0.42148005962371826, + "learning_rate": 6.60140099937545e-05, + "loss": 0.944, + "step": 4637 + }, + { + "epoch": 0.41441240199253915, + "grad_norm": 0.41155093908309937, + "learning_rate": 6.600030088234755e-05, + "loss": 0.9639, + "step": 4638 + }, + { + "epoch": 0.41450175352380103, + "grad_norm": 0.4119633436203003, + "learning_rate": 6.598659043074487e-05, + "loss": 0.9254, + "step": 4639 + }, + { + "epoch": 0.41459110505506286, + "grad_norm": 0.4401843547821045, + "learning_rate": 6.59728786400948e-05, + "loss": 0.9798, + "step": 4640 + }, + { + "epoch": 0.41468045658632474, + "grad_norm": 0.4861004054546356, + "learning_rate": 6.595916551154591e-05, + "loss": 0.9584, + "step": 4641 + }, + { + "epoch": 0.4147698081175866, + "grad_norm": 0.44102394580841064, + "learning_rate": 6.59454510462468e-05, + "loss": 0.9108, + "step": 4642 + }, + { + "epoch": 0.4148591596488485, + "grad_norm": 0.4617249667644501, + "learning_rate": 6.593173524534619e-05, + "loss": 0.9601, + "step": 4643 + }, + { + "epoch": 0.41494851118011034, + "grad_norm": 0.5489480495452881, + "learning_rate": 6.591801810999294e-05, + "loss": 0.9668, + "step": 4644 + }, + { + "epoch": 0.4150378627113722, + "grad_norm": 0.5144347548484802, + "learning_rate": 6.590429964133599e-05, + "loss": 0.9139, + "step": 4645 + }, + { + "epoch": 0.4151272142426341, + "grad_norm": 0.4274512827396393, + "learning_rate": 6.589057984052441e-05, + "loss": 1.001, + "step": 4646 + }, + { + "epoch": 0.41521656577389593, + "grad_norm": 0.4437926709651947, + "learning_rate": 6.58768587087074e-05, + "loss": 0.9184, + "step": 4647 + }, + { + "epoch": 0.4153059173051578, + "grad_norm": 0.46083325147628784, + "learning_rate": 6.586313624703423e-05, + "loss": 1.0409, + "step": 4648 + }, + { + "epoch": 0.4153952688364197, + "grad_norm": 0.46094614267349243, + "learning_rate": 6.584941245665432e-05, + "loss": 0.9456, + "step": 4649 + }, + { + "epoch": 0.4154846203676816, + "grad_norm": 0.3888949751853943, + "learning_rate": 6.583568733871716e-05, + "loss": 0.9999, + "step": 4650 + }, + { + "epoch": 0.4155739718989434, + "grad_norm": 0.4531100392341614, + "learning_rate": 6.58219608943724e-05, + "loss": 1.0092, + "step": 4651 + }, + { + "epoch": 0.4156633234302053, + "grad_norm": 0.44666728377342224, + "learning_rate": 6.580823312476976e-05, + "loss": 0.961, + "step": 4652 + }, + { + "epoch": 0.4157526749614672, + "grad_norm": 0.39835554361343384, + "learning_rate": 6.579450403105909e-05, + "loss": 0.9973, + "step": 4653 + }, + { + "epoch": 0.415842026492729, + "grad_norm": 0.4402940571308136, + "learning_rate": 6.578077361439037e-05, + "loss": 0.9221, + "step": 4654 + }, + { + "epoch": 0.4159313780239909, + "grad_norm": 0.43528056144714355, + "learning_rate": 6.576704187591362e-05, + "loss": 0.9335, + "step": 4655 + }, + { + "epoch": 0.41602072955525277, + "grad_norm": 0.48147714138031006, + "learning_rate": 6.575330881677907e-05, + "loss": 0.9158, + "step": 4656 + }, + { + "epoch": 0.4161100810865146, + "grad_norm": 0.4357036352157593, + "learning_rate": 6.573957443813698e-05, + "loss": 0.9688, + "step": 4657 + }, + { + "epoch": 0.4161994326177765, + "grad_norm": 0.41116756200790405, + "learning_rate": 6.572583874113777e-05, + "loss": 0.9678, + "step": 4658 + }, + { + "epoch": 0.41628878414903836, + "grad_norm": 0.3910764753818512, + "learning_rate": 6.571210172693192e-05, + "loss": 1.0419, + "step": 4659 + }, + { + "epoch": 0.41637813568030024, + "grad_norm": 0.5164288878440857, + "learning_rate": 6.569836339667009e-05, + "loss": 0.8935, + "step": 4660 + }, + { + "epoch": 0.41646748721156207, + "grad_norm": 0.48171505331993103, + "learning_rate": 6.568462375150298e-05, + "loss": 1.0256, + "step": 4661 + }, + { + "epoch": 0.41655683874282395, + "grad_norm": 0.43852394819259644, + "learning_rate": 6.567088279258144e-05, + "loss": 0.925, + "step": 4662 + }, + { + "epoch": 0.41664619027408584, + "grad_norm": 0.42417702078819275, + "learning_rate": 6.565714052105645e-05, + "loss": 0.9589, + "step": 4663 + }, + { + "epoch": 0.41673554180534766, + "grad_norm": 0.42217710614204407, + "learning_rate": 6.564339693807904e-05, + "loss": 0.9391, + "step": 4664 + }, + { + "epoch": 0.41682489333660955, + "grad_norm": 0.48488372564315796, + "learning_rate": 6.562965204480039e-05, + "loss": 0.9816, + "step": 4665 + }, + { + "epoch": 0.41691424486787143, + "grad_norm": 0.424485981464386, + "learning_rate": 6.561590584237176e-05, + "loss": 1.0454, + "step": 4666 + }, + { + "epoch": 0.4170035963991333, + "grad_norm": 0.40469837188720703, + "learning_rate": 6.560215833194457e-05, + "loss": 1.0021, + "step": 4667 + }, + { + "epoch": 0.41709294793039514, + "grad_norm": 0.4954060912132263, + "learning_rate": 6.558840951467033e-05, + "loss": 1.0377, + "step": 4668 + }, + { + "epoch": 0.417182299461657, + "grad_norm": 0.47317901253700256, + "learning_rate": 6.557465939170062e-05, + "loss": 0.9776, + "step": 4669 + }, + { + "epoch": 0.4172716509929189, + "grad_norm": 0.4012228846549988, + "learning_rate": 6.556090796418717e-05, + "loss": 0.9594, + "step": 4670 + }, + { + "epoch": 0.41736100252418074, + "grad_norm": 0.4025217890739441, + "learning_rate": 6.554715523328181e-05, + "loss": 0.9784, + "step": 4671 + }, + { + "epoch": 0.4174503540554426, + "grad_norm": 0.4811991751194, + "learning_rate": 6.553340120013649e-05, + "loss": 0.9726, + "step": 4672 + }, + { + "epoch": 0.4175397055867045, + "grad_norm": 0.4033472239971161, + "learning_rate": 6.551964586590323e-05, + "loss": 0.9814, + "step": 4673 + }, + { + "epoch": 0.4176290571179664, + "grad_norm": 0.45485714077949524, + "learning_rate": 6.550588923173422e-05, + "loss": 0.9099, + "step": 4674 + }, + { + "epoch": 0.4177184086492282, + "grad_norm": 0.48728662729263306, + "learning_rate": 6.549213129878169e-05, + "loss": 0.9082, + "step": 4675 + }, + { + "epoch": 0.4178077601804901, + "grad_norm": 0.46427708864212036, + "learning_rate": 6.547837206819804e-05, + "loss": 0.9797, + "step": 4676 + }, + { + "epoch": 0.417897111711752, + "grad_norm": 0.5304161310195923, + "learning_rate": 6.546461154113575e-05, + "loss": 0.9526, + "step": 4677 + }, + { + "epoch": 0.4179864632430138, + "grad_norm": 0.5012792348861694, + "learning_rate": 6.545084971874738e-05, + "loss": 0.9665, + "step": 4678 + }, + { + "epoch": 0.4180758147742757, + "grad_norm": 0.41334888339042664, + "learning_rate": 6.543708660218566e-05, + "loss": 0.945, + "step": 4679 + }, + { + "epoch": 0.4181651663055376, + "grad_norm": 0.38999998569488525, + "learning_rate": 6.54233221926034e-05, + "loss": 0.9889, + "step": 4680 + }, + { + "epoch": 0.41825451783679946, + "grad_norm": 0.43242332339286804, + "learning_rate": 6.540955649115349e-05, + "loss": 0.9914, + "step": 4681 + }, + { + "epoch": 0.4183438693680613, + "grad_norm": 0.46687716245651245, + "learning_rate": 6.539578949898896e-05, + "loss": 0.986, + "step": 4682 + }, + { + "epoch": 0.41843322089932317, + "grad_norm": 0.5817546844482422, + "learning_rate": 6.538202121726298e-05, + "loss": 0.8811, + "step": 4683 + }, + { + "epoch": 0.41852257243058505, + "grad_norm": 0.4378966987133026, + "learning_rate": 6.536825164712876e-05, + "loss": 0.9343, + "step": 4684 + }, + { + "epoch": 0.4186119239618469, + "grad_norm": 0.45147213339805603, + "learning_rate": 6.535448078973963e-05, + "loss": 0.928, + "step": 4685 + }, + { + "epoch": 0.41870127549310876, + "grad_norm": 0.5144475102424622, + "learning_rate": 6.534070864624908e-05, + "loss": 0.9623, + "step": 4686 + }, + { + "epoch": 0.41879062702437064, + "grad_norm": 0.44257742166519165, + "learning_rate": 6.532693521781066e-05, + "loss": 0.9797, + "step": 4687 + }, + { + "epoch": 0.41887997855563247, + "grad_norm": 0.4004027247428894, + "learning_rate": 6.531316050557803e-05, + "loss": 1.0237, + "step": 4688 + }, + { + "epoch": 0.41896933008689435, + "grad_norm": 0.5068597793579102, + "learning_rate": 6.529938451070501e-05, + "loss": 0.9238, + "step": 4689 + }, + { + "epoch": 0.41905868161815624, + "grad_norm": 0.4261419177055359, + "learning_rate": 6.528560723434543e-05, + "loss": 0.9867, + "step": 4690 + }, + { + "epoch": 0.4191480331494181, + "grad_norm": 0.5222342014312744, + "learning_rate": 6.527182867765332e-05, + "loss": 0.9864, + "step": 4691 + }, + { + "epoch": 0.41923738468067995, + "grad_norm": 0.5611239075660706, + "learning_rate": 6.525804884178277e-05, + "loss": 0.9771, + "step": 4692 + }, + { + "epoch": 0.41932673621194183, + "grad_norm": 0.3744679391384125, + "learning_rate": 6.524426772788801e-05, + "loss": 0.9883, + "step": 4693 + }, + { + "epoch": 0.4194160877432037, + "grad_norm": 0.4100448787212372, + "learning_rate": 6.523048533712331e-05, + "loss": 1.0148, + "step": 4694 + }, + { + "epoch": 0.41950543927446554, + "grad_norm": 0.42226895689964294, + "learning_rate": 6.521670167064313e-05, + "loss": 1.027, + "step": 4695 + }, + { + "epoch": 0.4195947908057274, + "grad_norm": 0.450785756111145, + "learning_rate": 6.5202916729602e-05, + "loss": 0.9611, + "step": 4696 + }, + { + "epoch": 0.4196841423369893, + "grad_norm": 0.4674597382545471, + "learning_rate": 6.518913051515451e-05, + "loss": 0.9714, + "step": 4697 + }, + { + "epoch": 0.4197734938682512, + "grad_norm": 0.5814161896705627, + "learning_rate": 6.517534302845545e-05, + "loss": 0.8825, + "step": 4698 + }, + { + "epoch": 0.419862845399513, + "grad_norm": 0.4287194609642029, + "learning_rate": 6.516155427065967e-05, + "loss": 1.0032, + "step": 4699 + }, + { + "epoch": 0.4199521969307749, + "grad_norm": 0.466623991727829, + "learning_rate": 6.514776424292208e-05, + "loss": 0.9162, + "step": 4700 + }, + { + "epoch": 0.4200415484620368, + "grad_norm": 0.5070553421974182, + "learning_rate": 6.513397294639778e-05, + "loss": 1.0066, + "step": 4701 + }, + { + "epoch": 0.4201308999932986, + "grad_norm": 0.42103180289268494, + "learning_rate": 6.512018038224194e-05, + "loss": 0.9602, + "step": 4702 + }, + { + "epoch": 0.4202202515245605, + "grad_norm": 0.49392175674438477, + "learning_rate": 6.510638655160981e-05, + "loss": 0.9859, + "step": 4703 + }, + { + "epoch": 0.4203096030558224, + "grad_norm": 0.46998798847198486, + "learning_rate": 6.509259145565681e-05, + "loss": 0.9445, + "step": 4704 + }, + { + "epoch": 0.42039895458708426, + "grad_norm": 0.47235095500946045, + "learning_rate": 6.507879509553837e-05, + "loss": 0.942, + "step": 4705 + }, + { + "epoch": 0.4204883061183461, + "grad_norm": 0.4976077973842621, + "learning_rate": 6.506499747241013e-05, + "loss": 0.9331, + "step": 4706 + }, + { + "epoch": 0.42057765764960797, + "grad_norm": 0.45665648579597473, + "learning_rate": 6.505119858742775e-05, + "loss": 1.0242, + "step": 4707 + }, + { + "epoch": 0.42066700918086986, + "grad_norm": 0.5178200602531433, + "learning_rate": 6.503739844174708e-05, + "loss": 0.9287, + "step": 4708 + }, + { + "epoch": 0.4207563607121317, + "grad_norm": 0.41020286083221436, + "learning_rate": 6.502359703652398e-05, + "loss": 1.0093, + "step": 4709 + }, + { + "epoch": 0.42084571224339357, + "grad_norm": 0.44288718700408936, + "learning_rate": 6.500979437291451e-05, + "loss": 1.0115, + "step": 4710 + }, + { + "epoch": 0.42093506377465545, + "grad_norm": 0.4006830155849457, + "learning_rate": 6.499599045207475e-05, + "loss": 0.9736, + "step": 4711 + }, + { + "epoch": 0.42102441530591733, + "grad_norm": 0.4733505845069885, + "learning_rate": 6.498218527516097e-05, + "loss": 0.9286, + "step": 4712 + }, + { + "epoch": 0.42111376683717916, + "grad_norm": 0.48651424050331116, + "learning_rate": 6.496837884332945e-05, + "loss": 1.0209, + "step": 4713 + }, + { + "epoch": 0.42120311836844104, + "grad_norm": 0.38982510566711426, + "learning_rate": 6.495457115773667e-05, + "loss": 1.0108, + "step": 4714 + }, + { + "epoch": 0.4212924698997029, + "grad_norm": 0.47377124428749084, + "learning_rate": 6.494076221953912e-05, + "loss": 0.986, + "step": 4715 + }, + { + "epoch": 0.42138182143096475, + "grad_norm": 0.3638954758644104, + "learning_rate": 6.492695202989351e-05, + "loss": 1.0072, + "step": 4716 + }, + { + "epoch": 0.42147117296222664, + "grad_norm": 0.49875038862228394, + "learning_rate": 6.491314058995654e-05, + "loss": 0.9364, + "step": 4717 + }, + { + "epoch": 0.4215605244934885, + "grad_norm": 0.41740208864212036, + "learning_rate": 6.489932790088508e-05, + "loss": 0.9213, + "step": 4718 + }, + { + "epoch": 0.42164987602475035, + "grad_norm": 0.40949100255966187, + "learning_rate": 6.48855139638361e-05, + "loss": 0.9908, + "step": 4719 + }, + { + "epoch": 0.42173922755601223, + "grad_norm": 0.4184902608394623, + "learning_rate": 6.487169877996667e-05, + "loss": 0.9252, + "step": 4720 + }, + { + "epoch": 0.4218285790872741, + "grad_norm": 0.3919154405593872, + "learning_rate": 6.485788235043392e-05, + "loss": 1.0046, + "step": 4721 + }, + { + "epoch": 0.421917930618536, + "grad_norm": 0.44090232253074646, + "learning_rate": 6.484406467639516e-05, + "loss": 1.0108, + "step": 4722 + }, + { + "epoch": 0.4220072821497978, + "grad_norm": 0.407052606344223, + "learning_rate": 6.483024575900776e-05, + "loss": 1.01, + "step": 4723 + }, + { + "epoch": 0.4220966336810597, + "grad_norm": 0.5126084089279175, + "learning_rate": 6.481642559942919e-05, + "loss": 0.9421, + "step": 4724 + }, + { + "epoch": 0.4221859852123216, + "grad_norm": 0.3954399824142456, + "learning_rate": 6.480260419881706e-05, + "loss": 1.0176, + "step": 4725 + }, + { + "epoch": 0.4222753367435834, + "grad_norm": 0.4803306758403778, + "learning_rate": 6.478878155832903e-05, + "loss": 0.8829, + "step": 4726 + }, + { + "epoch": 0.4223646882748453, + "grad_norm": 0.4403332769870758, + "learning_rate": 6.477495767912292e-05, + "loss": 0.9965, + "step": 4727 + }, + { + "epoch": 0.4224540398061072, + "grad_norm": 0.4345102906227112, + "learning_rate": 6.476113256235661e-05, + "loss": 0.9472, + "step": 4728 + }, + { + "epoch": 0.42254339133736907, + "grad_norm": 0.4194163978099823, + "learning_rate": 6.47473062091881e-05, + "loss": 1.0142, + "step": 4729 + }, + { + "epoch": 0.4226327428686309, + "grad_norm": 0.5548313856124878, + "learning_rate": 6.473347862077552e-05, + "loss": 0.9398, + "step": 4730 + }, + { + "epoch": 0.4227220943998928, + "grad_norm": 0.49894118309020996, + "learning_rate": 6.471964979827702e-05, + "loss": 0.9525, + "step": 4731 + }, + { + "epoch": 0.42281144593115466, + "grad_norm": 0.5956845879554749, + "learning_rate": 6.470581974285098e-05, + "loss": 0.9364, + "step": 4732 + }, + { + "epoch": 0.4229007974624165, + "grad_norm": 0.5391216278076172, + "learning_rate": 6.469198845565577e-05, + "loss": 0.9362, + "step": 4733 + }, + { + "epoch": 0.42299014899367837, + "grad_norm": 0.4035119414329529, + "learning_rate": 6.467815593784993e-05, + "loss": 0.9826, + "step": 4734 + }, + { + "epoch": 0.42307950052494026, + "grad_norm": 0.43336036801338196, + "learning_rate": 6.466432219059208e-05, + "loss": 0.9782, + "step": 4735 + }, + { + "epoch": 0.42316885205620214, + "grad_norm": 0.43682193756103516, + "learning_rate": 6.465048721504091e-05, + "loss": 0.9663, + "step": 4736 + }, + { + "epoch": 0.42325820358746397, + "grad_norm": 0.49515190720558167, + "learning_rate": 6.46366510123553e-05, + "loss": 0.9055, + "step": 4737 + }, + { + "epoch": 0.42334755511872585, + "grad_norm": 0.4037051200866699, + "learning_rate": 6.462281358369413e-05, + "loss": 0.9359, + "step": 4738 + }, + { + "epoch": 0.42343690664998773, + "grad_norm": 0.4590768814086914, + "learning_rate": 6.460897493021646e-05, + "loss": 0.9446, + "step": 4739 + }, + { + "epoch": 0.42352625818124956, + "grad_norm": 0.45074954628944397, + "learning_rate": 6.459513505308142e-05, + "loss": 0.993, + "step": 4740 + }, + { + "epoch": 0.42361560971251144, + "grad_norm": 0.4910895824432373, + "learning_rate": 6.458129395344825e-05, + "loss": 0.9115, + "step": 4741 + }, + { + "epoch": 0.4237049612437733, + "grad_norm": 0.4874398410320282, + "learning_rate": 6.456745163247628e-05, + "loss": 0.9896, + "step": 4742 + }, + { + "epoch": 0.4237943127750352, + "grad_norm": 0.4627344012260437, + "learning_rate": 6.455360809132496e-05, + "loss": 0.9819, + "step": 4743 + }, + { + "epoch": 0.42388366430629704, + "grad_norm": 0.3980332314968109, + "learning_rate": 6.453976333115383e-05, + "loss": 0.9962, + "step": 4744 + }, + { + "epoch": 0.4239730158375589, + "grad_norm": 0.527260422706604, + "learning_rate": 6.452591735312257e-05, + "loss": 0.8636, + "step": 4745 + }, + { + "epoch": 0.4240623673688208, + "grad_norm": 0.4000941514968872, + "learning_rate": 6.451207015839086e-05, + "loss": 0.9904, + "step": 4746 + }, + { + "epoch": 0.42415171890008263, + "grad_norm": 0.4481007158756256, + "learning_rate": 6.44982217481186e-05, + "loss": 0.9877, + "step": 4747 + }, + { + "epoch": 0.4242410704313445, + "grad_norm": 0.49271902441978455, + "learning_rate": 6.448437212346572e-05, + "loss": 0.8871, + "step": 4748 + }, + { + "epoch": 0.4243304219626064, + "grad_norm": 0.4154813587665558, + "learning_rate": 6.447052128559229e-05, + "loss": 0.9812, + "step": 4749 + }, + { + "epoch": 0.4244197734938682, + "grad_norm": 0.5907759666442871, + "learning_rate": 6.445666923565846e-05, + "loss": 0.9432, + "step": 4750 + }, + { + "epoch": 0.4245091250251301, + "grad_norm": 0.4116978049278259, + "learning_rate": 6.444281597482448e-05, + "loss": 0.9952, + "step": 4751 + }, + { + "epoch": 0.424598476556392, + "grad_norm": 0.4625799357891083, + "learning_rate": 6.44289615042507e-05, + "loss": 1.0082, + "step": 4752 + }, + { + "epoch": 0.4246878280876539, + "grad_norm": 0.4559314548969269, + "learning_rate": 6.441510582509761e-05, + "loss": 1.0048, + "step": 4753 + }, + { + "epoch": 0.4247771796189157, + "grad_norm": 0.4704115390777588, + "learning_rate": 6.440124893852575e-05, + "loss": 0.9848, + "step": 4754 + }, + { + "epoch": 0.4248665311501776, + "grad_norm": 0.40942203998565674, + "learning_rate": 6.438739084569579e-05, + "loss": 0.9624, + "step": 4755 + }, + { + "epoch": 0.42495588268143947, + "grad_norm": 0.3844430446624756, + "learning_rate": 6.437353154776849e-05, + "loss": 1.0214, + "step": 4756 + }, + { + "epoch": 0.4250452342127013, + "grad_norm": 0.4514857232570648, + "learning_rate": 6.435967104590469e-05, + "loss": 0.9475, + "step": 4757 + }, + { + "epoch": 0.4251345857439632, + "grad_norm": 0.44414305686950684, + "learning_rate": 6.43458093412654e-05, + "loss": 0.9882, + "step": 4758 + }, + { + "epoch": 0.42522393727522506, + "grad_norm": 0.5354793667793274, + "learning_rate": 6.433194643501164e-05, + "loss": 0.9163, + "step": 4759 + }, + { + "epoch": 0.42531328880648694, + "grad_norm": 0.4825092554092407, + "learning_rate": 6.431808232830462e-05, + "loss": 0.96, + "step": 4760 + }, + { + "epoch": 0.42540264033774877, + "grad_norm": 0.44994306564331055, + "learning_rate": 6.430421702230556e-05, + "loss": 0.9369, + "step": 4761 + }, + { + "epoch": 0.42549199186901066, + "grad_norm": 0.4433254599571228, + "learning_rate": 6.429035051817588e-05, + "loss": 0.9945, + "step": 4762 + }, + { + "epoch": 0.42558134340027254, + "grad_norm": 0.49267831444740295, + "learning_rate": 6.427648281707701e-05, + "loss": 1.0097, + "step": 4763 + }, + { + "epoch": 0.42567069493153437, + "grad_norm": 0.3941679000854492, + "learning_rate": 6.426261392017052e-05, + "loss": 1.0503, + "step": 4764 + }, + { + "epoch": 0.42576004646279625, + "grad_norm": 0.3821266293525696, + "learning_rate": 6.424874382861811e-05, + "loss": 0.9521, + "step": 4765 + }, + { + "epoch": 0.42584939799405813, + "grad_norm": 0.44064003229141235, + "learning_rate": 6.42348725435815e-05, + "loss": 0.9589, + "step": 4766 + }, + { + "epoch": 0.42593874952532, + "grad_norm": 0.437092661857605, + "learning_rate": 6.422100006622257e-05, + "loss": 1.004, + "step": 4767 + }, + { + "epoch": 0.42602810105658184, + "grad_norm": 0.5033923387527466, + "learning_rate": 6.420712639770333e-05, + "loss": 0.8967, + "step": 4768 + }, + { + "epoch": 0.4261174525878437, + "grad_norm": 0.4463227093219757, + "learning_rate": 6.41932515391858e-05, + "loss": 0.9387, + "step": 4769 + }, + { + "epoch": 0.4262068041191056, + "grad_norm": 0.4098617732524872, + "learning_rate": 6.417937549183218e-05, + "loss": 0.9521, + "step": 4770 + }, + { + "epoch": 0.42629615565036744, + "grad_norm": 0.4326515197753906, + "learning_rate": 6.41654982568047e-05, + "loss": 1.0335, + "step": 4771 + }, + { + "epoch": 0.4263855071816293, + "grad_norm": 0.42155736684799194, + "learning_rate": 6.415161983526576e-05, + "loss": 0.9386, + "step": 4772 + }, + { + "epoch": 0.4264748587128912, + "grad_norm": 0.407709538936615, + "learning_rate": 6.41377402283778e-05, + "loss": 1.0556, + "step": 4773 + }, + { + "epoch": 0.4265642102441531, + "grad_norm": 0.4312109649181366, + "learning_rate": 6.412385943730341e-05, + "loss": 1.004, + "step": 4774 + }, + { + "epoch": 0.4266535617754149, + "grad_norm": 0.5315758585929871, + "learning_rate": 6.410997746320524e-05, + "loss": 0.8835, + "step": 4775 + }, + { + "epoch": 0.4267429133066768, + "grad_norm": 0.40640825033187866, + "learning_rate": 6.409609430724607e-05, + "loss": 1.0284, + "step": 4776 + }, + { + "epoch": 0.4268322648379387, + "grad_norm": 0.3886488676071167, + "learning_rate": 6.408220997058873e-05, + "loss": 0.9606, + "step": 4777 + }, + { + "epoch": 0.4269216163692005, + "grad_norm": 0.47869306802749634, + "learning_rate": 6.40683244543962e-05, + "loss": 1.0092, + "step": 4778 + }, + { + "epoch": 0.4270109679004624, + "grad_norm": 0.5204102396965027, + "learning_rate": 6.405443775983154e-05, + "loss": 0.8834, + "step": 4779 + }, + { + "epoch": 0.4271003194317243, + "grad_norm": 0.5049982666969299, + "learning_rate": 6.404054988805792e-05, + "loss": 0.8793, + "step": 4780 + }, + { + "epoch": 0.4271896709629861, + "grad_norm": 0.48872700333595276, + "learning_rate": 6.402666084023858e-05, + "loss": 0.9817, + "step": 4781 + }, + { + "epoch": 0.427279022494248, + "grad_norm": 0.428631991147995, + "learning_rate": 6.401277061753689e-05, + "loss": 0.9423, + "step": 4782 + }, + { + "epoch": 0.42736837402550987, + "grad_norm": 0.4668726623058319, + "learning_rate": 6.399887922111627e-05, + "loss": 0.9722, + "step": 4783 + }, + { + "epoch": 0.42745772555677175, + "grad_norm": 0.5166489481925964, + "learning_rate": 6.398498665214032e-05, + "loss": 0.9832, + "step": 4784 + }, + { + "epoch": 0.4275470770880336, + "grad_norm": 0.4075476825237274, + "learning_rate": 6.397109291177266e-05, + "loss": 1.0145, + "step": 4785 + }, + { + "epoch": 0.42763642861929546, + "grad_norm": 0.40599527955055237, + "learning_rate": 6.395719800117706e-05, + "loss": 0.9469, + "step": 4786 + }, + { + "epoch": 0.42772578015055734, + "grad_norm": 0.4204799234867096, + "learning_rate": 6.394330192151732e-05, + "loss": 0.9147, + "step": 4787 + }, + { + "epoch": 0.42781513168181917, + "grad_norm": 0.3994778096675873, + "learning_rate": 6.392940467395745e-05, + "loss": 1.0195, + "step": 4788 + }, + { + "epoch": 0.42790448321308105, + "grad_norm": 0.4340113401412964, + "learning_rate": 6.391550625966144e-05, + "loss": 1.0036, + "step": 4789 + }, + { + "epoch": 0.42799383474434294, + "grad_norm": 0.45802706480026245, + "learning_rate": 6.390160667979348e-05, + "loss": 0.9356, + "step": 4790 + }, + { + "epoch": 0.4280831862756048, + "grad_norm": 0.4832572937011719, + "learning_rate": 6.388770593551777e-05, + "loss": 1.0298, + "step": 4791 + }, + { + "epoch": 0.42817253780686665, + "grad_norm": 0.45630741119384766, + "learning_rate": 6.387380402799866e-05, + "loss": 0.9754, + "step": 4792 + }, + { + "epoch": 0.42826188933812853, + "grad_norm": 0.42788875102996826, + "learning_rate": 6.385990095840055e-05, + "loss": 1.0313, + "step": 4793 + }, + { + "epoch": 0.4283512408693904, + "grad_norm": 0.4116227328777313, + "learning_rate": 6.384599672788802e-05, + "loss": 0.9863, + "step": 4794 + }, + { + "epoch": 0.42844059240065224, + "grad_norm": 0.3941948115825653, + "learning_rate": 6.383209133762569e-05, + "loss": 1.0179, + "step": 4795 + }, + { + "epoch": 0.4285299439319141, + "grad_norm": 0.41168758273124695, + "learning_rate": 6.381818478877825e-05, + "loss": 0.979, + "step": 4796 + }, + { + "epoch": 0.428619295463176, + "grad_norm": 0.5500960946083069, + "learning_rate": 6.380427708251054e-05, + "loss": 0.9703, + "step": 4797 + }, + { + "epoch": 0.4287086469944379, + "grad_norm": 0.42650794982910156, + "learning_rate": 6.379036821998751e-05, + "loss": 0.9895, + "step": 4798 + }, + { + "epoch": 0.4287979985256997, + "grad_norm": 0.4761294424533844, + "learning_rate": 6.377645820237412e-05, + "loss": 0.9629, + "step": 4799 + }, + { + "epoch": 0.4288873500569616, + "grad_norm": 0.45492222905158997, + "learning_rate": 6.376254703083552e-05, + "loss": 0.9479, + "step": 4800 + }, + { + "epoch": 0.4289767015882235, + "grad_norm": 0.44997844099998474, + "learning_rate": 6.374863470653691e-05, + "loss": 0.9265, + "step": 4801 + }, + { + "epoch": 0.4290660531194853, + "grad_norm": 0.5663099884986877, + "learning_rate": 6.373472123064358e-05, + "loss": 0.8915, + "step": 4802 + }, + { + "epoch": 0.4291554046507472, + "grad_norm": 0.41858309507369995, + "learning_rate": 6.372080660432095e-05, + "loss": 0.9733, + "step": 4803 + }, + { + "epoch": 0.4292447561820091, + "grad_norm": 0.42884135246276855, + "learning_rate": 6.370689082873451e-05, + "loss": 0.9741, + "step": 4804 + }, + { + "epoch": 0.42933410771327096, + "grad_norm": 0.49144938588142395, + "learning_rate": 6.369297390504987e-05, + "loss": 0.9202, + "step": 4805 + }, + { + "epoch": 0.4294234592445328, + "grad_norm": 0.43993905186653137, + "learning_rate": 6.36790558344327e-05, + "loss": 0.9182, + "step": 4806 + }, + { + "epoch": 0.4295128107757947, + "grad_norm": 0.41477641463279724, + "learning_rate": 6.36651366180488e-05, + "loss": 1.0702, + "step": 4807 + }, + { + "epoch": 0.42960216230705656, + "grad_norm": 0.48536887764930725, + "learning_rate": 6.365121625706405e-05, + "loss": 1.009, + "step": 4808 + }, + { + "epoch": 0.4296915138383184, + "grad_norm": 0.4555813670158386, + "learning_rate": 6.363729475264441e-05, + "loss": 0.9185, + "step": 4809 + }, + { + "epoch": 0.42978086536958027, + "grad_norm": 0.4251463711261749, + "learning_rate": 6.362337210595599e-05, + "loss": 0.9602, + "step": 4810 + }, + { + "epoch": 0.42987021690084215, + "grad_norm": 0.5072605013847351, + "learning_rate": 6.360944831816495e-05, + "loss": 0.9825, + "step": 4811 + }, + { + "epoch": 0.429959568432104, + "grad_norm": 0.39637628197669983, + "learning_rate": 6.359552339043753e-05, + "loss": 1.0062, + "step": 4812 + }, + { + "epoch": 0.43004891996336586, + "grad_norm": 0.45429277420043945, + "learning_rate": 6.35815973239401e-05, + "loss": 0.9989, + "step": 4813 + }, + { + "epoch": 0.43013827149462774, + "grad_norm": 0.4280487298965454, + "learning_rate": 6.356767011983915e-05, + "loss": 0.9106, + "step": 4814 + }, + { + "epoch": 0.4302276230258896, + "grad_norm": 0.45695221424102783, + "learning_rate": 6.355374177930118e-05, + "loss": 0.977, + "step": 4815 + }, + { + "epoch": 0.43031697455715145, + "grad_norm": 0.40981754660606384, + "learning_rate": 6.353981230349289e-05, + "loss": 1.0272, + "step": 4816 + }, + { + "epoch": 0.43040632608841334, + "grad_norm": 0.4471674859523773, + "learning_rate": 6.352588169358099e-05, + "loss": 0.9212, + "step": 4817 + }, + { + "epoch": 0.4304956776196752, + "grad_norm": 0.5394784808158875, + "learning_rate": 6.35119499507323e-05, + "loss": 0.9334, + "step": 4818 + }, + { + "epoch": 0.43058502915093705, + "grad_norm": 0.5647401213645935, + "learning_rate": 6.34980170761138e-05, + "loss": 0.9176, + "step": 4819 + }, + { + "epoch": 0.43067438068219893, + "grad_norm": 0.5009137392044067, + "learning_rate": 6.348408307089248e-05, + "loss": 0.9756, + "step": 4820 + }, + { + "epoch": 0.4307637322134608, + "grad_norm": 0.38283008337020874, + "learning_rate": 6.347014793623547e-05, + "loss": 1.0153, + "step": 4821 + }, + { + "epoch": 0.4308530837447227, + "grad_norm": 0.4172475337982178, + "learning_rate": 6.345621167331e-05, + "loss": 1.0625, + "step": 4822 + }, + { + "epoch": 0.4309424352759845, + "grad_norm": 0.3502405285835266, + "learning_rate": 6.344227428328335e-05, + "loss": 0.9735, + "step": 4823 + }, + { + "epoch": 0.4310317868072464, + "grad_norm": 0.43267232179641724, + "learning_rate": 6.342833576732297e-05, + "loss": 0.9561, + "step": 4824 + }, + { + "epoch": 0.4311211383385083, + "grad_norm": 0.41811031103134155, + "learning_rate": 6.341439612659631e-05, + "loss": 0.9465, + "step": 4825 + }, + { + "epoch": 0.4312104898697701, + "grad_norm": 0.4469202756881714, + "learning_rate": 6.340045536227101e-05, + "loss": 0.9639, + "step": 4826 + }, + { + "epoch": 0.431299841401032, + "grad_norm": 0.43867889046669006, + "learning_rate": 6.338651347551472e-05, + "loss": 0.9826, + "step": 4827 + }, + { + "epoch": 0.4313891929322939, + "grad_norm": 0.4113059341907501, + "learning_rate": 6.337257046749523e-05, + "loss": 1.0002, + "step": 4828 + }, + { + "epoch": 0.43147854446355577, + "grad_norm": 0.47658199071884155, + "learning_rate": 6.335862633938044e-05, + "loss": 0.9545, + "step": 4829 + }, + { + "epoch": 0.4315678959948176, + "grad_norm": 0.3787316083908081, + "learning_rate": 6.334468109233827e-05, + "loss": 1.033, + "step": 4830 + }, + { + "epoch": 0.4316572475260795, + "grad_norm": 0.5350416898727417, + "learning_rate": 6.333073472753686e-05, + "loss": 0.9514, + "step": 4831 + }, + { + "epoch": 0.43174659905734136, + "grad_norm": 0.40219366550445557, + "learning_rate": 6.331678724614429e-05, + "loss": 0.9928, + "step": 4832 + }, + { + "epoch": 0.4318359505886032, + "grad_norm": 0.457727313041687, + "learning_rate": 6.330283864932885e-05, + "loss": 0.9433, + "step": 4833 + }, + { + "epoch": 0.4319253021198651, + "grad_norm": 0.4098197817802429, + "learning_rate": 6.328888893825888e-05, + "loss": 1.0253, + "step": 4834 + }, + { + "epoch": 0.43201465365112696, + "grad_norm": 0.4684712886810303, + "learning_rate": 6.32749381141028e-05, + "loss": 0.9667, + "step": 4835 + }, + { + "epoch": 0.43210400518238884, + "grad_norm": 0.45592206716537476, + "learning_rate": 6.326098617802917e-05, + "loss": 0.9459, + "step": 4836 + }, + { + "epoch": 0.43219335671365067, + "grad_norm": 0.4041098952293396, + "learning_rate": 6.324703313120659e-05, + "loss": 1.0305, + "step": 4837 + }, + { + "epoch": 0.43228270824491255, + "grad_norm": 0.44690701365470886, + "learning_rate": 6.323307897480376e-05, + "loss": 0.9919, + "step": 4838 + }, + { + "epoch": 0.43237205977617443, + "grad_norm": 0.4994511008262634, + "learning_rate": 6.321912370998952e-05, + "loss": 0.921, + "step": 4839 + }, + { + "epoch": 0.43246141130743626, + "grad_norm": 0.4134766757488251, + "learning_rate": 6.320516733793278e-05, + "loss": 0.9395, + "step": 4840 + }, + { + "epoch": 0.43255076283869814, + "grad_norm": 0.517279863357544, + "learning_rate": 6.319120985980251e-05, + "loss": 0.9349, + "step": 4841 + }, + { + "epoch": 0.43264011436996, + "grad_norm": 0.5020187497138977, + "learning_rate": 6.317725127676781e-05, + "loss": 0.961, + "step": 4842 + }, + { + "epoch": 0.43272946590122185, + "grad_norm": 0.4880422353744507, + "learning_rate": 6.316329158999784e-05, + "loss": 0.9669, + "step": 4843 + }, + { + "epoch": 0.43281881743248374, + "grad_norm": 0.3964580297470093, + "learning_rate": 6.31493308006619e-05, + "loss": 0.9649, + "step": 4844 + }, + { + "epoch": 0.4329081689637456, + "grad_norm": 0.37571898102760315, + "learning_rate": 6.313536890992935e-05, + "loss": 1.0031, + "step": 4845 + }, + { + "epoch": 0.4329975204950075, + "grad_norm": 0.45982518792152405, + "learning_rate": 6.312140591896964e-05, + "loss": 0.9339, + "step": 4846 + }, + { + "epoch": 0.43308687202626933, + "grad_norm": 0.4254016876220703, + "learning_rate": 6.310744182895231e-05, + "loss": 1.0641, + "step": 4847 + }, + { + "epoch": 0.4331762235575312, + "grad_norm": 0.43494075536727905, + "learning_rate": 6.309347664104701e-05, + "loss": 0.9164, + "step": 4848 + }, + { + "epoch": 0.4332655750887931, + "grad_norm": 0.451556921005249, + "learning_rate": 6.307951035642349e-05, + "loss": 1.0389, + "step": 4849 + }, + { + "epoch": 0.4333549266200549, + "grad_norm": 0.34367331862449646, + "learning_rate": 6.306554297625156e-05, + "loss": 1.0079, + "step": 4850 + }, + { + "epoch": 0.4334442781513168, + "grad_norm": 0.5219529271125793, + "learning_rate": 6.305157450170111e-05, + "loss": 0.8767, + "step": 4851 + }, + { + "epoch": 0.4335336296825787, + "grad_norm": 0.36903488636016846, + "learning_rate": 6.303760493394221e-05, + "loss": 0.9979, + "step": 4852 + }, + { + "epoch": 0.4336229812138406, + "grad_norm": 0.37834039330482483, + "learning_rate": 6.302363427414491e-05, + "loss": 0.9771, + "step": 4853 + }, + { + "epoch": 0.4337123327451024, + "grad_norm": 0.4161636233329773, + "learning_rate": 6.300966252347942e-05, + "loss": 0.9805, + "step": 4854 + }, + { + "epoch": 0.4338016842763643, + "grad_norm": 0.5334766507148743, + "learning_rate": 6.299568968311601e-05, + "loss": 0.9568, + "step": 4855 + }, + { + "epoch": 0.43389103580762617, + "grad_norm": 0.4917714595794678, + "learning_rate": 6.298171575422508e-05, + "loss": 0.917, + "step": 4856 + }, + { + "epoch": 0.433980387338888, + "grad_norm": 0.42037853598594666, + "learning_rate": 6.296774073797708e-05, + "loss": 0.9624, + "step": 4857 + }, + { + "epoch": 0.4340697388701499, + "grad_norm": 0.3785925805568695, + "learning_rate": 6.295376463554255e-05, + "loss": 1.0437, + "step": 4858 + }, + { + "epoch": 0.43415909040141176, + "grad_norm": 0.47149381041526794, + "learning_rate": 6.293978744809217e-05, + "loss": 0.9401, + "step": 4859 + }, + { + "epoch": 0.43424844193267365, + "grad_norm": 0.4850710928440094, + "learning_rate": 6.292580917679665e-05, + "loss": 0.9874, + "step": 4860 + }, + { + "epoch": 0.4343377934639355, + "grad_norm": 0.43360236287117004, + "learning_rate": 6.291182982282685e-05, + "loss": 0.9758, + "step": 4861 + }, + { + "epoch": 0.43442714499519736, + "grad_norm": 0.41151541471481323, + "learning_rate": 6.289784938735366e-05, + "loss": 1.0028, + "step": 4862 + }, + { + "epoch": 0.43451649652645924, + "grad_norm": 0.46998703479766846, + "learning_rate": 6.28838678715481e-05, + "loss": 0.9591, + "step": 4863 + }, + { + "epoch": 0.43460584805772107, + "grad_norm": 0.5863698124885559, + "learning_rate": 6.286988527658129e-05, + "loss": 0.9661, + "step": 4864 + }, + { + "epoch": 0.43469519958898295, + "grad_norm": 0.434731662273407, + "learning_rate": 6.285590160362438e-05, + "loss": 0.9454, + "step": 4865 + }, + { + "epoch": 0.43478455112024483, + "grad_norm": 0.5044207572937012, + "learning_rate": 6.28419168538487e-05, + "loss": 0.9308, + "step": 4866 + }, + { + "epoch": 0.4348739026515067, + "grad_norm": 0.4645664095878601, + "learning_rate": 6.282793102842559e-05, + "loss": 0.9077, + "step": 4867 + }, + { + "epoch": 0.43496325418276854, + "grad_norm": 0.43170252442359924, + "learning_rate": 6.281394412852652e-05, + "loss": 1.0509, + "step": 4868 + }, + { + "epoch": 0.4350526057140304, + "grad_norm": 0.41620826721191406, + "learning_rate": 6.279995615532304e-05, + "loss": 0.9653, + "step": 4869 + }, + { + "epoch": 0.4351419572452923, + "grad_norm": 0.3965005874633789, + "learning_rate": 6.27859671099868e-05, + "loss": 0.9601, + "step": 4870 + }, + { + "epoch": 0.43523130877655414, + "grad_norm": 0.3978107273578644, + "learning_rate": 6.277197699368954e-05, + "loss": 1.0369, + "step": 4871 + }, + { + "epoch": 0.435320660307816, + "grad_norm": 0.42217400670051575, + "learning_rate": 6.275798580760304e-05, + "loss": 0.9626, + "step": 4872 + }, + { + "epoch": 0.4354100118390779, + "grad_norm": 0.4375147223472595, + "learning_rate": 6.274399355289923e-05, + "loss": 0.9481, + "step": 4873 + }, + { + "epoch": 0.43549936337033973, + "grad_norm": 0.4031033515930176, + "learning_rate": 6.273000023075014e-05, + "loss": 0.9599, + "step": 4874 + }, + { + "epoch": 0.4355887149016016, + "grad_norm": 0.50283282995224, + "learning_rate": 6.271600584232784e-05, + "loss": 0.9854, + "step": 4875 + }, + { + "epoch": 0.4356780664328635, + "grad_norm": 0.5138799548149109, + "learning_rate": 6.27020103888045e-05, + "loss": 1.0223, + "step": 4876 + }, + { + "epoch": 0.4357674179641254, + "grad_norm": 0.37269532680511475, + "learning_rate": 6.26880138713524e-05, + "loss": 0.9626, + "step": 4877 + }, + { + "epoch": 0.4358567694953872, + "grad_norm": 0.4185316264629364, + "learning_rate": 6.267401629114389e-05, + "loss": 0.9613, + "step": 4878 + }, + { + "epoch": 0.4359461210266491, + "grad_norm": 0.41371724009513855, + "learning_rate": 6.266001764935144e-05, + "loss": 0.9528, + "step": 4879 + }, + { + "epoch": 0.436035472557911, + "grad_norm": 0.392198383808136, + "learning_rate": 6.264601794714753e-05, + "loss": 0.9722, + "step": 4880 + }, + { + "epoch": 0.4361248240891728, + "grad_norm": 0.39201459288597107, + "learning_rate": 6.263201718570485e-05, + "loss": 0.9175, + "step": 4881 + }, + { + "epoch": 0.4362141756204347, + "grad_norm": 0.5743230581283569, + "learning_rate": 6.261801536619607e-05, + "loss": 0.9564, + "step": 4882 + }, + { + "epoch": 0.43630352715169657, + "grad_norm": 0.5308464765548706, + "learning_rate": 6.2604012489794e-05, + "loss": 0.9499, + "step": 4883 + }, + { + "epoch": 0.43639287868295845, + "grad_norm": 0.4258040487766266, + "learning_rate": 6.259000855767155e-05, + "loss": 0.9583, + "step": 4884 + }, + { + "epoch": 0.4364822302142203, + "grad_norm": 0.45301637053489685, + "learning_rate": 6.257600357100167e-05, + "loss": 0.9621, + "step": 4885 + }, + { + "epoch": 0.43657158174548216, + "grad_norm": 0.5080031752586365, + "learning_rate": 6.256199753095745e-05, + "loss": 0.9353, + "step": 4886 + }, + { + "epoch": 0.43666093327674405, + "grad_norm": 0.41666001081466675, + "learning_rate": 6.254799043871204e-05, + "loss": 1.0159, + "step": 4887 + }, + { + "epoch": 0.4367502848080059, + "grad_norm": 0.4304046928882599, + "learning_rate": 6.253398229543867e-05, + "loss": 0.9442, + "step": 4888 + }, + { + "epoch": 0.43683963633926776, + "grad_norm": 0.41285210847854614, + "learning_rate": 6.251997310231067e-05, + "loss": 0.9862, + "step": 4889 + }, + { + "epoch": 0.43692898787052964, + "grad_norm": 0.5112424492835999, + "learning_rate": 6.250596286050148e-05, + "loss": 0.9551, + "step": 4890 + }, + { + "epoch": 0.4370183394017915, + "grad_norm": 0.46234673261642456, + "learning_rate": 6.249195157118461e-05, + "loss": 0.9087, + "step": 4891 + }, + { + "epoch": 0.43710769093305335, + "grad_norm": 0.43776413798332214, + "learning_rate": 6.247793923553362e-05, + "loss": 0.9494, + "step": 4892 + }, + { + "epoch": 0.43719704246431523, + "grad_norm": 0.42224618792533875, + "learning_rate": 6.246392585472222e-05, + "loss": 1.0397, + "step": 4893 + }, + { + "epoch": 0.4372863939955771, + "grad_norm": 0.4742605984210968, + "learning_rate": 6.244991142992417e-05, + "loss": 0.9794, + "step": 4894 + }, + { + "epoch": 0.43737574552683894, + "grad_norm": 0.5097817778587341, + "learning_rate": 6.243589596231333e-05, + "loss": 0.9319, + "step": 4895 + }, + { + "epoch": 0.4374650970581008, + "grad_norm": 0.4433581829071045, + "learning_rate": 6.242187945306364e-05, + "loss": 0.9704, + "step": 4896 + }, + { + "epoch": 0.4375544485893627, + "grad_norm": 0.3601015508174896, + "learning_rate": 6.240786190334918e-05, + "loss": 1.0209, + "step": 4897 + }, + { + "epoch": 0.4376438001206246, + "grad_norm": 0.6255925893783569, + "learning_rate": 6.239384331434399e-05, + "loss": 0.986, + "step": 4898 + }, + { + "epoch": 0.4377331516518864, + "grad_norm": 0.45729881525039673, + "learning_rate": 6.237982368722232e-05, + "loss": 1.0179, + "step": 4899 + }, + { + "epoch": 0.4378225031831483, + "grad_norm": 0.41129839420318604, + "learning_rate": 6.236580302315844e-05, + "loss": 0.9502, + "step": 4900 + }, + { + "epoch": 0.4379118547144102, + "grad_norm": 0.4457799792289734, + "learning_rate": 6.235178132332677e-05, + "loss": 0.9722, + "step": 4901 + }, + { + "epoch": 0.438001206245672, + "grad_norm": 0.492878794670105, + "learning_rate": 6.233775858890175e-05, + "loss": 0.9167, + "step": 4902 + }, + { + "epoch": 0.4380905577769339, + "grad_norm": 0.43004390597343445, + "learning_rate": 6.232373482105794e-05, + "loss": 0.9728, + "step": 4903 + }, + { + "epoch": 0.4381799093081958, + "grad_norm": 0.4913460314273834, + "learning_rate": 6.230971002096999e-05, + "loss": 1.05, + "step": 4904 + }, + { + "epoch": 0.43826926083945766, + "grad_norm": 0.47525662183761597, + "learning_rate": 6.229568418981258e-05, + "loss": 0.9822, + "step": 4905 + }, + { + "epoch": 0.4383586123707195, + "grad_norm": 0.4017292857170105, + "learning_rate": 6.228165732876056e-05, + "loss": 0.9573, + "step": 4906 + }, + { + "epoch": 0.4384479639019814, + "grad_norm": 0.41514596343040466, + "learning_rate": 6.226762943898886e-05, + "loss": 0.9435, + "step": 4907 + }, + { + "epoch": 0.43853731543324326, + "grad_norm": 0.4426291286945343, + "learning_rate": 6.22536005216724e-05, + "loss": 1.0117, + "step": 4908 + }, + { + "epoch": 0.4386266669645051, + "grad_norm": 0.42675042152404785, + "learning_rate": 6.223957057798629e-05, + "loss": 0.9363, + "step": 4909 + }, + { + "epoch": 0.43871601849576697, + "grad_norm": 0.5054933428764343, + "learning_rate": 6.222553960910567e-05, + "loss": 0.961, + "step": 4910 + }, + { + "epoch": 0.43880537002702885, + "grad_norm": 0.37240302562713623, + "learning_rate": 6.221150761620581e-05, + "loss": 1.098, + "step": 4911 + }, + { + "epoch": 0.4388947215582907, + "grad_norm": 0.37764713168144226, + "learning_rate": 6.219747460046203e-05, + "loss": 0.9538, + "step": 4912 + }, + { + "epoch": 0.43898407308955256, + "grad_norm": 0.49377158284187317, + "learning_rate": 6.218344056304972e-05, + "loss": 0.9257, + "step": 4913 + }, + { + "epoch": 0.43907342462081445, + "grad_norm": 0.44281005859375, + "learning_rate": 6.216940550514439e-05, + "loss": 0.9663, + "step": 4914 + }, + { + "epoch": 0.43916277615207633, + "grad_norm": 0.42006000876426697, + "learning_rate": 6.215536942792163e-05, + "loss": 0.9435, + "step": 4915 + }, + { + "epoch": 0.43925212768333816, + "grad_norm": 0.468666136264801, + "learning_rate": 6.214133233255713e-05, + "loss": 0.9804, + "step": 4916 + }, + { + "epoch": 0.43934147921460004, + "grad_norm": 0.3761788308620453, + "learning_rate": 6.212729422022664e-05, + "loss": 1.0003, + "step": 4917 + }, + { + "epoch": 0.4394308307458619, + "grad_norm": 0.4250943064689636, + "learning_rate": 6.211325509210597e-05, + "loss": 0.9809, + "step": 4918 + }, + { + "epoch": 0.43952018227712375, + "grad_norm": 0.40891769528388977, + "learning_rate": 6.209921494937108e-05, + "loss": 0.9841, + "step": 4919 + }, + { + "epoch": 0.43960953380838563, + "grad_norm": 0.3647308945655823, + "learning_rate": 6.208517379319796e-05, + "loss": 0.9747, + "step": 4920 + }, + { + "epoch": 0.4396988853396475, + "grad_norm": 0.4176364541053772, + "learning_rate": 6.207113162476272e-05, + "loss": 0.9336, + "step": 4921 + }, + { + "epoch": 0.4397882368709094, + "grad_norm": 0.40200045704841614, + "learning_rate": 6.205708844524153e-05, + "loss": 1.0223, + "step": 4922 + }, + { + "epoch": 0.4398775884021712, + "grad_norm": 0.4894162714481354, + "learning_rate": 6.204304425581069e-05, + "loss": 0.9348, + "step": 4923 + }, + { + "epoch": 0.4399669399334331, + "grad_norm": 0.4525613784790039, + "learning_rate": 6.20289990576465e-05, + "loss": 0.941, + "step": 4924 + }, + { + "epoch": 0.440056291464695, + "grad_norm": 0.4009491205215454, + "learning_rate": 6.201495285192542e-05, + "loss": 1.0821, + "step": 4925 + }, + { + "epoch": 0.4401456429959568, + "grad_norm": 0.4592028558254242, + "learning_rate": 6.200090563982397e-05, + "loss": 0.9359, + "step": 4926 + }, + { + "epoch": 0.4402349945272187, + "grad_norm": 0.45449405908584595, + "learning_rate": 6.198685742251877e-05, + "loss": 0.9356, + "step": 4927 + }, + { + "epoch": 0.4403243460584806, + "grad_norm": 0.6728967428207397, + "learning_rate": 6.197280820118646e-05, + "loss": 1.0374, + "step": 4928 + }, + { + "epoch": 0.44041369758974247, + "grad_norm": 0.5596892833709717, + "learning_rate": 6.195875797700385e-05, + "loss": 0.9524, + "step": 4929 + }, + { + "epoch": 0.4405030491210043, + "grad_norm": 0.4790952801704407, + "learning_rate": 6.19447067511478e-05, + "loss": 1.0068, + "step": 4930 + }, + { + "epoch": 0.4405924006522662, + "grad_norm": 0.4227498173713684, + "learning_rate": 6.193065452479523e-05, + "loss": 1.0209, + "step": 4931 + }, + { + "epoch": 0.44068175218352806, + "grad_norm": 0.4277952015399933, + "learning_rate": 6.191660129912317e-05, + "loss": 0.8967, + "step": 4932 + }, + { + "epoch": 0.4407711037147899, + "grad_norm": 0.4384140074253082, + "learning_rate": 6.190254707530874e-05, + "loss": 0.9636, + "step": 4933 + }, + { + "epoch": 0.4408604552460518, + "grad_norm": 0.4057149589061737, + "learning_rate": 6.188849185452911e-05, + "loss": 1.0107, + "step": 4934 + }, + { + "epoch": 0.44094980677731366, + "grad_norm": 0.43641364574432373, + "learning_rate": 6.187443563796157e-05, + "loss": 0.9296, + "step": 4935 + }, + { + "epoch": 0.44103915830857554, + "grad_norm": 0.4788879156112671, + "learning_rate": 6.186037842678349e-05, + "loss": 0.9666, + "step": 4936 + }, + { + "epoch": 0.44112850983983737, + "grad_norm": 0.3350755572319031, + "learning_rate": 6.184632022217227e-05, + "loss": 1.0117, + "step": 4937 + }, + { + "epoch": 0.44121786137109925, + "grad_norm": 0.4167011082172394, + "learning_rate": 6.183226102530547e-05, + "loss": 1.023, + "step": 4938 + }, + { + "epoch": 0.44130721290236113, + "grad_norm": 0.47220227122306824, + "learning_rate": 6.181820083736067e-05, + "loss": 1.0494, + "step": 4939 + }, + { + "epoch": 0.44139656443362296, + "grad_norm": 0.4266274571418762, + "learning_rate": 6.18041396595156e-05, + "loss": 0.9815, + "step": 4940 + }, + { + "epoch": 0.44148591596488485, + "grad_norm": 0.4164573848247528, + "learning_rate": 6.1790077492948e-05, + "loss": 1.0067, + "step": 4941 + }, + { + "epoch": 0.44157526749614673, + "grad_norm": 0.4291037917137146, + "learning_rate": 6.177601433883573e-05, + "loss": 0.9725, + "step": 4942 + }, + { + "epoch": 0.44166461902740856, + "grad_norm": 0.49879229068756104, + "learning_rate": 6.176195019835674e-05, + "loss": 0.9782, + "step": 4943 + }, + { + "epoch": 0.44175397055867044, + "grad_norm": 0.5074213743209839, + "learning_rate": 6.174788507268905e-05, + "loss": 0.9614, + "step": 4944 + }, + { + "epoch": 0.4418433220899323, + "grad_norm": 0.39095085859298706, + "learning_rate": 6.173381896301076e-05, + "loss": 0.9515, + "step": 4945 + }, + { + "epoch": 0.4419326736211942, + "grad_norm": 0.40660542249679565, + "learning_rate": 6.171975187050005e-05, + "loss": 0.9768, + "step": 4946 + }, + { + "epoch": 0.44202202515245603, + "grad_norm": 0.6411069631576538, + "learning_rate": 6.170568379633522e-05, + "loss": 1.0163, + "step": 4947 + }, + { + "epoch": 0.4421113766837179, + "grad_norm": 0.4861661195755005, + "learning_rate": 6.169161474169458e-05, + "loss": 0.9521, + "step": 4948 + }, + { + "epoch": 0.4422007282149798, + "grad_norm": 0.4027434289455414, + "learning_rate": 6.167754470775659e-05, + "loss": 0.9752, + "step": 4949 + }, + { + "epoch": 0.4422900797462416, + "grad_norm": 0.46598201990127563, + "learning_rate": 6.166347369569975e-05, + "loss": 0.9576, + "step": 4950 + }, + { + "epoch": 0.4423794312775035, + "grad_norm": 0.4167962372303009, + "learning_rate": 6.164940170670266e-05, + "loss": 0.9705, + "step": 4951 + }, + { + "epoch": 0.4424687828087654, + "grad_norm": 0.4604121744632721, + "learning_rate": 6.163532874194401e-05, + "loss": 0.9491, + "step": 4952 + }, + { + "epoch": 0.4425581343400273, + "grad_norm": 0.4519181549549103, + "learning_rate": 6.162125480260257e-05, + "loss": 0.9529, + "step": 4953 + }, + { + "epoch": 0.4426474858712891, + "grad_norm": 0.3763716220855713, + "learning_rate": 6.160717988985714e-05, + "loss": 0.962, + "step": 4954 + }, + { + "epoch": 0.442736837402551, + "grad_norm": 0.3946007788181305, + "learning_rate": 6.15931040048867e-05, + "loss": 1.0416, + "step": 4955 + }, + { + "epoch": 0.44282618893381287, + "grad_norm": 0.4197639226913452, + "learning_rate": 6.15790271488702e-05, + "loss": 1.05, + "step": 4956 + }, + { + "epoch": 0.4429155404650747, + "grad_norm": 0.40052515268325806, + "learning_rate": 6.156494932298678e-05, + "loss": 0.95, + "step": 4957 + }, + { + "epoch": 0.4430048919963366, + "grad_norm": 0.4258505702018738, + "learning_rate": 6.155087052841555e-05, + "loss": 0.9829, + "step": 4958 + }, + { + "epoch": 0.44309424352759846, + "grad_norm": 0.41184738278388977, + "learning_rate": 6.153679076633581e-05, + "loss": 0.9795, + "step": 4959 + }, + { + "epoch": 0.44318359505886035, + "grad_norm": 0.5452486872673035, + "learning_rate": 6.152271003792686e-05, + "loss": 0.8949, + "step": 4960 + }, + { + "epoch": 0.4432729465901222, + "grad_norm": 0.47022590041160583, + "learning_rate": 6.150862834436811e-05, + "loss": 0.9618, + "step": 4961 + }, + { + "epoch": 0.44336229812138406, + "grad_norm": 0.4404926300048828, + "learning_rate": 6.149454568683909e-05, + "loss": 1.0751, + "step": 4962 + }, + { + "epoch": 0.44345164965264594, + "grad_norm": 0.4911143481731415, + "learning_rate": 6.148046206651932e-05, + "loss": 0.9599, + "step": 4963 + }, + { + "epoch": 0.44354100118390777, + "grad_norm": 0.4653252959251404, + "learning_rate": 6.146637748458849e-05, + "loss": 1.0376, + "step": 4964 + }, + { + "epoch": 0.44363035271516965, + "grad_norm": 0.5367299914360046, + "learning_rate": 6.145229194222633e-05, + "loss": 0.9954, + "step": 4965 + }, + { + "epoch": 0.44371970424643153, + "grad_norm": 0.39032191038131714, + "learning_rate": 6.143820544061263e-05, + "loss": 0.9678, + "step": 4966 + }, + { + "epoch": 0.4438090557776934, + "grad_norm": 0.46807265281677246, + "learning_rate": 6.142411798092731e-05, + "loss": 0.9434, + "step": 4967 + }, + { + "epoch": 0.44389840730895525, + "grad_norm": 0.3878948390483856, + "learning_rate": 6.141002956435034e-05, + "loss": 0.9584, + "step": 4968 + }, + { + "epoch": 0.44398775884021713, + "grad_norm": 0.4154819846153259, + "learning_rate": 6.139594019206178e-05, + "loss": 0.9433, + "step": 4969 + }, + { + "epoch": 0.444077110371479, + "grad_norm": 0.4845220744609833, + "learning_rate": 6.138184986524175e-05, + "loss": 1.0408, + "step": 4970 + }, + { + "epoch": 0.44416646190274084, + "grad_norm": 0.4787101149559021, + "learning_rate": 6.136775858507046e-05, + "loss": 0.9348, + "step": 4971 + }, + { + "epoch": 0.4442558134340027, + "grad_norm": 0.4312731921672821, + "learning_rate": 6.135366635272824e-05, + "loss": 0.9757, + "step": 4972 + }, + { + "epoch": 0.4443451649652646, + "grad_norm": 0.4321444034576416, + "learning_rate": 6.133957316939543e-05, + "loss": 0.9491, + "step": 4973 + }, + { + "epoch": 0.44443451649652643, + "grad_norm": 0.5200006365776062, + "learning_rate": 6.132547903625249e-05, + "loss": 0.897, + "step": 4974 + }, + { + "epoch": 0.4445238680277883, + "grad_norm": 0.44951632618904114, + "learning_rate": 6.131138395447997e-05, + "loss": 0.991, + "step": 4975 + }, + { + "epoch": 0.4446132195590502, + "grad_norm": 0.556250274181366, + "learning_rate": 6.129728792525846e-05, + "loss": 0.9118, + "step": 4976 + }, + { + "epoch": 0.4447025710903121, + "grad_norm": 0.5766691565513611, + "learning_rate": 6.128319094976868e-05, + "loss": 0.9496, + "step": 4977 + }, + { + "epoch": 0.4447919226215739, + "grad_norm": 0.472476601600647, + "learning_rate": 6.126909302919138e-05, + "loss": 0.9091, + "step": 4978 + }, + { + "epoch": 0.4448812741528358, + "grad_norm": 0.41769498586654663, + "learning_rate": 6.125499416470742e-05, + "loss": 1.0127, + "step": 4979 + }, + { + "epoch": 0.4449706256840977, + "grad_norm": 0.44769975543022156, + "learning_rate": 6.124089435749772e-05, + "loss": 0.9084, + "step": 4980 + }, + { + "epoch": 0.4450599772153595, + "grad_norm": 0.42755383253097534, + "learning_rate": 6.122679360874331e-05, + "loss": 1.0314, + "step": 4981 + }, + { + "epoch": 0.4451493287466214, + "grad_norm": 0.4499891698360443, + "learning_rate": 6.121269191962527e-05, + "loss": 0.9204, + "step": 4982 + }, + { + "epoch": 0.44523868027788327, + "grad_norm": 0.5186624526977539, + "learning_rate": 6.119858929132475e-05, + "loss": 0.973, + "step": 4983 + }, + { + "epoch": 0.44532803180914515, + "grad_norm": 0.43319734930992126, + "learning_rate": 6.118448572502302e-05, + "loss": 1.0126, + "step": 4984 + }, + { + "epoch": 0.445417383340407, + "grad_norm": 0.41849610209465027, + "learning_rate": 6.117038122190139e-05, + "loss": 1.0017, + "step": 4985 + }, + { + "epoch": 0.44550673487166886, + "grad_norm": 0.5016182065010071, + "learning_rate": 6.115627578314125e-05, + "loss": 0.9203, + "step": 4986 + }, + { + "epoch": 0.44559608640293075, + "grad_norm": 0.47967931628227234, + "learning_rate": 6.114216940992411e-05, + "loss": 0.9842, + "step": 4987 + }, + { + "epoch": 0.4456854379341926, + "grad_norm": 0.4858255088329315, + "learning_rate": 6.112806210343152e-05, + "loss": 0.9719, + "step": 4988 + }, + { + "epoch": 0.44577478946545446, + "grad_norm": 0.5705578327178955, + "learning_rate": 6.111395386484511e-05, + "loss": 0.8807, + "step": 4989 + }, + { + "epoch": 0.44586414099671634, + "grad_norm": 0.4477182626724243, + "learning_rate": 6.109984469534659e-05, + "loss": 0.9386, + "step": 4990 + }, + { + "epoch": 0.4459534925279782, + "grad_norm": 0.4489213824272156, + "learning_rate": 6.108573459611776e-05, + "loss": 1.0481, + "step": 4991 + }, + { + "epoch": 0.44604284405924005, + "grad_norm": 0.36626997590065, + "learning_rate": 6.10716235683405e-05, + "loss": 0.908, + "step": 4992 + }, + { + "epoch": 0.44613219559050193, + "grad_norm": 0.409940630197525, + "learning_rate": 6.105751161319675e-05, + "loss": 0.9784, + "step": 4993 + }, + { + "epoch": 0.4462215471217638, + "grad_norm": 0.42009782791137695, + "learning_rate": 6.104339873186855e-05, + "loss": 1.0009, + "step": 4994 + }, + { + "epoch": 0.44631089865302565, + "grad_norm": 0.4435548186302185, + "learning_rate": 6.102928492553796e-05, + "loss": 1.0153, + "step": 4995 + }, + { + "epoch": 0.44640025018428753, + "grad_norm": 0.574257493019104, + "learning_rate": 6.101517019538721e-05, + "loss": 0.8759, + "step": 4996 + }, + { + "epoch": 0.4464896017155494, + "grad_norm": 0.43628671765327454, + "learning_rate": 6.1001054542598534e-05, + "loss": 0.9219, + "step": 4997 + }, + { + "epoch": 0.4465789532468113, + "grad_norm": 0.3821120262145996, + "learning_rate": 6.0986937968354295e-05, + "loss": 1.0096, + "step": 4998 + }, + { + "epoch": 0.4466683047780731, + "grad_norm": 0.41857802867889404, + "learning_rate": 6.097282047383688e-05, + "loss": 0.9917, + "step": 4999 + }, + { + "epoch": 0.446757656309335, + "grad_norm": 0.6450624465942383, + "learning_rate": 6.095870206022879e-05, + "loss": 0.8897, + "step": 5000 + }, + { + "epoch": 0.4468470078405969, + "grad_norm": 0.4506264925003052, + "learning_rate": 6.0944582728712585e-05, + "loss": 0.9337, + "step": 5001 + }, + { + "epoch": 0.4469363593718587, + "grad_norm": 0.5254955887794495, + "learning_rate": 6.093046248047092e-05, + "loss": 0.9374, + "step": 5002 + }, + { + "epoch": 0.4470257109031206, + "grad_norm": 0.4589659869670868, + "learning_rate": 6.091634131668652e-05, + "loss": 0.9522, + "step": 5003 + }, + { + "epoch": 0.4471150624343825, + "grad_norm": 0.5653790831565857, + "learning_rate": 6.090221923854217e-05, + "loss": 0.9442, + "step": 5004 + }, + { + "epoch": 0.4472044139656443, + "grad_norm": 0.4403340220451355, + "learning_rate": 6.088809624722074e-05, + "loss": 0.9674, + "step": 5005 + }, + { + "epoch": 0.4472937654969062, + "grad_norm": 0.5026565790176392, + "learning_rate": 6.0873972343905206e-05, + "loss": 0.9168, + "step": 5006 + }, + { + "epoch": 0.4473831170281681, + "grad_norm": 0.4866418242454529, + "learning_rate": 6.085984752977857e-05, + "loss": 0.9227, + "step": 5007 + }, + { + "epoch": 0.44747246855942996, + "grad_norm": 0.43236756324768066, + "learning_rate": 6.0845721806023945e-05, + "loss": 0.9651, + "step": 5008 + }, + { + "epoch": 0.4475618200906918, + "grad_norm": 0.49595317244529724, + "learning_rate": 6.083159517382452e-05, + "loss": 0.8973, + "step": 5009 + }, + { + "epoch": 0.44765117162195367, + "grad_norm": 0.3765192925930023, + "learning_rate": 6.0817467634363535e-05, + "loss": 0.9991, + "step": 5010 + }, + { + "epoch": 0.44774052315321555, + "grad_norm": 0.49171602725982666, + "learning_rate": 6.0803339188824326e-05, + "loss": 1.0389, + "step": 5011 + }, + { + "epoch": 0.4478298746844774, + "grad_norm": 0.47733286023139954, + "learning_rate": 6.078920983839031e-05, + "loss": 0.974, + "step": 5012 + }, + { + "epoch": 0.44791922621573926, + "grad_norm": 0.5364671945571899, + "learning_rate": 6.0775079584244976e-05, + "loss": 0.957, + "step": 5013 + }, + { + "epoch": 0.44800857774700115, + "grad_norm": 0.5192524790763855, + "learning_rate": 6.076094842757185e-05, + "loss": 0.8715, + "step": 5014 + }, + { + "epoch": 0.44809792927826303, + "grad_norm": 0.38916611671447754, + "learning_rate": 6.07468163695546e-05, + "loss": 1.0494, + "step": 5015 + }, + { + "epoch": 0.44818728080952486, + "grad_norm": 0.4378674030303955, + "learning_rate": 6.0732683411376935e-05, + "loss": 1.0302, + "step": 5016 + }, + { + "epoch": 0.44827663234078674, + "grad_norm": 0.4529155492782593, + "learning_rate": 6.0718549554222614e-05, + "loss": 0.9895, + "step": 5017 + }, + { + "epoch": 0.4483659838720486, + "grad_norm": 0.4706246256828308, + "learning_rate": 6.070441479927554e-05, + "loss": 0.9995, + "step": 5018 + }, + { + "epoch": 0.44845533540331045, + "grad_norm": 0.515957772731781, + "learning_rate": 6.069027914771961e-05, + "loss": 0.898, + "step": 5019 + }, + { + "epoch": 0.44854468693457233, + "grad_norm": 0.43126967549324036, + "learning_rate": 6.067614260073885e-05, + "loss": 1.0187, + "step": 5020 + }, + { + "epoch": 0.4486340384658342, + "grad_norm": 0.44768694043159485, + "learning_rate": 6.0662005159517354e-05, + "loss": 1.047, + "step": 5021 + }, + { + "epoch": 0.4487233899970961, + "grad_norm": 0.4717482328414917, + "learning_rate": 6.064786682523928e-05, + "loss": 0.9254, + "step": 5022 + }, + { + "epoch": 0.44881274152835793, + "grad_norm": 0.40896281599998474, + "learning_rate": 6.063372759908885e-05, + "loss": 0.9374, + "step": 5023 + }, + { + "epoch": 0.4489020930596198, + "grad_norm": 0.43339207768440247, + "learning_rate": 6.06195874822504e-05, + "loss": 0.9764, + "step": 5024 + }, + { + "epoch": 0.4489914445908817, + "grad_norm": 0.4062994718551636, + "learning_rate": 6.06054464759083e-05, + "loss": 0.9822, + "step": 5025 + }, + { + "epoch": 0.4490807961221435, + "grad_norm": 0.4177139401435852, + "learning_rate": 6.0591304581247e-05, + "loss": 1.0045, + "step": 5026 + }, + { + "epoch": 0.4491701476534054, + "grad_norm": 0.46026477217674255, + "learning_rate": 6.0577161799451054e-05, + "loss": 0.9932, + "step": 5027 + }, + { + "epoch": 0.4492594991846673, + "grad_norm": 0.3944912552833557, + "learning_rate": 6.0563018131705063e-05, + "loss": 1.0112, + "step": 5028 + }, + { + "epoch": 0.44934885071592917, + "grad_norm": 0.49842584133148193, + "learning_rate": 6.054887357919371e-05, + "loss": 1.0421, + "step": 5029 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.4486824572086334, + "learning_rate": 6.053472814310175e-05, + "loss": 0.9671, + "step": 5030 + }, + { + "epoch": 0.4495275537784529, + "grad_norm": 0.41824930906295776, + "learning_rate": 6.052058182461401e-05, + "loss": 0.9581, + "step": 5031 + }, + { + "epoch": 0.44961690530971476, + "grad_norm": 0.500913143157959, + "learning_rate": 6.0506434624915396e-05, + "loss": 0.9364, + "step": 5032 + }, + { + "epoch": 0.4497062568409766, + "grad_norm": 0.4475401043891907, + "learning_rate": 6.049228654519091e-05, + "loss": 0.9434, + "step": 5033 + }, + { + "epoch": 0.4497956083722385, + "grad_norm": 0.47360408306121826, + "learning_rate": 6.0478137586625584e-05, + "loss": 0.959, + "step": 5034 + }, + { + "epoch": 0.44988495990350036, + "grad_norm": 0.4704830050468445, + "learning_rate": 6.046398775040454e-05, + "loss": 0.9246, + "step": 5035 + }, + { + "epoch": 0.4499743114347622, + "grad_norm": 0.41631588339805603, + "learning_rate": 6.0449837037712976e-05, + "loss": 1.0194, + "step": 5036 + }, + { + "epoch": 0.45006366296602407, + "grad_norm": 0.47200682759284973, + "learning_rate": 6.043568544973618e-05, + "loss": 0.9078, + "step": 5037 + }, + { + "epoch": 0.45015301449728595, + "grad_norm": 0.4732605516910553, + "learning_rate": 6.0421532987659504e-05, + "loss": 1.0336, + "step": 5038 + }, + { + "epoch": 0.45024236602854784, + "grad_norm": 0.4652157425880432, + "learning_rate": 6.040737965266834e-05, + "loss": 0.9257, + "step": 5039 + }, + { + "epoch": 0.45033171755980966, + "grad_norm": 0.5224878191947937, + "learning_rate": 6.039322544594819e-05, + "loss": 0.9199, + "step": 5040 + }, + { + "epoch": 0.45042106909107155, + "grad_norm": 0.4996034801006317, + "learning_rate": 6.037907036868464e-05, + "loss": 1.0375, + "step": 5041 + }, + { + "epoch": 0.45051042062233343, + "grad_norm": 0.4124327600002289, + "learning_rate": 6.0364914422063304e-05, + "loss": 0.9245, + "step": 5042 + }, + { + "epoch": 0.45059977215359526, + "grad_norm": 0.441354900598526, + "learning_rate": 6.0350757607269904e-05, + "loss": 0.9574, + "step": 5043 + }, + { + "epoch": 0.45068912368485714, + "grad_norm": 0.4461125433444977, + "learning_rate": 6.033659992549023e-05, + "loss": 0.8827, + "step": 5044 + }, + { + "epoch": 0.450778475216119, + "grad_norm": 0.3978963792324066, + "learning_rate": 6.0322441377910135e-05, + "loss": 0.9883, + "step": 5045 + }, + { + "epoch": 0.4508678267473809, + "grad_norm": 0.4818345010280609, + "learning_rate": 6.030828196571553e-05, + "loss": 0.9646, + "step": 5046 + }, + { + "epoch": 0.45095717827864273, + "grad_norm": 0.5546659231185913, + "learning_rate": 6.029412169009243e-05, + "loss": 0.9554, + "step": 5047 + }, + { + "epoch": 0.4510465298099046, + "grad_norm": 0.4250444173812866, + "learning_rate": 6.0279960552226934e-05, + "loss": 0.9748, + "step": 5048 + }, + { + "epoch": 0.4511358813411665, + "grad_norm": 0.44978564977645874, + "learning_rate": 6.026579855330515e-05, + "loss": 0.9755, + "step": 5049 + }, + { + "epoch": 0.45122523287242833, + "grad_norm": 0.4644257426261902, + "learning_rate": 6.025163569451331e-05, + "loss": 1.003, + "step": 5050 + }, + { + "epoch": 0.4513145844036902, + "grad_norm": 0.41228753328323364, + "learning_rate": 6.0237471977037706e-05, + "loss": 1.0315, + "step": 5051 + }, + { + "epoch": 0.4514039359349521, + "grad_norm": 0.5227345824241638, + "learning_rate": 6.02233074020647e-05, + "loss": 0.8891, + "step": 5052 + }, + { + "epoch": 0.451493287466214, + "grad_norm": 0.46746036410331726, + "learning_rate": 6.020914197078074e-05, + "loss": 0.9664, + "step": 5053 + }, + { + "epoch": 0.4515826389974758, + "grad_norm": 0.3485402762889862, + "learning_rate": 6.01949756843723e-05, + "loss": 1.0188, + "step": 5054 + }, + { + "epoch": 0.4516719905287377, + "grad_norm": 0.4057181179523468, + "learning_rate": 6.018080854402599e-05, + "loss": 0.9448, + "step": 5055 + }, + { + "epoch": 0.45176134205999957, + "grad_norm": 0.39723464846611023, + "learning_rate": 6.0166640550928434e-05, + "loss": 0.97, + "step": 5056 + }, + { + "epoch": 0.4518506935912614, + "grad_norm": 0.3836243152618408, + "learning_rate": 6.015247170626637e-05, + "loss": 0.9494, + "step": 5057 + }, + { + "epoch": 0.4519400451225233, + "grad_norm": 0.4795580208301544, + "learning_rate": 6.013830201122659e-05, + "loss": 0.9054, + "step": 5058 + }, + { + "epoch": 0.45202939665378516, + "grad_norm": 0.4811798930168152, + "learning_rate": 6.0124131466995936e-05, + "loss": 0.9832, + "step": 5059 + }, + { + "epoch": 0.45211874818504705, + "grad_norm": 0.41265878081321716, + "learning_rate": 6.0109960074761374e-05, + "loss": 0.9763, + "step": 5060 + }, + { + "epoch": 0.4522080997163089, + "grad_norm": 0.4164207875728607, + "learning_rate": 6.009578783570987e-05, + "loss": 0.9297, + "step": 5061 + }, + { + "epoch": 0.45229745124757076, + "grad_norm": 0.392967164516449, + "learning_rate": 6.008161475102853e-05, + "loss": 0.953, + "step": 5062 + }, + { + "epoch": 0.45238680277883264, + "grad_norm": 0.42946505546569824, + "learning_rate": 6.00674408219045e-05, + "loss": 0.979, + "step": 5063 + }, + { + "epoch": 0.45247615431009447, + "grad_norm": 0.47703489661216736, + "learning_rate": 6.005326604952498e-05, + "loss": 0.9656, + "step": 5064 + }, + { + "epoch": 0.45256550584135635, + "grad_norm": 0.4116286635398865, + "learning_rate": 6.003909043507727e-05, + "loss": 0.943, + "step": 5065 + }, + { + "epoch": 0.45265485737261824, + "grad_norm": 0.4779094159603119, + "learning_rate": 6.002491397974872e-05, + "loss": 0.9066, + "step": 5066 + }, + { + "epoch": 0.45274420890388006, + "grad_norm": 0.4313408136367798, + "learning_rate": 6.0010736684726774e-05, + "loss": 1.01, + "step": 5067 + }, + { + "epoch": 0.45283356043514195, + "grad_norm": 0.4300564229488373, + "learning_rate": 5.999655855119893e-05, + "loss": 0.9609, + "step": 5068 + }, + { + "epoch": 0.45292291196640383, + "grad_norm": 0.4585869610309601, + "learning_rate": 5.998237958035274e-05, + "loss": 0.9914, + "step": 5069 + }, + { + "epoch": 0.4530122634976657, + "grad_norm": 0.5256367325782776, + "learning_rate": 5.996819977337587e-05, + "loss": 0.9232, + "step": 5070 + }, + { + "epoch": 0.45310161502892754, + "grad_norm": 0.38605356216430664, + "learning_rate": 5.9954019131456e-05, + "loss": 1.0481, + "step": 5071 + }, + { + "epoch": 0.4531909665601894, + "grad_norm": 0.41398924589157104, + "learning_rate": 5.993983765578093e-05, + "loss": 1.0571, + "step": 5072 + }, + { + "epoch": 0.4532803180914513, + "grad_norm": 0.45383667945861816, + "learning_rate": 5.9925655347538504e-05, + "loss": 0.9514, + "step": 5073 + }, + { + "epoch": 0.45336966962271313, + "grad_norm": 0.5158712863922119, + "learning_rate": 5.991147220791665e-05, + "loss": 0.8985, + "step": 5074 + }, + { + "epoch": 0.453459021153975, + "grad_norm": 0.4722534716129303, + "learning_rate": 5.989728823810335e-05, + "loss": 0.9492, + "step": 5075 + }, + { + "epoch": 0.4535483726852369, + "grad_norm": 0.517153799533844, + "learning_rate": 5.9883103439286646e-05, + "loss": 0.9614, + "step": 5076 + }, + { + "epoch": 0.4536377242164988, + "grad_norm": 0.4288327097892761, + "learning_rate": 5.986891781265471e-05, + "loss": 0.9977, + "step": 5077 + }, + { + "epoch": 0.4537270757477606, + "grad_norm": 0.3851867616176605, + "learning_rate": 5.98547313593957e-05, + "loss": 0.9741, + "step": 5078 + }, + { + "epoch": 0.4538164272790225, + "grad_norm": 0.41952255368232727, + "learning_rate": 5.9840544080697904e-05, + "loss": 0.901, + "step": 5079 + }, + { + "epoch": 0.4539057788102844, + "grad_norm": 0.4248369634151459, + "learning_rate": 5.9826355977749624e-05, + "loss": 1.004, + "step": 5080 + }, + { + "epoch": 0.4539951303415462, + "grad_norm": 0.3857657015323639, + "learning_rate": 5.98121670517393e-05, + "loss": 1.0764, + "step": 5081 + }, + { + "epoch": 0.4540844818728081, + "grad_norm": 0.4897959232330322, + "learning_rate": 5.97979773038554e-05, + "loss": 0.9433, + "step": 5082 + }, + { + "epoch": 0.45417383340406997, + "grad_norm": 0.4853098392486572, + "learning_rate": 5.978378673528645e-05, + "loss": 1.0143, + "step": 5083 + }, + { + "epoch": 0.45426318493533185, + "grad_norm": 0.3813380300998688, + "learning_rate": 5.9769595347221096e-05, + "loss": 0.991, + "step": 5084 + }, + { + "epoch": 0.4543525364665937, + "grad_norm": 0.3379877507686615, + "learning_rate": 5.9755403140847974e-05, + "loss": 1.0139, + "step": 5085 + }, + { + "epoch": 0.45444188799785556, + "grad_norm": 0.41478732228279114, + "learning_rate": 5.974121011735586e-05, + "loss": 0.9286, + "step": 5086 + }, + { + "epoch": 0.45453123952911745, + "grad_norm": 0.4362654685974121, + "learning_rate": 5.972701627793357e-05, + "loss": 0.9713, + "step": 5087 + }, + { + "epoch": 0.4546205910603793, + "grad_norm": 0.44205188751220703, + "learning_rate": 5.9712821623769976e-05, + "loss": 0.9442, + "step": 5088 + }, + { + "epoch": 0.45470994259164116, + "grad_norm": 0.4520505666732788, + "learning_rate": 5.969862615605405e-05, + "loss": 0.9221, + "step": 5089 + }, + { + "epoch": 0.45479929412290304, + "grad_norm": 0.3804233968257904, + "learning_rate": 5.96844298759748e-05, + "loss": 0.9983, + "step": 5090 + }, + { + "epoch": 0.4548886456541649, + "grad_norm": 0.45884522795677185, + "learning_rate": 5.967023278472131e-05, + "loss": 0.9408, + "step": 5091 + }, + { + "epoch": 0.45497799718542675, + "grad_norm": 0.5373643040657043, + "learning_rate": 5.965603488348276e-05, + "loss": 0.9206, + "step": 5092 + }, + { + "epoch": 0.45506734871668864, + "grad_norm": 0.4332393705844879, + "learning_rate": 5.964183617344836e-05, + "loss": 0.9796, + "step": 5093 + }, + { + "epoch": 0.4551567002479505, + "grad_norm": 0.5239064693450928, + "learning_rate": 5.962763665580741e-05, + "loss": 0.9123, + "step": 5094 + }, + { + "epoch": 0.45524605177921235, + "grad_norm": 0.41141557693481445, + "learning_rate": 5.961343633174926e-05, + "loss": 0.9346, + "step": 5095 + }, + { + "epoch": 0.45533540331047423, + "grad_norm": 0.43024352192878723, + "learning_rate": 5.9599235202463344e-05, + "loss": 0.9898, + "step": 5096 + }, + { + "epoch": 0.4554247548417361, + "grad_norm": 0.4978499412536621, + "learning_rate": 5.958503326913916e-05, + "loss": 0.9262, + "step": 5097 + }, + { + "epoch": 0.45551410637299794, + "grad_norm": 0.37708181142807007, + "learning_rate": 5.957083053296626e-05, + "loss": 1.0786, + "step": 5098 + }, + { + "epoch": 0.4556034579042598, + "grad_norm": 0.42339766025543213, + "learning_rate": 5.955662699513431e-05, + "loss": 0.9934, + "step": 5099 + }, + { + "epoch": 0.4556928094355217, + "grad_norm": 0.515396773815155, + "learning_rate": 5.954242265683296e-05, + "loss": 0.9361, + "step": 5100 + }, + { + "epoch": 0.4557821609667836, + "grad_norm": 0.502129316329956, + "learning_rate": 5.9528217519252003e-05, + "loss": 0.9834, + "step": 5101 + }, + { + "epoch": 0.4558715124980454, + "grad_norm": 0.44378775358200073, + "learning_rate": 5.9514011583581275e-05, + "loss": 0.9662, + "step": 5102 + }, + { + "epoch": 0.4559608640293073, + "grad_norm": 0.4403262436389923, + "learning_rate": 5.9499804851010655e-05, + "loss": 0.986, + "step": 5103 + }, + { + "epoch": 0.4560502155605692, + "grad_norm": 0.38599881529808044, + "learning_rate": 5.948559732273013e-05, + "loss": 1.0328, + "step": 5104 + }, + { + "epoch": 0.456139567091831, + "grad_norm": 0.5018365979194641, + "learning_rate": 5.9471388999929714e-05, + "loss": 0.9439, + "step": 5105 + }, + { + "epoch": 0.4562289186230929, + "grad_norm": 0.4095374047756195, + "learning_rate": 5.9457179883799496e-05, + "loss": 0.9718, + "step": 5106 + }, + { + "epoch": 0.4563182701543548, + "grad_norm": 0.49552807211875916, + "learning_rate": 5.944296997552967e-05, + "loss": 0.9708, + "step": 5107 + }, + { + "epoch": 0.45640762168561666, + "grad_norm": 0.45015573501586914, + "learning_rate": 5.942875927631045e-05, + "loss": 0.9136, + "step": 5108 + }, + { + "epoch": 0.4564969732168785, + "grad_norm": 0.4981837868690491, + "learning_rate": 5.9414547787332155e-05, + "loss": 0.8726, + "step": 5109 + }, + { + "epoch": 0.45658632474814037, + "grad_norm": 0.38596221804618835, + "learning_rate": 5.940033550978511e-05, + "loss": 1.0169, + "step": 5110 + }, + { + "epoch": 0.45667567627940225, + "grad_norm": 0.4662768840789795, + "learning_rate": 5.938612244485977e-05, + "loss": 0.8954, + "step": 5111 + }, + { + "epoch": 0.4567650278106641, + "grad_norm": 0.5224105715751648, + "learning_rate": 5.937190859374664e-05, + "loss": 0.9889, + "step": 5112 + }, + { + "epoch": 0.45685437934192596, + "grad_norm": 0.5646452307701111, + "learning_rate": 5.935769395763625e-05, + "loss": 0.8719, + "step": 5113 + }, + { + "epoch": 0.45694373087318785, + "grad_norm": 0.4161957800388336, + "learning_rate": 5.934347853771927e-05, + "loss": 0.9331, + "step": 5114 + }, + { + "epoch": 0.45703308240444973, + "grad_norm": 0.394827276468277, + "learning_rate": 5.9329262335186344e-05, + "loss": 0.954, + "step": 5115 + }, + { + "epoch": 0.45712243393571156, + "grad_norm": 0.5495721697807312, + "learning_rate": 5.931504535122825e-05, + "loss": 1.0314, + "step": 5116 + }, + { + "epoch": 0.45721178546697344, + "grad_norm": 0.48053717613220215, + "learning_rate": 5.9300827587035825e-05, + "loss": 0.9705, + "step": 5117 + }, + { + "epoch": 0.4573011369982353, + "grad_norm": 0.48041045665740967, + "learning_rate": 5.9286609043799945e-05, + "loss": 0.941, + "step": 5118 + }, + { + "epoch": 0.45739048852949715, + "grad_norm": 0.46086958050727844, + "learning_rate": 5.9272389722711586e-05, + "loss": 0.9866, + "step": 5119 + }, + { + "epoch": 0.45747984006075904, + "grad_norm": 0.4236505925655365, + "learning_rate": 5.925816962496175e-05, + "loss": 0.9627, + "step": 5120 + }, + { + "epoch": 0.4575691915920209, + "grad_norm": 0.41899070143699646, + "learning_rate": 5.9243948751741505e-05, + "loss": 0.9541, + "step": 5121 + }, + { + "epoch": 0.4576585431232828, + "grad_norm": 0.37787696719169617, + "learning_rate": 5.922972710424203e-05, + "loss": 0.9989, + "step": 5122 + }, + { + "epoch": 0.45774789465454463, + "grad_norm": 0.40664857625961304, + "learning_rate": 5.921550468365452e-05, + "loss": 1.0442, + "step": 5123 + }, + { + "epoch": 0.4578372461858065, + "grad_norm": 0.4115360975265503, + "learning_rate": 5.920128149117028e-05, + "loss": 0.9699, + "step": 5124 + }, + { + "epoch": 0.4579265977170684, + "grad_norm": 0.44247210025787354, + "learning_rate": 5.918705752798062e-05, + "loss": 0.9151, + "step": 5125 + }, + { + "epoch": 0.4580159492483302, + "grad_norm": 0.4786369800567627, + "learning_rate": 5.917283279527697e-05, + "loss": 0.9834, + "step": 5126 + }, + { + "epoch": 0.4581053007795921, + "grad_norm": 0.5580417513847351, + "learning_rate": 5.915860729425079e-05, + "loss": 0.7882, + "step": 5127 + }, + { + "epoch": 0.458194652310854, + "grad_norm": 0.4049915373325348, + "learning_rate": 5.914438102609364e-05, + "loss": 0.9563, + "step": 5128 + }, + { + "epoch": 0.4582840038421158, + "grad_norm": 0.4272023141384125, + "learning_rate": 5.91301539919971e-05, + "loss": 0.9359, + "step": 5129 + }, + { + "epoch": 0.4583733553733777, + "grad_norm": 0.4311048686504364, + "learning_rate": 5.911592619315286e-05, + "loss": 0.9599, + "step": 5130 + }, + { + "epoch": 0.4584627069046396, + "grad_norm": 0.4287171959877014, + "learning_rate": 5.910169763075261e-05, + "loss": 0.9668, + "step": 5131 + }, + { + "epoch": 0.45855205843590147, + "grad_norm": 0.4313339293003082, + "learning_rate": 5.908746830598817e-05, + "loss": 0.9181, + "step": 5132 + }, + { + "epoch": 0.4586414099671633, + "grad_norm": 0.4437636435031891, + "learning_rate": 5.9073238220051394e-05, + "loss": 1.0319, + "step": 5133 + }, + { + "epoch": 0.4587307614984252, + "grad_norm": 0.4116639196872711, + "learning_rate": 5.905900737413421e-05, + "loss": 0.9471, + "step": 5134 + }, + { + "epoch": 0.45882011302968706, + "grad_norm": 0.5594172477722168, + "learning_rate": 5.9044775769428594e-05, + "loss": 0.8384, + "step": 5135 + }, + { + "epoch": 0.4589094645609489, + "grad_norm": 0.4292183816432953, + "learning_rate": 5.903054340712659e-05, + "loss": 1.0041, + "step": 5136 + }, + { + "epoch": 0.45899881609221077, + "grad_norm": 0.5219025015830994, + "learning_rate": 5.9016310288420316e-05, + "loss": 0.8519, + "step": 5137 + }, + { + "epoch": 0.45908816762347265, + "grad_norm": 0.4652097821235657, + "learning_rate": 5.900207641450195e-05, + "loss": 0.9249, + "step": 5138 + }, + { + "epoch": 0.45917751915473454, + "grad_norm": 0.41842982172966003, + "learning_rate": 5.898784178656375e-05, + "loss": 1.0074, + "step": 5139 + }, + { + "epoch": 0.45926687068599636, + "grad_norm": 0.41296178102493286, + "learning_rate": 5.897360640579798e-05, + "loss": 0.9087, + "step": 5140 + }, + { + "epoch": 0.45935622221725825, + "grad_norm": 0.49167054891586304, + "learning_rate": 5.895937027339701e-05, + "loss": 0.9117, + "step": 5141 + }, + { + "epoch": 0.45944557374852013, + "grad_norm": 0.3634302020072937, + "learning_rate": 5.894513339055329e-05, + "loss": 1.0143, + "step": 5142 + }, + { + "epoch": 0.45953492527978196, + "grad_norm": 0.522591233253479, + "learning_rate": 5.89308957584593e-05, + "loss": 0.9263, + "step": 5143 + }, + { + "epoch": 0.45962427681104384, + "grad_norm": 0.49898669123649597, + "learning_rate": 5.89166573783076e-05, + "loss": 0.9532, + "step": 5144 + }, + { + "epoch": 0.4597136283423057, + "grad_norm": 0.44929975271224976, + "learning_rate": 5.890241825129079e-05, + "loss": 0.9635, + "step": 5145 + }, + { + "epoch": 0.4598029798735676, + "grad_norm": 0.4347250461578369, + "learning_rate": 5.8888178378601565e-05, + "loss": 0.9352, + "step": 5146 + }, + { + "epoch": 0.45989233140482944, + "grad_norm": 0.44016188383102417, + "learning_rate": 5.887393776143267e-05, + "loss": 0.9607, + "step": 5147 + }, + { + "epoch": 0.4599816829360913, + "grad_norm": 0.5058707594871521, + "learning_rate": 5.8859696400976884e-05, + "loss": 0.965, + "step": 5148 + }, + { + "epoch": 0.4600710344673532, + "grad_norm": 0.40825730562210083, + "learning_rate": 5.8845454298427094e-05, + "loss": 1.0115, + "step": 5149 + }, + { + "epoch": 0.46016038599861503, + "grad_norm": 0.4351537227630615, + "learning_rate": 5.883121145497622e-05, + "loss": 0.9622, + "step": 5150 + }, + { + "epoch": 0.4602497375298769, + "grad_norm": 0.4099435806274414, + "learning_rate": 5.881696787181724e-05, + "loss": 0.9724, + "step": 5151 + }, + { + "epoch": 0.4603390890611388, + "grad_norm": 0.44913604855537415, + "learning_rate": 5.8802723550143225e-05, + "loss": 0.9009, + "step": 5152 + }, + { + "epoch": 0.4604284405924007, + "grad_norm": 0.417484313249588, + "learning_rate": 5.878847849114728e-05, + "loss": 0.9643, + "step": 5153 + }, + { + "epoch": 0.4605177921236625, + "grad_norm": 0.43109041452407837, + "learning_rate": 5.8774232696022604e-05, + "loss": 0.9845, + "step": 5154 + }, + { + "epoch": 0.4606071436549244, + "grad_norm": 0.432075172662735, + "learning_rate": 5.8759986165962386e-05, + "loss": 1.0115, + "step": 5155 + }, + { + "epoch": 0.4606964951861863, + "grad_norm": 0.49210742115974426, + "learning_rate": 5.8745738902159965e-05, + "loss": 0.8991, + "step": 5156 + }, + { + "epoch": 0.4607858467174481, + "grad_norm": 0.41765424609184265, + "learning_rate": 5.873149090580868e-05, + "loss": 0.9901, + "step": 5157 + }, + { + "epoch": 0.46087519824871, + "grad_norm": 0.5431698560714722, + "learning_rate": 5.8717242178101975e-05, + "loss": 0.9571, + "step": 5158 + }, + { + "epoch": 0.46096454977997187, + "grad_norm": 0.5281333923339844, + "learning_rate": 5.8702992720233296e-05, + "loss": 0.9096, + "step": 5159 + }, + { + "epoch": 0.4610539013112337, + "grad_norm": 0.4179060757160187, + "learning_rate": 5.868874253339624e-05, + "loss": 0.937, + "step": 5160 + }, + { + "epoch": 0.4611432528424956, + "grad_norm": 0.4784322679042816, + "learning_rate": 5.867449161878435e-05, + "loss": 0.989, + "step": 5161 + }, + { + "epoch": 0.46123260437375746, + "grad_norm": 0.431211918592453, + "learning_rate": 5.8660239977591344e-05, + "loss": 1.0558, + "step": 5162 + }, + { + "epoch": 0.46132195590501934, + "grad_norm": 0.4160225987434387, + "learning_rate": 5.8645987611010935e-05, + "loss": 0.9614, + "step": 5163 + }, + { + "epoch": 0.46141130743628117, + "grad_norm": 0.41637277603149414, + "learning_rate": 5.863173452023688e-05, + "loss": 0.9515, + "step": 5164 + }, + { + "epoch": 0.46150065896754305, + "grad_norm": 0.35038280487060547, + "learning_rate": 5.861748070646307e-05, + "loss": 0.9557, + "step": 5165 + }, + { + "epoch": 0.46159001049880494, + "grad_norm": 0.49989181756973267, + "learning_rate": 5.860322617088339e-05, + "loss": 0.9531, + "step": 5166 + }, + { + "epoch": 0.46167936203006676, + "grad_norm": 0.3543355166912079, + "learning_rate": 5.858897091469181e-05, + "loss": 0.971, + "step": 5167 + }, + { + "epoch": 0.46176871356132865, + "grad_norm": 0.4254325032234192, + "learning_rate": 5.857471493908237e-05, + "loss": 1.009, + "step": 5168 + }, + { + "epoch": 0.46185806509259053, + "grad_norm": 0.4708648920059204, + "learning_rate": 5.856045824524914e-05, + "loss": 0.9253, + "step": 5169 + }, + { + "epoch": 0.4619474166238524, + "grad_norm": 0.48328697681427, + "learning_rate": 5.85462008343863e-05, + "loss": 0.8906, + "step": 5170 + }, + { + "epoch": 0.46203676815511424, + "grad_norm": 0.42431074380874634, + "learning_rate": 5.8531942707688034e-05, + "loss": 0.9893, + "step": 5171 + }, + { + "epoch": 0.4621261196863761, + "grad_norm": 0.37147825956344604, + "learning_rate": 5.851768386634863e-05, + "loss": 1.0255, + "step": 5172 + }, + { + "epoch": 0.462215471217638, + "grad_norm": 0.47785985469818115, + "learning_rate": 5.8503424311562406e-05, + "loss": 0.9222, + "step": 5173 + }, + { + "epoch": 0.46230482274889984, + "grad_norm": 0.4230659604072571, + "learning_rate": 5.848916404452375e-05, + "loss": 1.0169, + "step": 5174 + }, + { + "epoch": 0.4623941742801617, + "grad_norm": 0.45284929871559143, + "learning_rate": 5.8474903066427136e-05, + "loss": 1.0053, + "step": 5175 + }, + { + "epoch": 0.4624835258114236, + "grad_norm": 0.4324350357055664, + "learning_rate": 5.846064137846704e-05, + "loss": 0.9689, + "step": 5176 + }, + { + "epoch": 0.4625728773426855, + "grad_norm": 0.4609506130218506, + "learning_rate": 5.844637898183805e-05, + "loss": 0.9361, + "step": 5177 + }, + { + "epoch": 0.4626622288739473, + "grad_norm": 0.45454010367393494, + "learning_rate": 5.8432115877734775e-05, + "loss": 0.8882, + "step": 5178 + }, + { + "epoch": 0.4627515804052092, + "grad_norm": 0.47145554423332214, + "learning_rate": 5.841785206735192e-05, + "loss": 0.9578, + "step": 5179 + }, + { + "epoch": 0.4628409319364711, + "grad_norm": 0.5458246469497681, + "learning_rate": 5.8403587551884244e-05, + "loss": 0.9361, + "step": 5180 + }, + { + "epoch": 0.4629302834677329, + "grad_norm": 0.46121159195899963, + "learning_rate": 5.838932233252652e-05, + "loss": 0.8609, + "step": 5181 + }, + { + "epoch": 0.4630196349989948, + "grad_norm": 0.40931928157806396, + "learning_rate": 5.837505641047364e-05, + "loss": 1.0475, + "step": 5182 + }, + { + "epoch": 0.46310898653025667, + "grad_norm": 0.3603355288505554, + "learning_rate": 5.83607897869205e-05, + "loss": 0.9752, + "step": 5183 + }, + { + "epoch": 0.46319833806151856, + "grad_norm": 0.40766477584838867, + "learning_rate": 5.834652246306209e-05, + "loss": 0.9826, + "step": 5184 + }, + { + "epoch": 0.4632876895927804, + "grad_norm": 0.43302759528160095, + "learning_rate": 5.833225444009348e-05, + "loss": 0.954, + "step": 5185 + }, + { + "epoch": 0.46337704112404227, + "grad_norm": 0.518700122833252, + "learning_rate": 5.831798571920973e-05, + "loss": 1.0633, + "step": 5186 + }, + { + "epoch": 0.46346639265530415, + "grad_norm": 0.4842970371246338, + "learning_rate": 5.8303716301606e-05, + "loss": 0.9383, + "step": 5187 + }, + { + "epoch": 0.463555744186566, + "grad_norm": 0.42370370030403137, + "learning_rate": 5.828944618847753e-05, + "loss": 0.9952, + "step": 5188 + }, + { + "epoch": 0.46364509571782786, + "grad_norm": 0.47464433312416077, + "learning_rate": 5.827517538101959e-05, + "loss": 0.9543, + "step": 5189 + }, + { + "epoch": 0.46373444724908974, + "grad_norm": 0.48624545335769653, + "learning_rate": 5.826090388042751e-05, + "loss": 0.9396, + "step": 5190 + }, + { + "epoch": 0.4638237987803516, + "grad_norm": 0.45458412170410156, + "learning_rate": 5.8246631687896656e-05, + "loss": 0.9381, + "step": 5191 + }, + { + "epoch": 0.46391315031161345, + "grad_norm": 0.43564853072166443, + "learning_rate": 5.823235880462251e-05, + "loss": 0.9548, + "step": 5192 + }, + { + "epoch": 0.46400250184287534, + "grad_norm": 0.46724554896354675, + "learning_rate": 5.8218085231800544e-05, + "loss": 0.9358, + "step": 5193 + }, + { + "epoch": 0.4640918533741372, + "grad_norm": 0.4100496172904968, + "learning_rate": 5.820381097062636e-05, + "loss": 0.9394, + "step": 5194 + }, + { + "epoch": 0.46418120490539905, + "grad_norm": 0.5092671513557434, + "learning_rate": 5.818953602229557e-05, + "loss": 0.9381, + "step": 5195 + }, + { + "epoch": 0.46427055643666093, + "grad_norm": 0.410299688577652, + "learning_rate": 5.8175260388003813e-05, + "loss": 1.0375, + "step": 5196 + }, + { + "epoch": 0.4643599079679228, + "grad_norm": 0.4275777041912079, + "learning_rate": 5.816098406894688e-05, + "loss": 0.9305, + "step": 5197 + }, + { + "epoch": 0.46444925949918464, + "grad_norm": 0.4333481192588806, + "learning_rate": 5.814670706632054e-05, + "loss": 0.9786, + "step": 5198 + }, + { + "epoch": 0.4645386110304465, + "grad_norm": 0.5642595291137695, + "learning_rate": 5.813242938132064e-05, + "loss": 0.9754, + "step": 5199 + }, + { + "epoch": 0.4646279625617084, + "grad_norm": 0.5947056412696838, + "learning_rate": 5.8118151015143094e-05, + "loss": 0.9364, + "step": 5200 + }, + { + "epoch": 0.4647173140929703, + "grad_norm": 0.4831881523132324, + "learning_rate": 5.810387196898387e-05, + "loss": 0.9705, + "step": 5201 + }, + { + "epoch": 0.4648066656242321, + "grad_norm": 0.4384908676147461, + "learning_rate": 5.808959224403898e-05, + "loss": 0.9407, + "step": 5202 + }, + { + "epoch": 0.464896017155494, + "grad_norm": 0.4423673748970032, + "learning_rate": 5.807531184150452e-05, + "loss": 1.0036, + "step": 5203 + }, + { + "epoch": 0.4649853686867559, + "grad_norm": 0.4656173288822174, + "learning_rate": 5.806103076257661e-05, + "loss": 0.9284, + "step": 5204 + }, + { + "epoch": 0.4650747202180177, + "grad_norm": 0.4020898938179016, + "learning_rate": 5.804674900845145e-05, + "loss": 0.9451, + "step": 5205 + }, + { + "epoch": 0.4651640717492796, + "grad_norm": 0.3898525536060333, + "learning_rate": 5.803246658032528e-05, + "loss": 0.9848, + "step": 5206 + }, + { + "epoch": 0.4652534232805415, + "grad_norm": 0.41617026925086975, + "learning_rate": 5.801818347939443e-05, + "loss": 0.9434, + "step": 5207 + }, + { + "epoch": 0.46534277481180336, + "grad_norm": 0.3756409287452698, + "learning_rate": 5.8003899706855235e-05, + "loss": 1.0008, + "step": 5208 + }, + { + "epoch": 0.4654321263430652, + "grad_norm": 0.48038768768310547, + "learning_rate": 5.798961526390413e-05, + "loss": 0.9255, + "step": 5209 + }, + { + "epoch": 0.46552147787432707, + "grad_norm": 0.42943793535232544, + "learning_rate": 5.797533015173759e-05, + "loss": 1.0199, + "step": 5210 + }, + { + "epoch": 0.46561082940558896, + "grad_norm": 0.5280621647834778, + "learning_rate": 5.796104437155213e-05, + "loss": 0.9118, + "step": 5211 + }, + { + "epoch": 0.4657001809368508, + "grad_norm": 0.4121812582015991, + "learning_rate": 5.794675792454434e-05, + "loss": 0.9665, + "step": 5212 + }, + { + "epoch": 0.46578953246811267, + "grad_norm": 0.43791842460632324, + "learning_rate": 5.793247081191087e-05, + "loss": 0.9876, + "step": 5213 + }, + { + "epoch": 0.46587888399937455, + "grad_norm": 0.38042962551116943, + "learning_rate": 5.791818303484842e-05, + "loss": 0.916, + "step": 5214 + }, + { + "epoch": 0.46596823553063643, + "grad_norm": 0.4212323725223541, + "learning_rate": 5.790389459455374e-05, + "loss": 0.9963, + "step": 5215 + }, + { + "epoch": 0.46605758706189826, + "grad_norm": 0.4514927566051483, + "learning_rate": 5.788960549222364e-05, + "loss": 0.9769, + "step": 5216 + }, + { + "epoch": 0.46614693859316014, + "grad_norm": 0.4762234687805176, + "learning_rate": 5.787531572905498e-05, + "loss": 1.0555, + "step": 5217 + }, + { + "epoch": 0.466236290124422, + "grad_norm": 0.3893480598926544, + "learning_rate": 5.786102530624469e-05, + "loss": 0.9298, + "step": 5218 + }, + { + "epoch": 0.46632564165568385, + "grad_norm": 0.4439602494239807, + "learning_rate": 5.784673422498972e-05, + "loss": 0.9701, + "step": 5219 + }, + { + "epoch": 0.46641499318694574, + "grad_norm": 0.44786638021469116, + "learning_rate": 5.783244248648714e-05, + "loss": 0.9189, + "step": 5220 + }, + { + "epoch": 0.4665043447182076, + "grad_norm": 0.4381410479545593, + "learning_rate": 5.7818150091934e-05, + "loss": 1.0335, + "step": 5221 + }, + { + "epoch": 0.4665936962494695, + "grad_norm": 0.4737434685230255, + "learning_rate": 5.780385704252746e-05, + "loss": 0.9773, + "step": 5222 + }, + { + "epoch": 0.46668304778073133, + "grad_norm": 0.41768383979797363, + "learning_rate": 5.7789563339464695e-05, + "loss": 0.9492, + "step": 5223 + }, + { + "epoch": 0.4667723993119932, + "grad_norm": 0.5461154580116272, + "learning_rate": 5.777526898394298e-05, + "loss": 0.8902, + "step": 5224 + }, + { + "epoch": 0.4668617508432551, + "grad_norm": 0.46472278237342834, + "learning_rate": 5.7760973977159615e-05, + "loss": 0.9915, + "step": 5225 + }, + { + "epoch": 0.4669511023745169, + "grad_norm": 0.4949001669883728, + "learning_rate": 5.7746678320311954e-05, + "loss": 0.8738, + "step": 5226 + }, + { + "epoch": 0.4670404539057788, + "grad_norm": 0.5098089575767517, + "learning_rate": 5.77323820145974e-05, + "loss": 0.9527, + "step": 5227 + }, + { + "epoch": 0.4671298054370407, + "grad_norm": 0.4948689341545105, + "learning_rate": 5.7718085061213436e-05, + "loss": 0.9125, + "step": 5228 + }, + { + "epoch": 0.4672191569683025, + "grad_norm": 0.4423523247241974, + "learning_rate": 5.7703787461357575e-05, + "loss": 1.0395, + "step": 5229 + }, + { + "epoch": 0.4673085084995644, + "grad_norm": 0.4000280499458313, + "learning_rate": 5.7689489216227413e-05, + "loss": 0.9811, + "step": 5230 + }, + { + "epoch": 0.4673978600308263, + "grad_norm": 0.4417012333869934, + "learning_rate": 5.767519032702055e-05, + "loss": 0.9685, + "step": 5231 + }, + { + "epoch": 0.46748721156208817, + "grad_norm": 0.5120064616203308, + "learning_rate": 5.7660890794934676e-05, + "loss": 0.8708, + "step": 5232 + }, + { + "epoch": 0.46757656309335, + "grad_norm": 0.4820534288883209, + "learning_rate": 5.7646590621167564e-05, + "loss": 0.9201, + "step": 5233 + }, + { + "epoch": 0.4676659146246119, + "grad_norm": 0.46045050024986267, + "learning_rate": 5.763228980691696e-05, + "loss": 0.9647, + "step": 5234 + }, + { + "epoch": 0.46775526615587376, + "grad_norm": 0.448191374540329, + "learning_rate": 5.7617988353380746e-05, + "loss": 0.9384, + "step": 5235 + }, + { + "epoch": 0.4678446176871356, + "grad_norm": 0.5090450644493103, + "learning_rate": 5.76036862617568e-05, + "loss": 0.8526, + "step": 5236 + }, + { + "epoch": 0.46793396921839747, + "grad_norm": 0.4289693236351013, + "learning_rate": 5.758938353324308e-05, + "loss": 0.9773, + "step": 5237 + }, + { + "epoch": 0.46802332074965936, + "grad_norm": 0.522016704082489, + "learning_rate": 5.757508016903759e-05, + "loss": 0.867, + "step": 5238 + }, + { + "epoch": 0.46811267228092124, + "grad_norm": 0.48837369680404663, + "learning_rate": 5.756077617033838e-05, + "loss": 0.9976, + "step": 5239 + }, + { + "epoch": 0.46820202381218307, + "grad_norm": 0.41904178261756897, + "learning_rate": 5.75464715383436e-05, + "loss": 1.0333, + "step": 5240 + }, + { + "epoch": 0.46829137534344495, + "grad_norm": 0.4596676230430603, + "learning_rate": 5.7532166274251364e-05, + "loss": 0.9308, + "step": 5241 + }, + { + "epoch": 0.46838072687470683, + "grad_norm": 0.40436744689941406, + "learning_rate": 5.751786037925993e-05, + "loss": 1.0501, + "step": 5242 + }, + { + "epoch": 0.46847007840596866, + "grad_norm": 0.4699536859989166, + "learning_rate": 5.750355385456755e-05, + "loss": 0.9808, + "step": 5243 + }, + { + "epoch": 0.46855942993723054, + "grad_norm": 0.4330219030380249, + "learning_rate": 5.7489246701372545e-05, + "loss": 0.9724, + "step": 5244 + }, + { + "epoch": 0.4686487814684924, + "grad_norm": 0.421366810798645, + "learning_rate": 5.74749389208733e-05, + "loss": 1.0166, + "step": 5245 + }, + { + "epoch": 0.4687381329997543, + "grad_norm": 0.39331406354904175, + "learning_rate": 5.746063051426827e-05, + "loss": 0.9864, + "step": 5246 + }, + { + "epoch": 0.46882748453101614, + "grad_norm": 0.5093923807144165, + "learning_rate": 5.744632148275588e-05, + "loss": 0.9477, + "step": 5247 + }, + { + "epoch": 0.468916836062278, + "grad_norm": 0.480220228433609, + "learning_rate": 5.74320118275347e-05, + "loss": 0.934, + "step": 5248 + }, + { + "epoch": 0.4690061875935399, + "grad_norm": 0.4359627664089203, + "learning_rate": 5.741770154980331e-05, + "loss": 0.9851, + "step": 5249 + }, + { + "epoch": 0.46909553912480173, + "grad_norm": 0.4476899802684784, + "learning_rate": 5.7403390650760356e-05, + "loss": 0.8952, + "step": 5250 + }, + { + "epoch": 0.4691848906560636, + "grad_norm": 0.42261365056037903, + "learning_rate": 5.738907913160452e-05, + "loss": 0.9202, + "step": 5251 + }, + { + "epoch": 0.4692742421873255, + "grad_norm": 0.4985414743423462, + "learning_rate": 5.7374766993534545e-05, + "loss": 0.8766, + "step": 5252 + }, + { + "epoch": 0.4693635937185874, + "grad_norm": 0.5040563941001892, + "learning_rate": 5.736045423774923e-05, + "loss": 0.9752, + "step": 5253 + }, + { + "epoch": 0.4694529452498492, + "grad_norm": 0.4096861779689789, + "learning_rate": 5.7346140865447395e-05, + "loss": 1.0323, + "step": 5254 + }, + { + "epoch": 0.4695422967811111, + "grad_norm": 0.4268275499343872, + "learning_rate": 5.7331826877827965e-05, + "loss": 1.0152, + "step": 5255 + }, + { + "epoch": 0.469631648312373, + "grad_norm": 0.4520736336708069, + "learning_rate": 5.73175122760899e-05, + "loss": 0.9099, + "step": 5256 + }, + { + "epoch": 0.4697209998436348, + "grad_norm": 0.4920952618122101, + "learning_rate": 5.7303197061432165e-05, + "loss": 0.9575, + "step": 5257 + }, + { + "epoch": 0.4698103513748967, + "grad_norm": 0.44144368171691895, + "learning_rate": 5.728888123505384e-05, + "loss": 0.9656, + "step": 5258 + }, + { + "epoch": 0.46989970290615857, + "grad_norm": 0.39283469319343567, + "learning_rate": 5.727456479815401e-05, + "loss": 1.0424, + "step": 5259 + }, + { + "epoch": 0.4699890544374204, + "grad_norm": 0.4499633312225342, + "learning_rate": 5.726024775193184e-05, + "loss": 0.9702, + "step": 5260 + }, + { + "epoch": 0.4700784059686823, + "grad_norm": 0.48507824540138245, + "learning_rate": 5.724593009758653e-05, + "loss": 0.9632, + "step": 5261 + }, + { + "epoch": 0.47016775749994416, + "grad_norm": 0.4047746956348419, + "learning_rate": 5.723161183631734e-05, + "loss": 0.9593, + "step": 5262 + }, + { + "epoch": 0.47025710903120604, + "grad_norm": 0.41394850611686707, + "learning_rate": 5.7217292969323576e-05, + "loss": 0.9266, + "step": 5263 + }, + { + "epoch": 0.47034646056246787, + "grad_norm": 0.45513802766799927, + "learning_rate": 5.720297349780459e-05, + "loss": 1.0043, + "step": 5264 + }, + { + "epoch": 0.47043581209372975, + "grad_norm": 0.4691244959831238, + "learning_rate": 5.718865342295979e-05, + "loss": 0.9583, + "step": 5265 + }, + { + "epoch": 0.47052516362499164, + "grad_norm": 0.49911266565322876, + "learning_rate": 5.7174332745988666e-05, + "loss": 0.924, + "step": 5266 + }, + { + "epoch": 0.47061451515625347, + "grad_norm": 0.441734254360199, + "learning_rate": 5.716001146809068e-05, + "loss": 0.9989, + "step": 5267 + }, + { + "epoch": 0.47070386668751535, + "grad_norm": 0.40593159198760986, + "learning_rate": 5.714568959046542e-05, + "loss": 0.936, + "step": 5268 + }, + { + "epoch": 0.47079321821877723, + "grad_norm": 0.3953614830970764, + "learning_rate": 5.713136711431249e-05, + "loss": 0.9676, + "step": 5269 + }, + { + "epoch": 0.4708825697500391, + "grad_norm": 0.48885035514831543, + "learning_rate": 5.7117044040831544e-05, + "loss": 0.9002, + "step": 5270 + }, + { + "epoch": 0.47097192128130094, + "grad_norm": 0.4319499433040619, + "learning_rate": 5.7102720371222305e-05, + "loss": 1.0525, + "step": 5271 + }, + { + "epoch": 0.4710612728125628, + "grad_norm": 0.48925381898880005, + "learning_rate": 5.708839610668453e-05, + "loss": 1.023, + "step": 5272 + }, + { + "epoch": 0.4711506243438247, + "grad_norm": 0.39478790760040283, + "learning_rate": 5.7074071248418e-05, + "loss": 0.9983, + "step": 5273 + }, + { + "epoch": 0.47123997587508654, + "grad_norm": 0.5163027048110962, + "learning_rate": 5.7059745797622624e-05, + "loss": 0.9257, + "step": 5274 + }, + { + "epoch": 0.4713293274063484, + "grad_norm": 0.5084983110427856, + "learning_rate": 5.7045419755498264e-05, + "loss": 0.9036, + "step": 5275 + }, + { + "epoch": 0.4714186789376103, + "grad_norm": 0.5043264031410217, + "learning_rate": 5.7031093123244925e-05, + "loss": 1.005, + "step": 5276 + }, + { + "epoch": 0.4715080304688722, + "grad_norm": 0.4621896743774414, + "learning_rate": 5.7016765902062574e-05, + "loss": 0.9614, + "step": 5277 + }, + { + "epoch": 0.471597382000134, + "grad_norm": 0.45665669441223145, + "learning_rate": 5.700243809315129e-05, + "loss": 0.9324, + "step": 5278 + }, + { + "epoch": 0.4716867335313959, + "grad_norm": 0.4607732892036438, + "learning_rate": 5.6988109697711164e-05, + "loss": 1.0348, + "step": 5279 + }, + { + "epoch": 0.4717760850626578, + "grad_norm": 0.3779040277004242, + "learning_rate": 5.697378071694238e-05, + "loss": 1.0202, + "step": 5280 + }, + { + "epoch": 0.4718654365939196, + "grad_norm": 0.4088393449783325, + "learning_rate": 5.6959451152045126e-05, + "loss": 0.9944, + "step": 5281 + }, + { + "epoch": 0.4719547881251815, + "grad_norm": 0.5540783405303955, + "learning_rate": 5.694512100421965e-05, + "loss": 0.9225, + "step": 5282 + }, + { + "epoch": 0.4720441396564434, + "grad_norm": 0.4990440011024475, + "learning_rate": 5.6930790274666256e-05, + "loss": 0.9773, + "step": 5283 + }, + { + "epoch": 0.47213349118770526, + "grad_norm": 0.4320768713951111, + "learning_rate": 5.691645896458529e-05, + "loss": 1.0435, + "step": 5284 + }, + { + "epoch": 0.4722228427189671, + "grad_norm": 0.47398531436920166, + "learning_rate": 5.690212707517719e-05, + "loss": 0.9129, + "step": 5285 + }, + { + "epoch": 0.47231219425022897, + "grad_norm": 0.468569815158844, + "learning_rate": 5.688779460764235e-05, + "loss": 0.9181, + "step": 5286 + }, + { + "epoch": 0.47240154578149085, + "grad_norm": 0.37630656361579895, + "learning_rate": 5.687346156318132e-05, + "loss": 1.0331, + "step": 5287 + }, + { + "epoch": 0.4724908973127527, + "grad_norm": 0.49889349937438965, + "learning_rate": 5.68591279429946e-05, + "loss": 0.9137, + "step": 5288 + }, + { + "epoch": 0.47258024884401456, + "grad_norm": 0.4064541161060333, + "learning_rate": 5.68447937482828e-05, + "loss": 1.0404, + "step": 5289 + }, + { + "epoch": 0.47266960037527644, + "grad_norm": 0.416142076253891, + "learning_rate": 5.6830458980246574e-05, + "loss": 0.9972, + "step": 5290 + }, + { + "epoch": 0.47275895190653827, + "grad_norm": 0.5175201892852783, + "learning_rate": 5.681612364008659e-05, + "loss": 0.9116, + "step": 5291 + }, + { + "epoch": 0.47284830343780015, + "grad_norm": 0.5164797902107239, + "learning_rate": 5.6801787729003595e-05, + "loss": 0.941, + "step": 5292 + }, + { + "epoch": 0.47293765496906204, + "grad_norm": 0.4773990213871002, + "learning_rate": 5.6787451248198384e-05, + "loss": 0.8495, + "step": 5293 + }, + { + "epoch": 0.4730270065003239, + "grad_norm": 0.4284185469150543, + "learning_rate": 5.677311419887178e-05, + "loss": 1.0105, + "step": 5294 + }, + { + "epoch": 0.47311635803158575, + "grad_norm": 0.39999738335609436, + "learning_rate": 5.675877658222466e-05, + "loss": 0.9562, + "step": 5295 + }, + { + "epoch": 0.47320570956284763, + "grad_norm": 0.537138044834137, + "learning_rate": 5.674443839945797e-05, + "loss": 1.0542, + "step": 5296 + }, + { + "epoch": 0.4732950610941095, + "grad_norm": 0.40088164806365967, + "learning_rate": 5.673009965177266e-05, + "loss": 0.9687, + "step": 5297 + }, + { + "epoch": 0.47338441262537134, + "grad_norm": 0.4030899405479431, + "learning_rate": 5.6715760340369775e-05, + "loss": 1.0254, + "step": 5298 + }, + { + "epoch": 0.4734737641566332, + "grad_norm": 0.41325297951698303, + "learning_rate": 5.670142046645037e-05, + "loss": 0.9592, + "step": 5299 + }, + { + "epoch": 0.4735631156878951, + "grad_norm": 0.5127687454223633, + "learning_rate": 5.668708003121557e-05, + "loss": 0.9605, + "step": 5300 + }, + { + "epoch": 0.473652467219157, + "grad_norm": 0.4479494094848633, + "learning_rate": 5.667273903586655e-05, + "loss": 0.9453, + "step": 5301 + }, + { + "epoch": 0.4737418187504188, + "grad_norm": 0.41398364305496216, + "learning_rate": 5.66583974816045e-05, + "loss": 0.8632, + "step": 5302 + }, + { + "epoch": 0.4738311702816807, + "grad_norm": 0.3753226101398468, + "learning_rate": 5.66440553696307e-05, + "loss": 1.002, + "step": 5303 + }, + { + "epoch": 0.4739205218129426, + "grad_norm": 0.4559895694255829, + "learning_rate": 5.662971270114642e-05, + "loss": 0.9741, + "step": 5304 + }, + { + "epoch": 0.4740098733442044, + "grad_norm": 0.4614042043685913, + "learning_rate": 5.661536947735304e-05, + "loss": 0.9813, + "step": 5305 + }, + { + "epoch": 0.4740992248754663, + "grad_norm": 0.4653113782405853, + "learning_rate": 5.660102569945197e-05, + "loss": 0.9023, + "step": 5306 + }, + { + "epoch": 0.4741885764067282, + "grad_norm": 0.4278392493724823, + "learning_rate": 5.658668136864462e-05, + "loss": 0.9543, + "step": 5307 + }, + { + "epoch": 0.47427792793799006, + "grad_norm": 0.4447217583656311, + "learning_rate": 5.6572336486132495e-05, + "loss": 0.9646, + "step": 5308 + }, + { + "epoch": 0.4743672794692519, + "grad_norm": 0.4308806359767914, + "learning_rate": 5.655799105311713e-05, + "loss": 0.8999, + "step": 5309 + }, + { + "epoch": 0.4744566310005138, + "grad_norm": 0.401542603969574, + "learning_rate": 5.654364507080011e-05, + "loss": 0.9943, + "step": 5310 + }, + { + "epoch": 0.47454598253177566, + "grad_norm": 0.4810407757759094, + "learning_rate": 5.652929854038309e-05, + "loss": 0.8805, + "step": 5311 + }, + { + "epoch": 0.4746353340630375, + "grad_norm": 0.3643689453601837, + "learning_rate": 5.65149514630677e-05, + "loss": 0.9883, + "step": 5312 + }, + { + "epoch": 0.47472468559429937, + "grad_norm": 0.34783321619033813, + "learning_rate": 5.650060384005568e-05, + "loss": 0.9739, + "step": 5313 + }, + { + "epoch": 0.47481403712556125, + "grad_norm": 0.45785385370254517, + "learning_rate": 5.6486255672548794e-05, + "loss": 0.9802, + "step": 5314 + }, + { + "epoch": 0.47490338865682313, + "grad_norm": 0.5383105278015137, + "learning_rate": 5.6471906961748856e-05, + "loss": 0.9742, + "step": 5315 + }, + { + "epoch": 0.47499274018808496, + "grad_norm": 0.5157361626625061, + "learning_rate": 5.6457557708857745e-05, + "loss": 0.9478, + "step": 5316 + }, + { + "epoch": 0.47508209171934684, + "grad_norm": 0.43339022994041443, + "learning_rate": 5.644320791507732e-05, + "loss": 0.965, + "step": 5317 + }, + { + "epoch": 0.4751714432506087, + "grad_norm": 0.4871036112308502, + "learning_rate": 5.642885758160956e-05, + "loss": 0.9936, + "step": 5318 + }, + { + "epoch": 0.47526079478187055, + "grad_norm": 0.4251110553741455, + "learning_rate": 5.641450670965646e-05, + "loss": 1.0055, + "step": 5319 + }, + { + "epoch": 0.47535014631313244, + "grad_norm": 0.46318215131759644, + "learning_rate": 5.640015530042004e-05, + "loss": 0.9703, + "step": 5320 + }, + { + "epoch": 0.4754394978443943, + "grad_norm": 0.4937099814414978, + "learning_rate": 5.6385803355102395e-05, + "loss": 0.9328, + "step": 5321 + }, + { + "epoch": 0.47552884937565615, + "grad_norm": 0.4998278021812439, + "learning_rate": 5.637145087490566e-05, + "loss": 0.9198, + "step": 5322 + }, + { + "epoch": 0.47561820090691803, + "grad_norm": 0.43673455715179443, + "learning_rate": 5.6357097861031984e-05, + "loss": 0.9822, + "step": 5323 + }, + { + "epoch": 0.4757075524381799, + "grad_norm": 0.5582587122917175, + "learning_rate": 5.63427443146836e-05, + "loss": 0.9003, + "step": 5324 + }, + { + "epoch": 0.4757969039694418, + "grad_norm": 0.47949889302253723, + "learning_rate": 5.632839023706278e-05, + "loss": 0.9357, + "step": 5325 + }, + { + "epoch": 0.4758862555007036, + "grad_norm": 0.480672150850296, + "learning_rate": 5.631403562937183e-05, + "loss": 0.9649, + "step": 5326 + }, + { + "epoch": 0.4759756070319655, + "grad_norm": 0.44741761684417725, + "learning_rate": 5.629968049281308e-05, + "loss": 1.0556, + "step": 5327 + }, + { + "epoch": 0.4760649585632274, + "grad_norm": 0.5157933235168457, + "learning_rate": 5.6285324828588934e-05, + "loss": 1.0001, + "step": 5328 + }, + { + "epoch": 0.4761543100944892, + "grad_norm": 0.45148682594299316, + "learning_rate": 5.6270968637901844e-05, + "loss": 0.9291, + "step": 5329 + }, + { + "epoch": 0.4762436616257511, + "grad_norm": 0.43948331475257874, + "learning_rate": 5.625661192195428e-05, + "loss": 0.9562, + "step": 5330 + }, + { + "epoch": 0.476333013157013, + "grad_norm": 0.49232932925224304, + "learning_rate": 5.624225468194879e-05, + "loss": 0.8804, + "step": 5331 + }, + { + "epoch": 0.47642236468827487, + "grad_norm": 0.44230684638023376, + "learning_rate": 5.622789691908794e-05, + "loss": 0.9769, + "step": 5332 + }, + { + "epoch": 0.4765117162195367, + "grad_norm": 0.46092236042022705, + "learning_rate": 5.621353863457431e-05, + "loss": 0.9966, + "step": 5333 + }, + { + "epoch": 0.4766010677507986, + "grad_norm": 0.45058542490005493, + "learning_rate": 5.61991798296106e-05, + "loss": 0.921, + "step": 5334 + }, + { + "epoch": 0.47669041928206046, + "grad_norm": 0.4571652114391327, + "learning_rate": 5.6184820505399514e-05, + "loss": 1.0159, + "step": 5335 + }, + { + "epoch": 0.4767797708133223, + "grad_norm": 0.45455148816108704, + "learning_rate": 5.617046066314377e-05, + "loss": 0.9191, + "step": 5336 + }, + { + "epoch": 0.4768691223445842, + "grad_norm": 0.5318629741668701, + "learning_rate": 5.6156100304046186e-05, + "loss": 0.8992, + "step": 5337 + }, + { + "epoch": 0.47695847387584606, + "grad_norm": 0.42745161056518555, + "learning_rate": 5.614173942930958e-05, + "loss": 1.0188, + "step": 5338 + }, + { + "epoch": 0.47704782540710794, + "grad_norm": 0.4684041738510132, + "learning_rate": 5.6127378040136826e-05, + "loss": 1.0166, + "step": 5339 + }, + { + "epoch": 0.47713717693836977, + "grad_norm": 0.5044588446617126, + "learning_rate": 5.6113016137730844e-05, + "loss": 0.9859, + "step": 5340 + }, + { + "epoch": 0.47722652846963165, + "grad_norm": 0.41825881600379944, + "learning_rate": 5.6098653723294604e-05, + "loss": 0.9519, + "step": 5341 + }, + { + "epoch": 0.47731588000089353, + "grad_norm": 0.5564168095588684, + "learning_rate": 5.6084290798031116e-05, + "loss": 0.9854, + "step": 5342 + }, + { + "epoch": 0.47740523153215536, + "grad_norm": 0.45696184039115906, + "learning_rate": 5.6069927363143405e-05, + "loss": 0.9567, + "step": 5343 + }, + { + "epoch": 0.47749458306341724, + "grad_norm": 0.49420660734176636, + "learning_rate": 5.6055563419834575e-05, + "loss": 0.8842, + "step": 5344 + }, + { + "epoch": 0.4775839345946791, + "grad_norm": 0.4271746873855591, + "learning_rate": 5.604119896930777e-05, + "loss": 0.9697, + "step": 5345 + }, + { + "epoch": 0.477673286125941, + "grad_norm": 0.4472793936729431, + "learning_rate": 5.602683401276615e-05, + "loss": 0.9975, + "step": 5346 + }, + { + "epoch": 0.47776263765720284, + "grad_norm": 0.5239841341972351, + "learning_rate": 5.601246855141296e-05, + "loss": 0.9098, + "step": 5347 + }, + { + "epoch": 0.4778519891884647, + "grad_norm": 0.4218825697898865, + "learning_rate": 5.599810258645142e-05, + "loss": 0.9914, + "step": 5348 + }, + { + "epoch": 0.4779413407197266, + "grad_norm": 0.4484994113445282, + "learning_rate": 5.598373611908485e-05, + "loss": 0.953, + "step": 5349 + }, + { + "epoch": 0.47803069225098843, + "grad_norm": 0.49346399307250977, + "learning_rate": 5.596936915051662e-05, + "loss": 0.9513, + "step": 5350 + }, + { + "epoch": 0.4781200437822503, + "grad_norm": 0.5029492974281311, + "learning_rate": 5.595500168195007e-05, + "loss": 0.9715, + "step": 5351 + }, + { + "epoch": 0.4782093953135122, + "grad_norm": 0.4443117678165436, + "learning_rate": 5.5940633714588675e-05, + "loss": 0.9222, + "step": 5352 + }, + { + "epoch": 0.478298746844774, + "grad_norm": 0.4409370720386505, + "learning_rate": 5.592626524963587e-05, + "loss": 0.9049, + "step": 5353 + }, + { + "epoch": 0.4783880983760359, + "grad_norm": 0.44364142417907715, + "learning_rate": 5.5911896288295183e-05, + "loss": 0.9859, + "step": 5354 + }, + { + "epoch": 0.4784774499072978, + "grad_norm": 0.52082759141922, + "learning_rate": 5.5897526831770175e-05, + "loss": 0.9021, + "step": 5355 + }, + { + "epoch": 0.4785668014385597, + "grad_norm": 0.38146278262138367, + "learning_rate": 5.588315688126442e-05, + "loss": 1.0069, + "step": 5356 + }, + { + "epoch": 0.4786561529698215, + "grad_norm": 0.44152483344078064, + "learning_rate": 5.586878643798158e-05, + "loss": 1.0141, + "step": 5357 + }, + { + "epoch": 0.4787455045010834, + "grad_norm": 0.5023549795150757, + "learning_rate": 5.58544155031253e-05, + "loss": 0.964, + "step": 5358 + }, + { + "epoch": 0.47883485603234527, + "grad_norm": 0.43117037415504456, + "learning_rate": 5.584004407789933e-05, + "loss": 1.0089, + "step": 5359 + }, + { + "epoch": 0.4789242075636071, + "grad_norm": 0.44485437870025635, + "learning_rate": 5.582567216350741e-05, + "loss": 0.9615, + "step": 5360 + }, + { + "epoch": 0.479013559094869, + "grad_norm": 0.4267984628677368, + "learning_rate": 5.581129976115336e-05, + "loss": 0.9292, + "step": 5361 + }, + { + "epoch": 0.47910291062613086, + "grad_norm": 0.4499671459197998, + "learning_rate": 5.579692687204101e-05, + "loss": 0.9711, + "step": 5362 + }, + { + "epoch": 0.47919226215739275, + "grad_norm": 0.5573053956031799, + "learning_rate": 5.578255349737423e-05, + "loss": 0.965, + "step": 5363 + }, + { + "epoch": 0.4792816136886546, + "grad_norm": 0.41541776061058044, + "learning_rate": 5.5768179638356963e-05, + "loss": 0.9682, + "step": 5364 + }, + { + "epoch": 0.47937096521991646, + "grad_norm": 0.4482407569885254, + "learning_rate": 5.575380529619317e-05, + "loss": 0.9534, + "step": 5365 + }, + { + "epoch": 0.47946031675117834, + "grad_norm": 0.4427946209907532, + "learning_rate": 5.573943047208685e-05, + "loss": 1.0162, + "step": 5366 + }, + { + "epoch": 0.47954966828244017, + "grad_norm": 0.5755056738853455, + "learning_rate": 5.572505516724207e-05, + "loss": 0.9427, + "step": 5367 + }, + { + "epoch": 0.47963901981370205, + "grad_norm": 0.45547792315483093, + "learning_rate": 5.5710679382862874e-05, + "loss": 0.9308, + "step": 5368 + }, + { + "epoch": 0.47972837134496393, + "grad_norm": 0.5226430892944336, + "learning_rate": 5.5696303120153405e-05, + "loss": 0.9416, + "step": 5369 + }, + { + "epoch": 0.4798177228762258, + "grad_norm": 0.4200889468193054, + "learning_rate": 5.568192638031784e-05, + "loss": 1.042, + "step": 5370 + }, + { + "epoch": 0.47990707440748764, + "grad_norm": 0.4149283468723297, + "learning_rate": 5.566754916456037e-05, + "loss": 0.9486, + "step": 5371 + }, + { + "epoch": 0.4799964259387495, + "grad_norm": 0.5608735680580139, + "learning_rate": 5.5653171474085254e-05, + "loss": 0.8783, + "step": 5372 + }, + { + "epoch": 0.4800857774700114, + "grad_norm": 0.47211307287216187, + "learning_rate": 5.563879331009677e-05, + "loss": 0.9475, + "step": 5373 + }, + { + "epoch": 0.48017512900127324, + "grad_norm": 0.38663092255592346, + "learning_rate": 5.562441467379923e-05, + "loss": 0.9769, + "step": 5374 + }, + { + "epoch": 0.4802644805325351, + "grad_norm": 0.36634334921836853, + "learning_rate": 5.5610035566397014e-05, + "loss": 0.9618, + "step": 5375 + }, + { + "epoch": 0.480353832063797, + "grad_norm": 0.4378538131713867, + "learning_rate": 5.559565598909452e-05, + "loss": 0.9778, + "step": 5376 + }, + { + "epoch": 0.4804431835950589, + "grad_norm": 0.40313366055488586, + "learning_rate": 5.55812759430962e-05, + "loss": 0.9487, + "step": 5377 + }, + { + "epoch": 0.4805325351263207, + "grad_norm": 0.4929332733154297, + "learning_rate": 5.5566895429606516e-05, + "loss": 0.9624, + "step": 5378 + }, + { + "epoch": 0.4806218866575826, + "grad_norm": 0.5455753803253174, + "learning_rate": 5.5552514449830005e-05, + "loss": 0.9789, + "step": 5379 + }, + { + "epoch": 0.4807112381888445, + "grad_norm": 0.3865823745727539, + "learning_rate": 5.5538133004971216e-05, + "loss": 0.9508, + "step": 5380 + }, + { + "epoch": 0.4808005897201063, + "grad_norm": 0.48918184638023376, + "learning_rate": 5.5523751096234765e-05, + "loss": 0.951, + "step": 5381 + }, + { + "epoch": 0.4808899412513682, + "grad_norm": 0.49451667070388794, + "learning_rate": 5.550936872482528e-05, + "loss": 0.9803, + "step": 5382 + }, + { + "epoch": 0.4809792927826301, + "grad_norm": 0.40212544798851013, + "learning_rate": 5.549498589194744e-05, + "loss": 0.9332, + "step": 5383 + }, + { + "epoch": 0.4810686443138919, + "grad_norm": 0.44129249453544617, + "learning_rate": 5.5480602598805955e-05, + "loss": 0.9288, + "step": 5384 + }, + { + "epoch": 0.4811579958451538, + "grad_norm": 0.4733429551124573, + "learning_rate": 5.546621884660558e-05, + "loss": 0.9687, + "step": 5385 + }, + { + "epoch": 0.48124734737641567, + "grad_norm": 0.44276171922683716, + "learning_rate": 5.545183463655112e-05, + "loss": 0.9343, + "step": 5386 + }, + { + "epoch": 0.48133669890767755, + "grad_norm": 0.505408525466919, + "learning_rate": 5.54374499698474e-05, + "loss": 0.8595, + "step": 5387 + }, + { + "epoch": 0.4814260504389394, + "grad_norm": 0.4253001809120178, + "learning_rate": 5.542306484769927e-05, + "loss": 0.9035, + "step": 5388 + }, + { + "epoch": 0.48151540197020126, + "grad_norm": 0.5153298377990723, + "learning_rate": 5.540867927131166e-05, + "loss": 0.9208, + "step": 5389 + }, + { + "epoch": 0.48160475350146315, + "grad_norm": 0.443081796169281, + "learning_rate": 5.5394293241889516e-05, + "loss": 0.9694, + "step": 5390 + }, + { + "epoch": 0.481694105032725, + "grad_norm": 0.4136885404586792, + "learning_rate": 5.5379906760637814e-05, + "loss": 0.9473, + "step": 5391 + }, + { + "epoch": 0.48178345656398686, + "grad_norm": 0.3998454809188843, + "learning_rate": 5.536551982876157e-05, + "loss": 0.9494, + "step": 5392 + }, + { + "epoch": 0.48187280809524874, + "grad_norm": 0.410869836807251, + "learning_rate": 5.535113244746585e-05, + "loss": 0.979, + "step": 5393 + }, + { + "epoch": 0.4819621596265106, + "grad_norm": 0.4814499318599701, + "learning_rate": 5.5336744617955735e-05, + "loss": 0.9871, + "step": 5394 + }, + { + "epoch": 0.48205151115777245, + "grad_norm": 0.44015729427337646, + "learning_rate": 5.532235634143639e-05, + "loss": 1.0168, + "step": 5395 + }, + { + "epoch": 0.48214086268903433, + "grad_norm": 0.38507404923439026, + "learning_rate": 5.530796761911295e-05, + "loss": 0.9503, + "step": 5396 + }, + { + "epoch": 0.4822302142202962, + "grad_norm": 0.5117488503456116, + "learning_rate": 5.5293578452190675e-05, + "loss": 0.9804, + "step": 5397 + }, + { + "epoch": 0.48231956575155804, + "grad_norm": 0.4578893780708313, + "learning_rate": 5.527918884187475e-05, + "loss": 1.038, + "step": 5398 + }, + { + "epoch": 0.4824089172828199, + "grad_norm": 0.5064082145690918, + "learning_rate": 5.5264798789370496e-05, + "loss": 0.8768, + "step": 5399 + }, + { + "epoch": 0.4824982688140818, + "grad_norm": 0.42315277457237244, + "learning_rate": 5.525040829588323e-05, + "loss": 0.9454, + "step": 5400 + }, + { + "epoch": 0.4825876203453437, + "grad_norm": 0.3969995379447937, + "learning_rate": 5.52360173626183e-05, + "loss": 0.9996, + "step": 5401 + }, + { + "epoch": 0.4826769718766055, + "grad_norm": 0.5402255654335022, + "learning_rate": 5.5221625990781115e-05, + "loss": 0.9203, + "step": 5402 + }, + { + "epoch": 0.4827663234078674, + "grad_norm": 0.4401646554470062, + "learning_rate": 5.520723418157707e-05, + "loss": 0.9595, + "step": 5403 + }, + { + "epoch": 0.4828556749391293, + "grad_norm": 0.4190303683280945, + "learning_rate": 5.519284193621167e-05, + "loss": 1.0173, + "step": 5404 + }, + { + "epoch": 0.4829450264703911, + "grad_norm": 0.38738173246383667, + "learning_rate": 5.517844925589041e-05, + "loss": 0.9696, + "step": 5405 + }, + { + "epoch": 0.483034378001653, + "grad_norm": 0.4196470081806183, + "learning_rate": 5.516405614181883e-05, + "loss": 0.9762, + "step": 5406 + }, + { + "epoch": 0.4831237295329149, + "grad_norm": 0.4423119127750397, + "learning_rate": 5.514966259520249e-05, + "loss": 0.9455, + "step": 5407 + }, + { + "epoch": 0.48321308106417676, + "grad_norm": 0.45127880573272705, + "learning_rate": 5.513526861724703e-05, + "loss": 0.994, + "step": 5408 + }, + { + "epoch": 0.4833024325954386, + "grad_norm": 0.3754526674747467, + "learning_rate": 5.512087420915808e-05, + "loss": 0.9263, + "step": 5409 + }, + { + "epoch": 0.4833917841267005, + "grad_norm": 0.4244709014892578, + "learning_rate": 5.510647937214133e-05, + "loss": 1.0555, + "step": 5410 + }, + { + "epoch": 0.48348113565796236, + "grad_norm": 0.4102063477039337, + "learning_rate": 5.509208410740249e-05, + "loss": 0.9678, + "step": 5411 + }, + { + "epoch": 0.4835704871892242, + "grad_norm": 0.4029648005962372, + "learning_rate": 5.5077688416147345e-05, + "loss": 1.0338, + "step": 5412 + }, + { + "epoch": 0.48365983872048607, + "grad_norm": 0.4508571922779083, + "learning_rate": 5.506329229958166e-05, + "loss": 1.035, + "step": 5413 + }, + { + "epoch": 0.48374919025174795, + "grad_norm": 0.38453125953674316, + "learning_rate": 5.504889575891128e-05, + "loss": 1.0721, + "step": 5414 + }, + { + "epoch": 0.4838385417830098, + "grad_norm": 0.4159301221370697, + "learning_rate": 5.503449879534206e-05, + "loss": 0.9116, + "step": 5415 + }, + { + "epoch": 0.48392789331427166, + "grad_norm": 0.5297693610191345, + "learning_rate": 5.5020101410079896e-05, + "loss": 0.9292, + "step": 5416 + }, + { + "epoch": 0.48401724484553355, + "grad_norm": 0.3714810907840729, + "learning_rate": 5.5005703604330736e-05, + "loss": 0.8904, + "step": 5417 + }, + { + "epoch": 0.48410659637679543, + "grad_norm": 0.4217725694179535, + "learning_rate": 5.499130537930055e-05, + "loss": 0.9627, + "step": 5418 + }, + { + "epoch": 0.48419594790805726, + "grad_norm": 0.5042391419410706, + "learning_rate": 5.497690673619532e-05, + "loss": 0.8922, + "step": 5419 + }, + { + "epoch": 0.48428529943931914, + "grad_norm": 0.4610849916934967, + "learning_rate": 5.496250767622111e-05, + "loss": 0.9797, + "step": 5420 + }, + { + "epoch": 0.484374650970581, + "grad_norm": 0.4403340518474579, + "learning_rate": 5.494810820058398e-05, + "loss": 0.9383, + "step": 5421 + }, + { + "epoch": 0.48446400250184285, + "grad_norm": 0.38612040877342224, + "learning_rate": 5.493370831049004e-05, + "loss": 0.9801, + "step": 5422 + }, + { + "epoch": 0.48455335403310473, + "grad_norm": 0.4851481318473816, + "learning_rate": 5.4919308007145465e-05, + "loss": 0.9842, + "step": 5423 + }, + { + "epoch": 0.4846427055643666, + "grad_norm": 0.4784315824508667, + "learning_rate": 5.490490729175638e-05, + "loss": 0.9389, + "step": 5424 + }, + { + "epoch": 0.4847320570956285, + "grad_norm": 0.39346399903297424, + "learning_rate": 5.489050616552905e-05, + "loss": 1.0562, + "step": 5425 + }, + { + "epoch": 0.4848214086268903, + "grad_norm": 0.5939106941223145, + "learning_rate": 5.487610462966969e-05, + "loss": 0.9138, + "step": 5426 + }, + { + "epoch": 0.4849107601581522, + "grad_norm": 0.4027441442012787, + "learning_rate": 5.4861702685384586e-05, + "loss": 1.0134, + "step": 5427 + }, + { + "epoch": 0.4850001116894141, + "grad_norm": 0.5542457699775696, + "learning_rate": 5.484730033388007e-05, + "loss": 0.9148, + "step": 5428 + }, + { + "epoch": 0.4850894632206759, + "grad_norm": 0.40890124440193176, + "learning_rate": 5.483289757636247e-05, + "loss": 1.011, + "step": 5429 + }, + { + "epoch": 0.4851788147519378, + "grad_norm": 0.3923655152320862, + "learning_rate": 5.481849441403819e-05, + "loss": 0.9849, + "step": 5430 + }, + { + "epoch": 0.4852681662831997, + "grad_norm": 0.444996178150177, + "learning_rate": 5.480409084811363e-05, + "loss": 0.9099, + "step": 5431 + }, + { + "epoch": 0.48535751781446157, + "grad_norm": 0.5601452589035034, + "learning_rate": 5.4789686879795263e-05, + "loss": 0.8231, + "step": 5432 + }, + { + "epoch": 0.4854468693457234, + "grad_norm": 0.4075312316417694, + "learning_rate": 5.477528251028957e-05, + "loss": 0.9326, + "step": 5433 + }, + { + "epoch": 0.4855362208769853, + "grad_norm": 0.4870973825454712, + "learning_rate": 5.4760877740803066e-05, + "loss": 0.9654, + "step": 5434 + }, + { + "epoch": 0.48562557240824716, + "grad_norm": 0.5276638269424438, + "learning_rate": 5.47464725725423e-05, + "loss": 0.9561, + "step": 5435 + }, + { + "epoch": 0.485714923939509, + "grad_norm": 0.39273542165756226, + "learning_rate": 5.473206700671386e-05, + "loss": 0.9866, + "step": 5436 + }, + { + "epoch": 0.4858042754707709, + "grad_norm": 0.48121076822280884, + "learning_rate": 5.471766104452436e-05, + "loss": 0.9523, + "step": 5437 + }, + { + "epoch": 0.48589362700203276, + "grad_norm": 0.5641562938690186, + "learning_rate": 5.4703254687180486e-05, + "loss": 0.9552, + "step": 5438 + }, + { + "epoch": 0.48598297853329464, + "grad_norm": 0.5305027961730957, + "learning_rate": 5.468884793588888e-05, + "loss": 0.953, + "step": 5439 + }, + { + "epoch": 0.48607233006455647, + "grad_norm": 0.40917256474494934, + "learning_rate": 5.467444079185629e-05, + "loss": 0.9913, + "step": 5440 + }, + { + "epoch": 0.48616168159581835, + "grad_norm": 0.5030852556228638, + "learning_rate": 5.466003325628945e-05, + "loss": 0.9434, + "step": 5441 + }, + { + "epoch": 0.48625103312708023, + "grad_norm": 0.3704419434070587, + "learning_rate": 5.464562533039515e-05, + "loss": 1.023, + "step": 5442 + }, + { + "epoch": 0.48634038465834206, + "grad_norm": 0.46701717376708984, + "learning_rate": 5.463121701538021e-05, + "loss": 0.8984, + "step": 5443 + }, + { + "epoch": 0.48642973618960395, + "grad_norm": 0.43691638112068176, + "learning_rate": 5.46168083124515e-05, + "loss": 1.0049, + "step": 5444 + }, + { + "epoch": 0.48651908772086583, + "grad_norm": 0.4441946744918823, + "learning_rate": 5.460239922281586e-05, + "loss": 0.9694, + "step": 5445 + }, + { + "epoch": 0.48660843925212766, + "grad_norm": 0.42461204528808594, + "learning_rate": 5.458798974768022e-05, + "loss": 0.9686, + "step": 5446 + }, + { + "epoch": 0.48669779078338954, + "grad_norm": 0.44621115922927856, + "learning_rate": 5.457357988825155e-05, + "loss": 1.0075, + "step": 5447 + }, + { + "epoch": 0.4867871423146514, + "grad_norm": 0.5380200743675232, + "learning_rate": 5.455916964573682e-05, + "loss": 0.8571, + "step": 5448 + }, + { + "epoch": 0.4868764938459133, + "grad_norm": 0.5656547546386719, + "learning_rate": 5.4544759021343016e-05, + "loss": 0.8964, + "step": 5449 + }, + { + "epoch": 0.48696584537717513, + "grad_norm": 0.43907156586647034, + "learning_rate": 5.453034801627721e-05, + "loss": 1.0135, + "step": 5450 + }, + { + "epoch": 0.487055196908437, + "grad_norm": 0.4475903809070587, + "learning_rate": 5.451593663174647e-05, + "loss": 0.8743, + "step": 5451 + }, + { + "epoch": 0.4871445484396989, + "grad_norm": 0.42074811458587646, + "learning_rate": 5.4501524868957886e-05, + "loss": 0.9357, + "step": 5452 + }, + { + "epoch": 0.4872338999709607, + "grad_norm": 0.47343945503234863, + "learning_rate": 5.448711272911864e-05, + "loss": 0.9863, + "step": 5453 + }, + { + "epoch": 0.4873232515022226, + "grad_norm": 0.5782976746559143, + "learning_rate": 5.4472700213435854e-05, + "loss": 0.9197, + "step": 5454 + }, + { + "epoch": 0.4874126030334845, + "grad_norm": 0.36520230770111084, + "learning_rate": 5.4458287323116755e-05, + "loss": 0.9455, + "step": 5455 + }, + { + "epoch": 0.4875019545647464, + "grad_norm": 0.5519242882728577, + "learning_rate": 5.444387405936856e-05, + "loss": 0.8343, + "step": 5456 + }, + { + "epoch": 0.4875913060960082, + "grad_norm": 0.4189887046813965, + "learning_rate": 5.442946042339855e-05, + "loss": 0.9955, + "step": 5457 + }, + { + "epoch": 0.4876806576272701, + "grad_norm": 0.46009689569473267, + "learning_rate": 5.441504641641403e-05, + "loss": 0.9444, + "step": 5458 + }, + { + "epoch": 0.48777000915853197, + "grad_norm": 0.5175769925117493, + "learning_rate": 5.4400632039622293e-05, + "loss": 0.9562, + "step": 5459 + }, + { + "epoch": 0.4878593606897938, + "grad_norm": 0.41818684339523315, + "learning_rate": 5.438621729423072e-05, + "loss": 0.9337, + "step": 5460 + }, + { + "epoch": 0.4879487122210557, + "grad_norm": 0.42660924792289734, + "learning_rate": 5.4371802181446694e-05, + "loss": 1.0022, + "step": 5461 + }, + { + "epoch": 0.48803806375231756, + "grad_norm": 0.4058513939380646, + "learning_rate": 5.4357386702477645e-05, + "loss": 0.99, + "step": 5462 + }, + { + "epoch": 0.48812741528357945, + "grad_norm": 0.43901243805885315, + "learning_rate": 5.434297085853101e-05, + "loss": 0.8644, + "step": 5463 + }, + { + "epoch": 0.4882167668148413, + "grad_norm": 0.410495787858963, + "learning_rate": 5.432855465081427e-05, + "loss": 0.9906, + "step": 5464 + }, + { + "epoch": 0.48830611834610316, + "grad_norm": 0.5046043395996094, + "learning_rate": 5.431413808053492e-05, + "loss": 1.0235, + "step": 5465 + }, + { + "epoch": 0.48839546987736504, + "grad_norm": 0.4128507077693939, + "learning_rate": 5.4299721148900554e-05, + "loss": 1.0187, + "step": 5466 + }, + { + "epoch": 0.48848482140862687, + "grad_norm": 0.4370626211166382, + "learning_rate": 5.4285303857118685e-05, + "loss": 0.935, + "step": 5467 + }, + { + "epoch": 0.48857417293988875, + "grad_norm": 0.41134530305862427, + "learning_rate": 5.4270886206396956e-05, + "loss": 0.9916, + "step": 5468 + }, + { + "epoch": 0.48866352447115063, + "grad_norm": 0.6058627367019653, + "learning_rate": 5.4256468197942964e-05, + "loss": 0.9549, + "step": 5469 + }, + { + "epoch": 0.4887528760024125, + "grad_norm": 0.47504645586013794, + "learning_rate": 5.42420498329644e-05, + "loss": 1.0599, + "step": 5470 + }, + { + "epoch": 0.48884222753367434, + "grad_norm": 0.42550885677337646, + "learning_rate": 5.4227631112668955e-05, + "loss": 0.9998, + "step": 5471 + }, + { + "epoch": 0.48893157906493623, + "grad_norm": 0.4544508755207062, + "learning_rate": 5.4213212038264325e-05, + "loss": 0.9259, + "step": 5472 + }, + { + "epoch": 0.4890209305961981, + "grad_norm": 0.41492947936058044, + "learning_rate": 5.419879261095829e-05, + "loss": 0.9139, + "step": 5473 + }, + { + "epoch": 0.48911028212745994, + "grad_norm": 0.3988552391529083, + "learning_rate": 5.418437283195862e-05, + "loss": 1.0173, + "step": 5474 + }, + { + "epoch": 0.4891996336587218, + "grad_norm": 0.48259106278419495, + "learning_rate": 5.4169952702473114e-05, + "loss": 0.9367, + "step": 5475 + }, + { + "epoch": 0.4892889851899837, + "grad_norm": 0.38662248849868774, + "learning_rate": 5.415553222370963e-05, + "loss": 1.0017, + "step": 5476 + }, + { + "epoch": 0.48937833672124553, + "grad_norm": 0.4289376139640808, + "learning_rate": 5.414111139687601e-05, + "loss": 0.9943, + "step": 5477 + }, + { + "epoch": 0.4894676882525074, + "grad_norm": 0.42582669854164124, + "learning_rate": 5.4126690223180175e-05, + "loss": 1.0044, + "step": 5478 + }, + { + "epoch": 0.4895570397837693, + "grad_norm": 0.3854537606239319, + "learning_rate": 5.411226870383006e-05, + "loss": 0.9932, + "step": 5479 + }, + { + "epoch": 0.4896463913150312, + "grad_norm": 0.41988447308540344, + "learning_rate": 5.4097846840033604e-05, + "loss": 0.9692, + "step": 5480 + }, + { + "epoch": 0.489735742846293, + "grad_norm": 0.424517959356308, + "learning_rate": 5.408342463299878e-05, + "loss": 0.9933, + "step": 5481 + }, + { + "epoch": 0.4898250943775549, + "grad_norm": 0.524263858795166, + "learning_rate": 5.4069002083933625e-05, + "loss": 0.921, + "step": 5482 + }, + { + "epoch": 0.4899144459088168, + "grad_norm": 0.5599950551986694, + "learning_rate": 5.405457919404619e-05, + "loss": 0.9442, + "step": 5483 + }, + { + "epoch": 0.4900037974400786, + "grad_norm": 0.3951818645000458, + "learning_rate": 5.4040155964544506e-05, + "loss": 0.9275, + "step": 5484 + }, + { + "epoch": 0.4900931489713405, + "grad_norm": 0.5091139674186707, + "learning_rate": 5.40257323966367e-05, + "loss": 1.0093, + "step": 5485 + }, + { + "epoch": 0.49018250050260237, + "grad_norm": 0.46656838059425354, + "learning_rate": 5.40113084915309e-05, + "loss": 0.9953, + "step": 5486 + }, + { + "epoch": 0.49027185203386425, + "grad_norm": 0.41893553733825684, + "learning_rate": 5.399688425043524e-05, + "loss": 0.9857, + "step": 5487 + }, + { + "epoch": 0.4903612035651261, + "grad_norm": 0.5395665764808655, + "learning_rate": 5.398245967455795e-05, + "loss": 0.9104, + "step": 5488 + }, + { + "epoch": 0.49045055509638796, + "grad_norm": 0.5200545191764832, + "learning_rate": 5.396803476510719e-05, + "loss": 0.8928, + "step": 5489 + }, + { + "epoch": 0.49053990662764985, + "grad_norm": 0.5178142786026001, + "learning_rate": 5.3953609523291225e-05, + "loss": 0.9587, + "step": 5490 + }, + { + "epoch": 0.4906292581589117, + "grad_norm": 0.46901631355285645, + "learning_rate": 5.3939183950318316e-05, + "loss": 0.9875, + "step": 5491 + }, + { + "epoch": 0.49071860969017356, + "grad_norm": 0.3886154294013977, + "learning_rate": 5.392475804739677e-05, + "loss": 0.9903, + "step": 5492 + }, + { + "epoch": 0.49080796122143544, + "grad_norm": 0.4371306598186493, + "learning_rate": 5.391033181573491e-05, + "loss": 0.9537, + "step": 5493 + }, + { + "epoch": 0.4908973127526973, + "grad_norm": 0.5079873204231262, + "learning_rate": 5.3895905256541066e-05, + "loss": 0.934, + "step": 5494 + }, + { + "epoch": 0.49098666428395915, + "grad_norm": 0.46429815888404846, + "learning_rate": 5.3881478371023655e-05, + "loss": 0.9762, + "step": 5495 + }, + { + "epoch": 0.49107601581522103, + "grad_norm": 0.44685813784599304, + "learning_rate": 5.3867051160391025e-05, + "loss": 1.0016, + "step": 5496 + }, + { + "epoch": 0.4911653673464829, + "grad_norm": 0.48742127418518066, + "learning_rate": 5.385262362585165e-05, + "loss": 0.8865, + "step": 5497 + }, + { + "epoch": 0.49125471887774474, + "grad_norm": 0.410982221364975, + "learning_rate": 5.3838195768613995e-05, + "loss": 0.9393, + "step": 5498 + }, + { + "epoch": 0.49134407040900663, + "grad_norm": 0.4197956919670105, + "learning_rate": 5.382376758988652e-05, + "loss": 0.9513, + "step": 5499 + }, + { + "epoch": 0.4914334219402685, + "grad_norm": 0.48656147718429565, + "learning_rate": 5.3809339090877745e-05, + "loss": 0.8835, + "step": 5500 + }, + { + "epoch": 0.4915227734715304, + "grad_norm": 0.43064260482788086, + "learning_rate": 5.379491027279622e-05, + "loss": 1.0597, + "step": 5501 + }, + { + "epoch": 0.4916121250027922, + "grad_norm": 0.39117228984832764, + "learning_rate": 5.3780481136850505e-05, + "loss": 0.9828, + "step": 5502 + }, + { + "epoch": 0.4917014765340541, + "grad_norm": 0.5010277032852173, + "learning_rate": 5.37660516842492e-05, + "loss": 1.0373, + "step": 5503 + }, + { + "epoch": 0.491790828065316, + "grad_norm": 0.367341011762619, + "learning_rate": 5.375162191620093e-05, + "loss": 0.9854, + "step": 5504 + }, + { + "epoch": 0.4918801795965778, + "grad_norm": 0.5271230340003967, + "learning_rate": 5.373719183391434e-05, + "loss": 0.8852, + "step": 5505 + }, + { + "epoch": 0.4919695311278397, + "grad_norm": 0.4072398543357849, + "learning_rate": 5.372276143859809e-05, + "loss": 0.9513, + "step": 5506 + }, + { + "epoch": 0.4920588826591016, + "grad_norm": 0.4533540606498718, + "learning_rate": 5.3708330731460876e-05, + "loss": 0.9483, + "step": 5507 + }, + { + "epoch": 0.49214823419036346, + "grad_norm": 0.48236024379730225, + "learning_rate": 5.369389971371145e-05, + "loss": 1.027, + "step": 5508 + }, + { + "epoch": 0.4922375857216253, + "grad_norm": 0.4861671030521393, + "learning_rate": 5.367946838655855e-05, + "loss": 0.905, + "step": 5509 + }, + { + "epoch": 0.4923269372528872, + "grad_norm": 0.5689902305603027, + "learning_rate": 5.366503675121095e-05, + "loss": 0.9575, + "step": 5510 + }, + { + "epoch": 0.49241628878414906, + "grad_norm": 0.5556491613388062, + "learning_rate": 5.3650604808877456e-05, + "loss": 0.9894, + "step": 5511 + }, + { + "epoch": 0.4925056403154109, + "grad_norm": 0.3819589912891388, + "learning_rate": 5.36361725607669e-05, + "loss": 0.9288, + "step": 5512 + }, + { + "epoch": 0.49259499184667277, + "grad_norm": 0.4342455267906189, + "learning_rate": 5.3621740008088126e-05, + "loss": 1.0173, + "step": 5513 + }, + { + "epoch": 0.49268434337793465, + "grad_norm": 0.5042043924331665, + "learning_rate": 5.360730715205002e-05, + "loss": 1.0367, + "step": 5514 + }, + { + "epoch": 0.4927736949091965, + "grad_norm": 0.4823912978172302, + "learning_rate": 5.35928739938615e-05, + "loss": 0.9668, + "step": 5515 + }, + { + "epoch": 0.49286304644045836, + "grad_norm": 0.4222562313079834, + "learning_rate": 5.357844053473148e-05, + "loss": 0.9825, + "step": 5516 + }, + { + "epoch": 0.49295239797172025, + "grad_norm": 0.48596012592315674, + "learning_rate": 5.356400677586891e-05, + "loss": 0.9142, + "step": 5517 + }, + { + "epoch": 0.49304174950298213, + "grad_norm": 0.4577832818031311, + "learning_rate": 5.3549572718482785e-05, + "loss": 0.9555, + "step": 5518 + }, + { + "epoch": 0.49313110103424396, + "grad_norm": 0.4585660696029663, + "learning_rate": 5.353513836378213e-05, + "loss": 0.9421, + "step": 5519 + }, + { + "epoch": 0.49322045256550584, + "grad_norm": 0.44608667492866516, + "learning_rate": 5.352070371297594e-05, + "loss": 1.0034, + "step": 5520 + }, + { + "epoch": 0.4933098040967677, + "grad_norm": 0.4560176432132721, + "learning_rate": 5.350626876727328e-05, + "loss": 0.9055, + "step": 5521 + }, + { + "epoch": 0.49339915562802955, + "grad_norm": 0.4542362689971924, + "learning_rate": 5.349183352788325e-05, + "loss": 0.976, + "step": 5522 + }, + { + "epoch": 0.49348850715929143, + "grad_norm": 0.4418618679046631, + "learning_rate": 5.347739799601493e-05, + "loss": 0.9314, + "step": 5523 + }, + { + "epoch": 0.4935778586905533, + "grad_norm": 0.4947437345981598, + "learning_rate": 5.3462962172877475e-05, + "loss": 0.9936, + "step": 5524 + }, + { + "epoch": 0.4936672102218152, + "grad_norm": 0.47864702343940735, + "learning_rate": 5.3448526059680016e-05, + "loss": 0.9143, + "step": 5525 + }, + { + "epoch": 0.49375656175307703, + "grad_norm": 0.40519431233406067, + "learning_rate": 5.343408965763174e-05, + "loss": 0.9579, + "step": 5526 + }, + { + "epoch": 0.4938459132843389, + "grad_norm": 0.4214836061000824, + "learning_rate": 5.341965296794185e-05, + "loss": 1.0252, + "step": 5527 + }, + { + "epoch": 0.4939352648156008, + "grad_norm": 0.4515914022922516, + "learning_rate": 5.340521599181959e-05, + "loss": 0.9455, + "step": 5528 + }, + { + "epoch": 0.4940246163468626, + "grad_norm": 0.39432477951049805, + "learning_rate": 5.339077873047418e-05, + "loss": 0.9309, + "step": 5529 + }, + { + "epoch": 0.4941139678781245, + "grad_norm": 0.4477103054523468, + "learning_rate": 5.337634118511491e-05, + "loss": 0.9542, + "step": 5530 + }, + { + "epoch": 0.4942033194093864, + "grad_norm": 0.48979735374450684, + "learning_rate": 5.336190335695107e-05, + "loss": 0.9266, + "step": 5531 + }, + { + "epoch": 0.49429267094064827, + "grad_norm": 0.470674991607666, + "learning_rate": 5.3347465247192e-05, + "loss": 0.9846, + "step": 5532 + }, + { + "epoch": 0.4943820224719101, + "grad_norm": 0.5746048092842102, + "learning_rate": 5.333302685704702e-05, + "loss": 0.9958, + "step": 5533 + }, + { + "epoch": 0.494471374003172, + "grad_norm": 0.44571205973625183, + "learning_rate": 5.331858818772553e-05, + "loss": 0.9015, + "step": 5534 + }, + { + "epoch": 0.49456072553443386, + "grad_norm": 0.39946070313453674, + "learning_rate": 5.330414924043689e-05, + "loss": 1.0127, + "step": 5535 + }, + { + "epoch": 0.4946500770656957, + "grad_norm": 0.3653550148010254, + "learning_rate": 5.3289710016390535e-05, + "loss": 0.9527, + "step": 5536 + }, + { + "epoch": 0.4947394285969576, + "grad_norm": 0.4395412802696228, + "learning_rate": 5.327527051679591e-05, + "loss": 0.9398, + "step": 5537 + }, + { + "epoch": 0.49482878012821946, + "grad_norm": 0.5096445679664612, + "learning_rate": 5.3260830742862456e-05, + "loss": 0.9331, + "step": 5538 + }, + { + "epoch": 0.49491813165948134, + "grad_norm": 0.5146742463111877, + "learning_rate": 5.324639069579968e-05, + "loss": 0.9474, + "step": 5539 + }, + { + "epoch": 0.49500748319074317, + "grad_norm": 0.41445231437683105, + "learning_rate": 5.3231950376817065e-05, + "loss": 0.9408, + "step": 5540 + }, + { + "epoch": 0.49509683472200505, + "grad_norm": 0.4815199077129364, + "learning_rate": 5.3217509787124165e-05, + "loss": 0.9785, + "step": 5541 + }, + { + "epoch": 0.49518618625326694, + "grad_norm": 0.43417826294898987, + "learning_rate": 5.320306892793052e-05, + "loss": 0.9017, + "step": 5542 + }, + { + "epoch": 0.49527553778452876, + "grad_norm": 0.4291759133338928, + "learning_rate": 5.318862780044571e-05, + "loss": 0.9912, + "step": 5543 + }, + { + "epoch": 0.49536488931579065, + "grad_norm": 0.41811853647232056, + "learning_rate": 5.317418640587934e-05, + "loss": 0.9695, + "step": 5544 + }, + { + "epoch": 0.49545424084705253, + "grad_norm": 0.4489332139492035, + "learning_rate": 5.315974474544102e-05, + "loss": 0.9603, + "step": 5545 + }, + { + "epoch": 0.49554359237831436, + "grad_norm": 0.4664028584957123, + "learning_rate": 5.3145302820340404e-05, + "loss": 0.9628, + "step": 5546 + }, + { + "epoch": 0.49563294390957624, + "grad_norm": 0.5637416243553162, + "learning_rate": 5.313086063178715e-05, + "loss": 1.0238, + "step": 5547 + }, + { + "epoch": 0.4957222954408381, + "grad_norm": 0.4180266559123993, + "learning_rate": 5.311641818099093e-05, + "loss": 0.9883, + "step": 5548 + }, + { + "epoch": 0.4958116469721, + "grad_norm": 0.3877975344657898, + "learning_rate": 5.310197546916149e-05, + "loss": 0.93, + "step": 5549 + }, + { + "epoch": 0.49590099850336183, + "grad_norm": 0.3823062777519226, + "learning_rate": 5.308753249750853e-05, + "loss": 1.0135, + "step": 5550 + }, + { + "epoch": 0.4959903500346237, + "grad_norm": 0.4974936246871948, + "learning_rate": 5.3073089267241804e-05, + "loss": 0.9422, + "step": 5551 + }, + { + "epoch": 0.4960797015658856, + "grad_norm": 0.44719165563583374, + "learning_rate": 5.305864577957111e-05, + "loss": 0.9379, + "step": 5552 + }, + { + "epoch": 0.49616905309714743, + "grad_norm": 0.4038889706134796, + "learning_rate": 5.30442020357062e-05, + "loss": 0.9951, + "step": 5553 + }, + { + "epoch": 0.4962584046284093, + "grad_norm": 0.497601181268692, + "learning_rate": 5.3029758036856955e-05, + "loss": 0.9611, + "step": 5554 + }, + { + "epoch": 0.4963477561596712, + "grad_norm": 0.4878460764884949, + "learning_rate": 5.301531378423317e-05, + "loss": 0.9078, + "step": 5555 + }, + { + "epoch": 0.4964371076909331, + "grad_norm": 0.5067799687385559, + "learning_rate": 5.300086927904471e-05, + "loss": 0.9569, + "step": 5556 + }, + { + "epoch": 0.4965264592221949, + "grad_norm": 0.3875921368598938, + "learning_rate": 5.298642452250145e-05, + "loss": 0.9997, + "step": 5557 + }, + { + "epoch": 0.4966158107534568, + "grad_norm": 0.3980264365673065, + "learning_rate": 5.297197951581332e-05, + "loss": 1.0059, + "step": 5558 + }, + { + "epoch": 0.49670516228471867, + "grad_norm": 0.42988121509552, + "learning_rate": 5.295753426019022e-05, + "loss": 0.9343, + "step": 5559 + }, + { + "epoch": 0.4967945138159805, + "grad_norm": 0.4407704174518585, + "learning_rate": 5.294308875684211e-05, + "loss": 0.9433, + "step": 5560 + }, + { + "epoch": 0.4968838653472424, + "grad_norm": 0.4002476930618286, + "learning_rate": 5.2928643006978926e-05, + "loss": 1.0739, + "step": 5561 + }, + { + "epoch": 0.49697321687850426, + "grad_norm": 0.44479769468307495, + "learning_rate": 5.291419701181068e-05, + "loss": 0.9789, + "step": 5562 + }, + { + "epoch": 0.49706256840976615, + "grad_norm": 0.4247623383998871, + "learning_rate": 5.2899750772547385e-05, + "loss": 0.9605, + "step": 5563 + }, + { + "epoch": 0.497151919941028, + "grad_norm": 0.4519825279712677, + "learning_rate": 5.288530429039904e-05, + "loss": 0.9244, + "step": 5564 + }, + { + "epoch": 0.49724127147228986, + "grad_norm": 0.42883196473121643, + "learning_rate": 5.287085756657573e-05, + "loss": 0.955, + "step": 5565 + }, + { + "epoch": 0.49733062300355174, + "grad_norm": 0.3723878562450409, + "learning_rate": 5.2856410602287465e-05, + "loss": 0.9387, + "step": 5566 + }, + { + "epoch": 0.49741997453481357, + "grad_norm": 0.4619128704071045, + "learning_rate": 5.284196339874438e-05, + "loss": 0.9301, + "step": 5567 + }, + { + "epoch": 0.49750932606607545, + "grad_norm": 0.42548492550849915, + "learning_rate": 5.282751595715656e-05, + "loss": 1.0291, + "step": 5568 + }, + { + "epoch": 0.49759867759733734, + "grad_norm": 0.43272992968559265, + "learning_rate": 5.281306827873416e-05, + "loss": 1.0186, + "step": 5569 + }, + { + "epoch": 0.4976880291285992, + "grad_norm": 0.4682878851890564, + "learning_rate": 5.279862036468728e-05, + "loss": 0.9629, + "step": 5570 + }, + { + "epoch": 0.49777738065986105, + "grad_norm": 0.5456759929656982, + "learning_rate": 5.2784172216226124e-05, + "loss": 0.8809, + "step": 5571 + }, + { + "epoch": 0.49786673219112293, + "grad_norm": 0.38989564776420593, + "learning_rate": 5.2769723834560855e-05, + "loss": 0.9556, + "step": 5572 + }, + { + "epoch": 0.4979560837223848, + "grad_norm": 0.4226441979408264, + "learning_rate": 5.275527522090171e-05, + "loss": 0.9417, + "step": 5573 + }, + { + "epoch": 0.49804543525364664, + "grad_norm": 0.4417705833911896, + "learning_rate": 5.2740826376458894e-05, + "loss": 0.9943, + "step": 5574 + }, + { + "epoch": 0.4981347867849085, + "grad_norm": 0.4658608138561249, + "learning_rate": 5.2726377302442656e-05, + "loss": 0.907, + "step": 5575 + }, + { + "epoch": 0.4982241383161704, + "grad_norm": 0.516185998916626, + "learning_rate": 5.2711928000063245e-05, + "loss": 0.9793, + "step": 5576 + }, + { + "epoch": 0.49831348984743223, + "grad_norm": 0.4596862494945526, + "learning_rate": 5.269747847053096e-05, + "loss": 0.9666, + "step": 5577 + }, + { + "epoch": 0.4984028413786941, + "grad_norm": 0.43760859966278076, + "learning_rate": 5.268302871505611e-05, + "loss": 0.9403, + "step": 5578 + }, + { + "epoch": 0.498492192909956, + "grad_norm": 0.4713875949382782, + "learning_rate": 5.2668578734849e-05, + "loss": 0.867, + "step": 5579 + }, + { + "epoch": 0.4985815444412179, + "grad_norm": 0.4082961678504944, + "learning_rate": 5.265412853111997e-05, + "loss": 1.002, + "step": 5580 + }, + { + "epoch": 0.4986708959724797, + "grad_norm": 0.4304913282394409, + "learning_rate": 5.2639678105079394e-05, + "loss": 0.9176, + "step": 5581 + }, + { + "epoch": 0.4987602475037416, + "grad_norm": 0.5440099835395813, + "learning_rate": 5.262522745793764e-05, + "loss": 0.8735, + "step": 5582 + }, + { + "epoch": 0.4988495990350035, + "grad_norm": 0.43295422196388245, + "learning_rate": 5.26107765909051e-05, + "loss": 0.9699, + "step": 5583 + }, + { + "epoch": 0.4989389505662653, + "grad_norm": 0.5452165603637695, + "learning_rate": 5.25963255051922e-05, + "loss": 1.0142, + "step": 5584 + }, + { + "epoch": 0.4990283020975272, + "grad_norm": 0.4139503240585327, + "learning_rate": 5.258187420200935e-05, + "loss": 0.9684, + "step": 5585 + }, + { + "epoch": 0.49911765362878907, + "grad_norm": 0.4647497832775116, + "learning_rate": 5.256742268256703e-05, + "loss": 0.9009, + "step": 5586 + }, + { + "epoch": 0.49920700516005095, + "grad_norm": 0.41619575023651123, + "learning_rate": 5.255297094807568e-05, + "loss": 0.9714, + "step": 5587 + }, + { + "epoch": 0.4992963566913128, + "grad_norm": 0.46064916253089905, + "learning_rate": 5.25385189997458e-05, + "loss": 0.9339, + "step": 5588 + }, + { + "epoch": 0.49938570822257466, + "grad_norm": 0.46546754240989685, + "learning_rate": 5.252406683878791e-05, + "loss": 0.9525, + "step": 5589 + }, + { + "epoch": 0.49947505975383655, + "grad_norm": 0.5033959150314331, + "learning_rate": 5.250961446641251e-05, + "loss": 0.9203, + "step": 5590 + }, + { + "epoch": 0.4995644112850984, + "grad_norm": 0.44631263613700867, + "learning_rate": 5.249516188383014e-05, + "loss": 0.9998, + "step": 5591 + }, + { + "epoch": 0.49965376281636026, + "grad_norm": 0.4394984543323517, + "learning_rate": 5.248070909225136e-05, + "loss": 0.9694, + "step": 5592 + }, + { + "epoch": 0.49974311434762214, + "grad_norm": 0.5696101784706116, + "learning_rate": 5.2466256092886754e-05, + "loss": 0.93, + "step": 5593 + }, + { + "epoch": 0.499832465878884, + "grad_norm": 0.4666324555873871, + "learning_rate": 5.245180288694692e-05, + "loss": 0.8524, + "step": 5594 + }, + { + "epoch": 0.49992181741014585, + "grad_norm": 0.5041062235832214, + "learning_rate": 5.243734947564247e-05, + "loss": 0.9306, + "step": 5595 + }, + { + "epoch": 0.5000111689414077, + "grad_norm": 0.4425108730792999, + "learning_rate": 5.242289586018401e-05, + "loss": 0.9774, + "step": 5596 + }, + { + "epoch": 0.5001005204726696, + "grad_norm": 0.38095951080322266, + "learning_rate": 5.240844204178219e-05, + "loss": 1.0063, + "step": 5597 + }, + { + "epoch": 0.5001898720039315, + "grad_norm": 0.4874595105648041, + "learning_rate": 5.23939880216477e-05, + "loss": 0.8665, + "step": 5598 + }, + { + "epoch": 0.5002792235351934, + "grad_norm": 0.4805736839771271, + "learning_rate": 5.237953380099116e-05, + "loss": 0.9152, + "step": 5599 + }, + { + "epoch": 0.5003685750664552, + "grad_norm": 0.60560142993927, + "learning_rate": 5.236507938102334e-05, + "loss": 0.8751, + "step": 5600 + }, + { + "epoch": 0.500457926597717, + "grad_norm": 0.37698978185653687, + "learning_rate": 5.2350624762954884e-05, + "loss": 0.9901, + "step": 5601 + }, + { + "epoch": 0.5005472781289789, + "grad_norm": 0.5520381331443787, + "learning_rate": 5.233616994799655e-05, + "loss": 0.9581, + "step": 5602 + }, + { + "epoch": 0.5006366296602408, + "grad_norm": 0.4423448443412781, + "learning_rate": 5.232171493735909e-05, + "loss": 0.936, + "step": 5603 + }, + { + "epoch": 0.5007259811915027, + "grad_norm": 0.4027252495288849, + "learning_rate": 5.230725973225324e-05, + "loss": 1.0058, + "step": 5604 + }, + { + "epoch": 0.5008153327227646, + "grad_norm": 0.39916396141052246, + "learning_rate": 5.229280433388982e-05, + "loss": 1.1366, + "step": 5605 + }, + { + "epoch": 0.5009046842540265, + "grad_norm": 0.49851128458976746, + "learning_rate": 5.227834874347958e-05, + "loss": 0.8975, + "step": 5606 + }, + { + "epoch": 0.5009940357852882, + "grad_norm": 0.37986597418785095, + "learning_rate": 5.226389296223335e-05, + "loss": 1.0004, + "step": 5607 + }, + { + "epoch": 0.5010833873165501, + "grad_norm": 0.41239768266677856, + "learning_rate": 5.224943699136195e-05, + "loss": 0.9784, + "step": 5608 + }, + { + "epoch": 0.501172738847812, + "grad_norm": 0.40920108556747437, + "learning_rate": 5.223498083207622e-05, + "loss": 0.9932, + "step": 5609 + }, + { + "epoch": 0.5012620903790739, + "grad_norm": 0.4581597149372101, + "learning_rate": 5.222052448558704e-05, + "loss": 0.914, + "step": 5610 + }, + { + "epoch": 0.5013514419103358, + "grad_norm": 0.39035099744796753, + "learning_rate": 5.2206067953105255e-05, + "loss": 0.992, + "step": 5611 + }, + { + "epoch": 0.5014407934415976, + "grad_norm": 0.4067721962928772, + "learning_rate": 5.219161123584176e-05, + "loss": 0.9821, + "step": 5612 + }, + { + "epoch": 0.5015301449728595, + "grad_norm": 0.5137872695922852, + "learning_rate": 5.217715433500747e-05, + "loss": 0.9398, + "step": 5613 + }, + { + "epoch": 0.5016194965041213, + "grad_norm": 0.5111674070358276, + "learning_rate": 5.21626972518133e-05, + "loss": 0.9266, + "step": 5614 + }, + { + "epoch": 0.5017088480353832, + "grad_norm": 0.41246017813682556, + "learning_rate": 5.214823998747017e-05, + "loss": 0.965, + "step": 5615 + }, + { + "epoch": 0.5017981995666451, + "grad_norm": 0.48276659846305847, + "learning_rate": 5.213378254318906e-05, + "loss": 0.9673, + "step": 5616 + }, + { + "epoch": 0.501887551097907, + "grad_norm": 0.48614341020584106, + "learning_rate": 5.211932492018091e-05, + "loss": 0.874, + "step": 5617 + }, + { + "epoch": 0.5019769026291688, + "grad_norm": 0.42692437767982483, + "learning_rate": 5.21048671196567e-05, + "loss": 0.9371, + "step": 5618 + }, + { + "epoch": 0.5020662541604307, + "grad_norm": 0.41222038865089417, + "learning_rate": 5.209040914282744e-05, + "loss": 0.9588, + "step": 5619 + }, + { + "epoch": 0.5021556056916925, + "grad_norm": 0.4768015146255493, + "learning_rate": 5.2075950990904133e-05, + "loss": 0.943, + "step": 5620 + }, + { + "epoch": 0.5022449572229544, + "grad_norm": 0.37785643339157104, + "learning_rate": 5.20614926650978e-05, + "loss": 0.9968, + "step": 5621 + }, + { + "epoch": 0.5023343087542163, + "grad_norm": 0.4530482590198517, + "learning_rate": 5.2047034166619476e-05, + "loss": 0.9125, + "step": 5622 + }, + { + "epoch": 0.5024236602854781, + "grad_norm": 0.4255998432636261, + "learning_rate": 5.2032575496680224e-05, + "loss": 0.9464, + "step": 5623 + }, + { + "epoch": 0.50251301181674, + "grad_norm": 0.41948202252388, + "learning_rate": 5.201811665649111e-05, + "loss": 0.9442, + "step": 5624 + }, + { + "epoch": 0.5026023633480019, + "grad_norm": 0.38996177911758423, + "learning_rate": 5.200365764726323e-05, + "loss": 0.9536, + "step": 5625 + }, + { + "epoch": 0.5026917148792638, + "grad_norm": 0.4734954833984375, + "learning_rate": 5.198919847020765e-05, + "loss": 0.9781, + "step": 5626 + }, + { + "epoch": 0.5027810664105256, + "grad_norm": 0.4030005633831024, + "learning_rate": 5.197473912653549e-05, + "loss": 0.9611, + "step": 5627 + }, + { + "epoch": 0.5028704179417874, + "grad_norm": 0.4445195496082306, + "learning_rate": 5.1960279617457874e-05, + "loss": 0.9721, + "step": 5628 + }, + { + "epoch": 0.5029597694730493, + "grad_norm": 0.3933766186237335, + "learning_rate": 5.1945819944185944e-05, + "loss": 0.9375, + "step": 5629 + }, + { + "epoch": 0.5030491210043112, + "grad_norm": 0.40946388244628906, + "learning_rate": 5.193136010793088e-05, + "loss": 1.0194, + "step": 5630 + }, + { + "epoch": 0.5031384725355731, + "grad_norm": 0.49432265758514404, + "learning_rate": 5.1916900109903796e-05, + "loss": 0.9872, + "step": 5631 + }, + { + "epoch": 0.503227824066835, + "grad_norm": 0.5955692529678345, + "learning_rate": 5.190243995131591e-05, + "loss": 0.9548, + "step": 5632 + }, + { + "epoch": 0.5033171755980969, + "grad_norm": 0.47443050146102905, + "learning_rate": 5.188797963337839e-05, + "loss": 0.9318, + "step": 5633 + }, + { + "epoch": 0.5034065271293586, + "grad_norm": 0.5481427907943726, + "learning_rate": 5.187351915730245e-05, + "loss": 0.8878, + "step": 5634 + }, + { + "epoch": 0.5034958786606205, + "grad_norm": 0.5294390320777893, + "learning_rate": 5.185905852429933e-05, + "loss": 0.8749, + "step": 5635 + }, + { + "epoch": 0.5035852301918824, + "grad_norm": 0.4480739235877991, + "learning_rate": 5.184459773558022e-05, + "loss": 1.0006, + "step": 5636 + }, + { + "epoch": 0.5036745817231443, + "grad_norm": 0.4594273865222931, + "learning_rate": 5.183013679235639e-05, + "loss": 1.0186, + "step": 5637 + }, + { + "epoch": 0.5037639332544062, + "grad_norm": 0.4672705829143524, + "learning_rate": 5.1815675695839095e-05, + "loss": 0.8973, + "step": 5638 + }, + { + "epoch": 0.503853284785668, + "grad_norm": 0.405490517616272, + "learning_rate": 5.180121444723961e-05, + "loss": 0.9966, + "step": 5639 + }, + { + "epoch": 0.5039426363169299, + "grad_norm": 0.4211828410625458, + "learning_rate": 5.178675304776923e-05, + "loss": 0.9086, + "step": 5640 + }, + { + "epoch": 0.5040319878481917, + "grad_norm": 0.4052455723285675, + "learning_rate": 5.177229149863922e-05, + "loss": 0.9668, + "step": 5641 + }, + { + "epoch": 0.5041213393794536, + "grad_norm": 0.4231189787387848, + "learning_rate": 5.1757829801060905e-05, + "loss": 0.9387, + "step": 5642 + }, + { + "epoch": 0.5042106909107155, + "grad_norm": 0.48749008774757385, + "learning_rate": 5.1743367956245614e-05, + "loss": 0.9537, + "step": 5643 + }, + { + "epoch": 0.5043000424419773, + "grad_norm": 0.45534324645996094, + "learning_rate": 5.172890596540466e-05, + "loss": 0.9535, + "step": 5644 + }, + { + "epoch": 0.5043893939732392, + "grad_norm": 0.39456576108932495, + "learning_rate": 5.171444382974943e-05, + "loss": 1.0038, + "step": 5645 + }, + { + "epoch": 0.5044787455045011, + "grad_norm": 0.4487841725349426, + "learning_rate": 5.169998155049123e-05, + "loss": 0.9386, + "step": 5646 + }, + { + "epoch": 0.504568097035763, + "grad_norm": 0.4985732138156891, + "learning_rate": 5.168551912884147e-05, + "loss": 0.9678, + "step": 5647 + }, + { + "epoch": 0.5046574485670248, + "grad_norm": 0.42822030186653137, + "learning_rate": 5.16710565660115e-05, + "loss": 1.0048, + "step": 5648 + }, + { + "epoch": 0.5047468000982867, + "grad_norm": 0.4911450445652008, + "learning_rate": 5.165659386321273e-05, + "loss": 0.9387, + "step": 5649 + }, + { + "epoch": 0.5048361516295485, + "grad_norm": 0.43765896558761597, + "learning_rate": 5.164213102165659e-05, + "loss": 0.9642, + "step": 5650 + }, + { + "epoch": 0.5049255031608104, + "grad_norm": 0.4351087212562561, + "learning_rate": 5.1627668042554454e-05, + "loss": 0.9759, + "step": 5651 + }, + { + "epoch": 0.5050148546920723, + "grad_norm": 0.49687519669532776, + "learning_rate": 5.1613204927117785e-05, + "loss": 0.889, + "step": 5652 + }, + { + "epoch": 0.5051042062233342, + "grad_norm": 0.4558013379573822, + "learning_rate": 5.1598741676557995e-05, + "loss": 1.0162, + "step": 5653 + }, + { + "epoch": 0.5051935577545961, + "grad_norm": 0.5322263240814209, + "learning_rate": 5.158427829208655e-05, + "loss": 0.929, + "step": 5654 + }, + { + "epoch": 0.5052829092858578, + "grad_norm": 0.4086034893989563, + "learning_rate": 5.1569814774914916e-05, + "loss": 0.9369, + "step": 5655 + }, + { + "epoch": 0.5053722608171197, + "grad_norm": 0.44374528527259827, + "learning_rate": 5.155535112625456e-05, + "loss": 0.9484, + "step": 5656 + }, + { + "epoch": 0.5054616123483816, + "grad_norm": 0.495838463306427, + "learning_rate": 5.154088734731696e-05, + "loss": 0.9707, + "step": 5657 + }, + { + "epoch": 0.5055509638796435, + "grad_norm": 0.44730693101882935, + "learning_rate": 5.152642343931362e-05, + "loss": 0.956, + "step": 5658 + }, + { + "epoch": 0.5056403154109054, + "grad_norm": 0.4501069188117981, + "learning_rate": 5.1511959403456055e-05, + "loss": 0.9622, + "step": 5659 + }, + { + "epoch": 0.5057296669421673, + "grad_norm": 0.41660791635513306, + "learning_rate": 5.149749524095577e-05, + "loss": 0.9941, + "step": 5660 + }, + { + "epoch": 0.5058190184734291, + "grad_norm": 0.4883604347705841, + "learning_rate": 5.1483030953024305e-05, + "loss": 1.0128, + "step": 5661 + }, + { + "epoch": 0.5059083700046909, + "grad_norm": 0.4125032424926758, + "learning_rate": 5.146856654087318e-05, + "loss": 1.0034, + "step": 5662 + }, + { + "epoch": 0.5059977215359528, + "grad_norm": 0.4337124824523926, + "learning_rate": 5.145410200571395e-05, + "loss": 0.9983, + "step": 5663 + }, + { + "epoch": 0.5060870730672147, + "grad_norm": 0.42638298869132996, + "learning_rate": 5.1439637348758175e-05, + "loss": 0.9504, + "step": 5664 + }, + { + "epoch": 0.5061764245984766, + "grad_norm": 0.5545623302459717, + "learning_rate": 5.142517257121744e-05, + "loss": 0.9618, + "step": 5665 + }, + { + "epoch": 0.5062657761297384, + "grad_norm": 0.3784050941467285, + "learning_rate": 5.14107076743033e-05, + "loss": 1.0023, + "step": 5666 + }, + { + "epoch": 0.5063551276610003, + "grad_norm": 0.5684899687767029, + "learning_rate": 5.139624265922737e-05, + "loss": 0.9455, + "step": 5667 + }, + { + "epoch": 0.5064444791922622, + "grad_norm": 0.5042484402656555, + "learning_rate": 5.138177752720122e-05, + "loss": 0.9237, + "step": 5668 + }, + { + "epoch": 0.506533830723524, + "grad_norm": 0.4414721727371216, + "learning_rate": 5.1367312279436486e-05, + "loss": 0.9426, + "step": 5669 + }, + { + "epoch": 0.5066231822547859, + "grad_norm": 0.5522531867027283, + "learning_rate": 5.135284691714477e-05, + "loss": 0.9738, + "step": 5670 + }, + { + "epoch": 0.5067125337860477, + "grad_norm": 0.45710471272468567, + "learning_rate": 5.13383814415377e-05, + "loss": 0.9844, + "step": 5671 + }, + { + "epoch": 0.5068018853173096, + "grad_norm": 0.47510063648223877, + "learning_rate": 5.132391585382692e-05, + "loss": 1.033, + "step": 5672 + }, + { + "epoch": 0.5068912368485715, + "grad_norm": 0.4167405366897583, + "learning_rate": 5.130945015522407e-05, + "loss": 0.9195, + "step": 5673 + }, + { + "epoch": 0.5069805883798334, + "grad_norm": 0.4073796570301056, + "learning_rate": 5.129498434694081e-05, + "loss": 0.9487, + "step": 5674 + }, + { + "epoch": 0.5070699399110953, + "grad_norm": 0.37733787298202515, + "learning_rate": 5.128051843018882e-05, + "loss": 0.9742, + "step": 5675 + }, + { + "epoch": 0.507159291442357, + "grad_norm": 0.3933600187301636, + "learning_rate": 5.126605240617975e-05, + "loss": 0.9361, + "step": 5676 + }, + { + "epoch": 0.5072486429736189, + "grad_norm": 0.41629257798194885, + "learning_rate": 5.1251586276125305e-05, + "loss": 0.9511, + "step": 5677 + }, + { + "epoch": 0.5073379945048808, + "grad_norm": 0.47717419266700745, + "learning_rate": 5.123712004123716e-05, + "loss": 0.9537, + "step": 5678 + }, + { + "epoch": 0.5074273460361427, + "grad_norm": 0.5077764391899109, + "learning_rate": 5.122265370272703e-05, + "loss": 0.9123, + "step": 5679 + }, + { + "epoch": 0.5075166975674046, + "grad_norm": 0.4773484170436859, + "learning_rate": 5.1208187261806615e-05, + "loss": 0.9243, + "step": 5680 + }, + { + "epoch": 0.5076060490986665, + "grad_norm": 0.46315327286720276, + "learning_rate": 5.119372071968764e-05, + "loss": 0.962, + "step": 5681 + }, + { + "epoch": 0.5076954006299284, + "grad_norm": 0.4640251100063324, + "learning_rate": 5.117925407758184e-05, + "loss": 0.9902, + "step": 5682 + }, + { + "epoch": 0.5077847521611901, + "grad_norm": 0.5840420722961426, + "learning_rate": 5.116478733670092e-05, + "loss": 0.9444, + "step": 5683 + }, + { + "epoch": 0.507874103692452, + "grad_norm": 0.46668434143066406, + "learning_rate": 5.115032049825667e-05, + "loss": 0.9198, + "step": 5684 + }, + { + "epoch": 0.5079634552237139, + "grad_norm": 0.4504847526550293, + "learning_rate": 5.113585356346081e-05, + "loss": 1.0119, + "step": 5685 + }, + { + "epoch": 0.5080528067549758, + "grad_norm": 0.531743586063385, + "learning_rate": 5.1121386533525105e-05, + "loss": 0.8559, + "step": 5686 + }, + { + "epoch": 0.5081421582862377, + "grad_norm": 0.47331151366233826, + "learning_rate": 5.110691940966133e-05, + "loss": 0.9896, + "step": 5687 + }, + { + "epoch": 0.5082315098174995, + "grad_norm": 0.42783498764038086, + "learning_rate": 5.109245219308124e-05, + "loss": 0.9287, + "step": 5688 + }, + { + "epoch": 0.5083208613487613, + "grad_norm": 0.47757160663604736, + "learning_rate": 5.107798488499664e-05, + "loss": 1.0162, + "step": 5689 + }, + { + "epoch": 0.5084102128800232, + "grad_norm": 0.44240617752075195, + "learning_rate": 5.106351748661932e-05, + "loss": 0.9406, + "step": 5690 + }, + { + "epoch": 0.5084995644112851, + "grad_norm": 0.4076976776123047, + "learning_rate": 5.104904999916108e-05, + "loss": 1.0159, + "step": 5691 + }, + { + "epoch": 0.508588915942547, + "grad_norm": 0.45423173904418945, + "learning_rate": 5.103458242383371e-05, + "loss": 0.948, + "step": 5692 + }, + { + "epoch": 0.5086782674738088, + "grad_norm": 0.42676976323127747, + "learning_rate": 5.102011476184903e-05, + "loss": 0.9556, + "step": 5693 + }, + { + "epoch": 0.5087676190050707, + "grad_norm": 0.4629276990890503, + "learning_rate": 5.1005647014418866e-05, + "loss": 0.9159, + "step": 5694 + }, + { + "epoch": 0.5088569705363326, + "grad_norm": 0.47380349040031433, + "learning_rate": 5.0991179182755046e-05, + "loss": 0.9767, + "step": 5695 + }, + { + "epoch": 0.5089463220675944, + "grad_norm": 0.563545823097229, + "learning_rate": 5.097671126806942e-05, + "loss": 0.9585, + "step": 5696 + }, + { + "epoch": 0.5090356735988563, + "grad_norm": 0.48258113861083984, + "learning_rate": 5.0962243271573795e-05, + "loss": 0.9296, + "step": 5697 + }, + { + "epoch": 0.5091250251301181, + "grad_norm": 0.42884641885757446, + "learning_rate": 5.094777519448005e-05, + "loss": 0.9912, + "step": 5698 + }, + { + "epoch": 0.50921437666138, + "grad_norm": 0.47126665711402893, + "learning_rate": 5.093330703800002e-05, + "loss": 1.0121, + "step": 5699 + }, + { + "epoch": 0.5093037281926419, + "grad_norm": 0.4382041096687317, + "learning_rate": 5.091883880334558e-05, + "loss": 0.9886, + "step": 5700 + }, + { + "epoch": 0.5093930797239038, + "grad_norm": 0.4659852087497711, + "learning_rate": 5.090437049172862e-05, + "loss": 0.9174, + "step": 5701 + }, + { + "epoch": 0.5094824312551657, + "grad_norm": 0.5495339632034302, + "learning_rate": 5.0889902104360965e-05, + "loss": 0.9337, + "step": 5702 + }, + { + "epoch": 0.5095717827864275, + "grad_norm": 0.42241787910461426, + "learning_rate": 5.087543364245454e-05, + "loss": 0.9787, + "step": 5703 + }, + { + "epoch": 0.5096611343176893, + "grad_norm": 0.47585710883140564, + "learning_rate": 5.08609651072212e-05, + "loss": 0.9195, + "step": 5704 + }, + { + "epoch": 0.5097504858489512, + "grad_norm": 0.49689507484436035, + "learning_rate": 5.084649649987285e-05, + "loss": 0.9603, + "step": 5705 + }, + { + "epoch": 0.5098398373802131, + "grad_norm": 0.4226880669593811, + "learning_rate": 5.083202782162142e-05, + "loss": 1.0116, + "step": 5706 + }, + { + "epoch": 0.509929188911475, + "grad_norm": 0.4046030640602112, + "learning_rate": 5.0817559073678775e-05, + "loss": 0.9555, + "step": 5707 + }, + { + "epoch": 0.5100185404427369, + "grad_norm": 0.4440669119358063, + "learning_rate": 5.080309025725685e-05, + "loss": 0.9408, + "step": 5708 + }, + { + "epoch": 0.5101078919739988, + "grad_norm": 0.5712856650352478, + "learning_rate": 5.0788621373567545e-05, + "loss": 0.922, + "step": 5709 + }, + { + "epoch": 0.5101972435052605, + "grad_norm": 0.39347535371780396, + "learning_rate": 5.0774152423822796e-05, + "loss": 1.0552, + "step": 5710 + }, + { + "epoch": 0.5102865950365224, + "grad_norm": 0.45367705821990967, + "learning_rate": 5.075968340923454e-05, + "loss": 0.9869, + "step": 5711 + }, + { + "epoch": 0.5103759465677843, + "grad_norm": 0.48188331723213196, + "learning_rate": 5.074521433101469e-05, + "loss": 0.9637, + "step": 5712 + }, + { + "epoch": 0.5104652980990462, + "grad_norm": 0.4748827815055847, + "learning_rate": 5.07307451903752e-05, + "loss": 0.9533, + "step": 5713 + }, + { + "epoch": 0.510554649630308, + "grad_norm": 0.4055231809616089, + "learning_rate": 5.0716275988528005e-05, + "loss": 0.9954, + "step": 5714 + }, + { + "epoch": 0.5106440011615699, + "grad_norm": 0.4364885091781616, + "learning_rate": 5.0701806726685076e-05, + "loss": 0.9841, + "step": 5715 + }, + { + "epoch": 0.5107333526928318, + "grad_norm": 0.3934551775455475, + "learning_rate": 5.0687337406058345e-05, + "loss": 0.9707, + "step": 5716 + }, + { + "epoch": 0.5108227042240936, + "grad_norm": 0.505912721157074, + "learning_rate": 5.0672868027859774e-05, + "loss": 0.9605, + "step": 5717 + }, + { + "epoch": 0.5109120557553555, + "grad_norm": 0.4747815728187561, + "learning_rate": 5.065839859330134e-05, + "loss": 0.9746, + "step": 5718 + }, + { + "epoch": 0.5110014072866174, + "grad_norm": 0.5368762016296387, + "learning_rate": 5.0643929103595e-05, + "loss": 0.9514, + "step": 5719 + }, + { + "epoch": 0.5110907588178792, + "grad_norm": 0.45678213238716125, + "learning_rate": 5.0629459559952754e-05, + "loss": 0.9288, + "step": 5720 + }, + { + "epoch": 0.5111801103491411, + "grad_norm": 0.407548725605011, + "learning_rate": 5.0614989963586536e-05, + "loss": 0.9745, + "step": 5721 + }, + { + "epoch": 0.511269461880403, + "grad_norm": 0.4418816864490509, + "learning_rate": 5.060052031570837e-05, + "loss": 0.9858, + "step": 5722 + }, + { + "epoch": 0.5113588134116649, + "grad_norm": 0.4085575342178345, + "learning_rate": 5.0586050617530214e-05, + "loss": 1.0397, + "step": 5723 + }, + { + "epoch": 0.5114481649429267, + "grad_norm": 0.4091111719608307, + "learning_rate": 5.057158087026408e-05, + "loss": 1.0209, + "step": 5724 + }, + { + "epoch": 0.5115375164741885, + "grad_norm": 0.42167025804519653, + "learning_rate": 5.0557111075121944e-05, + "loss": 0.9675, + "step": 5725 + }, + { + "epoch": 0.5116268680054504, + "grad_norm": 0.5113321542739868, + "learning_rate": 5.0542641233315834e-05, + "loss": 0.8551, + "step": 5726 + }, + { + "epoch": 0.5117162195367123, + "grad_norm": 0.4538382291793823, + "learning_rate": 5.0528171346057725e-05, + "loss": 0.9376, + "step": 5727 + }, + { + "epoch": 0.5118055710679742, + "grad_norm": 0.4386774003505707, + "learning_rate": 5.0513701414559635e-05, + "loss": 0.9178, + "step": 5728 + }, + { + "epoch": 0.5118949225992361, + "grad_norm": 0.45510563254356384, + "learning_rate": 5.049923144003358e-05, + "loss": 0.9116, + "step": 5729 + }, + { + "epoch": 0.511984274130498, + "grad_norm": 0.39837151765823364, + "learning_rate": 5.048476142369156e-05, + "loss": 0.9675, + "step": 5730 + }, + { + "epoch": 0.5120736256617597, + "grad_norm": 0.5683768391609192, + "learning_rate": 5.047029136674563e-05, + "loss": 0.9143, + "step": 5731 + }, + { + "epoch": 0.5121629771930216, + "grad_norm": 0.44197937846183777, + "learning_rate": 5.0455821270407763e-05, + "loss": 0.9469, + "step": 5732 + }, + { + "epoch": 0.5122523287242835, + "grad_norm": 0.4030378758907318, + "learning_rate": 5.0441351135890004e-05, + "loss": 0.9871, + "step": 5733 + }, + { + "epoch": 0.5123416802555454, + "grad_norm": 0.40224945545196533, + "learning_rate": 5.042688096440439e-05, + "loss": 0.9674, + "step": 5734 + }, + { + "epoch": 0.5124310317868073, + "grad_norm": 0.43630731105804443, + "learning_rate": 5.041241075716294e-05, + "loss": 0.9477, + "step": 5735 + }, + { + "epoch": 0.5125203833180692, + "grad_norm": 0.4436815679073334, + "learning_rate": 5.039794051537771e-05, + "loss": 0.9675, + "step": 5736 + }, + { + "epoch": 0.512609734849331, + "grad_norm": 0.45859208703041077, + "learning_rate": 5.0383470240260713e-05, + "loss": 0.9066, + "step": 5737 + }, + { + "epoch": 0.5126990863805928, + "grad_norm": 0.4863572120666504, + "learning_rate": 5.0368999933024e-05, + "loss": 0.9758, + "step": 5738 + }, + { + "epoch": 0.5127884379118547, + "grad_norm": 0.4681277871131897, + "learning_rate": 5.035452959487959e-05, + "loss": 0.9519, + "step": 5739 + }, + { + "epoch": 0.5128777894431166, + "grad_norm": 0.5145890712738037, + "learning_rate": 5.034005922703956e-05, + "loss": 0.8922, + "step": 5740 + }, + { + "epoch": 0.5129671409743785, + "grad_norm": 0.46464183926582336, + "learning_rate": 5.032558883071594e-05, + "loss": 0.9765, + "step": 5741 + }, + { + "epoch": 0.5130564925056403, + "grad_norm": 0.4565260410308838, + "learning_rate": 5.031111840712079e-05, + "loss": 0.9139, + "step": 5742 + }, + { + "epoch": 0.5131458440369022, + "grad_norm": 0.4922895133495331, + "learning_rate": 5.029664795746616e-05, + "loss": 0.8892, + "step": 5743 + }, + { + "epoch": 0.5132351955681641, + "grad_norm": 0.4101276993751526, + "learning_rate": 5.028217748296409e-05, + "loss": 1.0292, + "step": 5744 + }, + { + "epoch": 0.5133245470994259, + "grad_norm": 0.44919613003730774, + "learning_rate": 5.0267706984826644e-05, + "loss": 0.9039, + "step": 5745 + }, + { + "epoch": 0.5134138986306878, + "grad_norm": 0.403489887714386, + "learning_rate": 5.025323646426591e-05, + "loss": 0.9693, + "step": 5746 + }, + { + "epoch": 0.5135032501619496, + "grad_norm": 0.4634898602962494, + "learning_rate": 5.02387659224939e-05, + "loss": 0.9674, + "step": 5747 + }, + { + "epoch": 0.5135926016932115, + "grad_norm": 0.5770649909973145, + "learning_rate": 5.022429536072271e-05, + "loss": 0.9314, + "step": 5748 + }, + { + "epoch": 0.5136819532244734, + "grad_norm": 0.36833301186561584, + "learning_rate": 5.0209824780164386e-05, + "loss": 1.0579, + "step": 5749 + }, + { + "epoch": 0.5137713047557353, + "grad_norm": 0.44532787799835205, + "learning_rate": 5.0195354182031e-05, + "loss": 1.0662, + "step": 5750 + }, + { + "epoch": 0.5138606562869971, + "grad_norm": 0.40112441778182983, + "learning_rate": 5.018088356753463e-05, + "loss": 0.9897, + "step": 5751 + }, + { + "epoch": 0.513950007818259, + "grad_norm": 0.3955027759075165, + "learning_rate": 5.016641293788732e-05, + "loss": 1.0451, + "step": 5752 + }, + { + "epoch": 0.5140393593495208, + "grad_norm": 0.4612029194831848, + "learning_rate": 5.0151942294301155e-05, + "loss": 0.951, + "step": 5753 + }, + { + "epoch": 0.5141287108807827, + "grad_norm": 0.5001698136329651, + "learning_rate": 5.013747163798821e-05, + "loss": 0.9077, + "step": 5754 + }, + { + "epoch": 0.5142180624120446, + "grad_norm": 0.46220487356185913, + "learning_rate": 5.012300097016055e-05, + "loss": 0.9542, + "step": 5755 + }, + { + "epoch": 0.5143074139433065, + "grad_norm": 0.5143389701843262, + "learning_rate": 5.010853029203024e-05, + "loss": 0.9437, + "step": 5756 + }, + { + "epoch": 0.5143967654745684, + "grad_norm": 0.45577070116996765, + "learning_rate": 5.0094059604809375e-05, + "loss": 0.977, + "step": 5757 + }, + { + "epoch": 0.5144861170058301, + "grad_norm": 0.4429572522640228, + "learning_rate": 5.0079588909710005e-05, + "loss": 0.9611, + "step": 5758 + }, + { + "epoch": 0.514575468537092, + "grad_norm": 0.3987106382846832, + "learning_rate": 5.0065118207944216e-05, + "loss": 0.9967, + "step": 5759 + }, + { + "epoch": 0.5146648200683539, + "grad_norm": 0.37299370765686035, + "learning_rate": 5.005064750072408e-05, + "loss": 0.9641, + "step": 5760 + }, + { + "epoch": 0.5147541715996158, + "grad_norm": 0.5281698107719421, + "learning_rate": 5.003617678926169e-05, + "loss": 0.967, + "step": 5761 + }, + { + "epoch": 0.5148435231308777, + "grad_norm": 0.5716055631637573, + "learning_rate": 5.0021706074769095e-05, + "loss": 0.8772, + "step": 5762 + }, + { + "epoch": 0.5149328746621396, + "grad_norm": 0.5003583431243896, + "learning_rate": 5.000723535845838e-05, + "loss": 1.0233, + "step": 5763 + }, + { + "epoch": 0.5150222261934014, + "grad_norm": 0.40355581045150757, + "learning_rate": 4.999276464154164e-05, + "loss": 0.9501, + "step": 5764 + }, + { + "epoch": 0.5151115777246632, + "grad_norm": 0.4532722234725952, + "learning_rate": 4.997829392523093e-05, + "loss": 0.9233, + "step": 5765 + }, + { + "epoch": 0.5152009292559251, + "grad_norm": 0.46640169620513916, + "learning_rate": 4.9963823210738335e-05, + "loss": 0.9473, + "step": 5766 + }, + { + "epoch": 0.515290280787187, + "grad_norm": 0.4269860088825226, + "learning_rate": 4.994935249927593e-05, + "loss": 0.9754, + "step": 5767 + }, + { + "epoch": 0.5153796323184489, + "grad_norm": 0.45868611335754395, + "learning_rate": 4.9934881792055796e-05, + "loss": 1.0073, + "step": 5768 + }, + { + "epoch": 0.5154689838497107, + "grad_norm": 0.4609023332595825, + "learning_rate": 4.9920411090290006e-05, + "loss": 0.9759, + "step": 5769 + }, + { + "epoch": 0.5155583353809726, + "grad_norm": 0.4061686098575592, + "learning_rate": 4.990594039519063e-05, + "loss": 0.9157, + "step": 5770 + }, + { + "epoch": 0.5156476869122345, + "grad_norm": 0.4474005103111267, + "learning_rate": 4.9891469707969765e-05, + "loss": 0.9675, + "step": 5771 + }, + { + "epoch": 0.5157370384434963, + "grad_norm": 0.41910743713378906, + "learning_rate": 4.987699902983946e-05, + "loss": 0.8943, + "step": 5772 + }, + { + "epoch": 0.5158263899747582, + "grad_norm": 0.5283576250076294, + "learning_rate": 4.9862528362011794e-05, + "loss": 0.8617, + "step": 5773 + }, + { + "epoch": 0.51591574150602, + "grad_norm": 0.6106252670288086, + "learning_rate": 4.9848057705698856e-05, + "loss": 0.9183, + "step": 5774 + }, + { + "epoch": 0.5160050930372819, + "grad_norm": 0.40918898582458496, + "learning_rate": 4.98335870621127e-05, + "loss": 0.9387, + "step": 5775 + }, + { + "epoch": 0.5160944445685438, + "grad_norm": 0.582862138748169, + "learning_rate": 4.981911643246539e-05, + "loss": 0.9047, + "step": 5776 + }, + { + "epoch": 0.5161837960998057, + "grad_norm": 0.4943019449710846, + "learning_rate": 4.9804645817969015e-05, + "loss": 0.8898, + "step": 5777 + }, + { + "epoch": 0.5162731476310676, + "grad_norm": 0.41431719064712524, + "learning_rate": 4.9790175219835626e-05, + "loss": 0.9774, + "step": 5778 + }, + { + "epoch": 0.5163624991623293, + "grad_norm": 0.5077488422393799, + "learning_rate": 4.97757046392773e-05, + "loss": 0.985, + "step": 5779 + }, + { + "epoch": 0.5164518506935912, + "grad_norm": 0.5016276240348816, + "learning_rate": 4.976123407750611e-05, + "loss": 0.9377, + "step": 5780 + }, + { + "epoch": 0.5165412022248531, + "grad_norm": 0.3875533938407898, + "learning_rate": 4.9746763535734104e-05, + "loss": 1.008, + "step": 5781 + }, + { + "epoch": 0.516630553756115, + "grad_norm": 0.5634098649024963, + "learning_rate": 4.973229301517335e-05, + "loss": 0.9303, + "step": 5782 + }, + { + "epoch": 0.5167199052873769, + "grad_norm": 0.446728378534317, + "learning_rate": 4.971782251703591e-05, + "loss": 0.9349, + "step": 5783 + }, + { + "epoch": 0.5168092568186388, + "grad_norm": 0.3732495903968811, + "learning_rate": 4.9703352042533866e-05, + "loss": 0.9877, + "step": 5784 + }, + { + "epoch": 0.5168986083499006, + "grad_norm": 0.4483281075954437, + "learning_rate": 4.9688881592879233e-05, + "loss": 0.875, + "step": 5785 + }, + { + "epoch": 0.5169879598811624, + "grad_norm": 0.4210112392902374, + "learning_rate": 4.967441116928407e-05, + "loss": 0.9735, + "step": 5786 + }, + { + "epoch": 0.5170773114124243, + "grad_norm": 0.40223428606987, + "learning_rate": 4.9659940772960456e-05, + "loss": 0.946, + "step": 5787 + }, + { + "epoch": 0.5171666629436862, + "grad_norm": 0.5056480169296265, + "learning_rate": 4.964547040512042e-05, + "loss": 0.8455, + "step": 5788 + }, + { + "epoch": 0.5172560144749481, + "grad_norm": 0.4198668599128723, + "learning_rate": 4.963100006697602e-05, + "loss": 1.0444, + "step": 5789 + }, + { + "epoch": 0.51734536600621, + "grad_norm": 0.4606395363807678, + "learning_rate": 4.9616529759739305e-05, + "loss": 0.9313, + "step": 5790 + }, + { + "epoch": 0.5174347175374718, + "grad_norm": 0.47824013233184814, + "learning_rate": 4.96020594846223e-05, + "loss": 0.9325, + "step": 5791 + }, + { + "epoch": 0.5175240690687337, + "grad_norm": 0.46849849820137024, + "learning_rate": 4.958758924283706e-05, + "loss": 0.8359, + "step": 5792 + }, + { + "epoch": 0.5176134205999955, + "grad_norm": 0.39913827180862427, + "learning_rate": 4.957311903559561e-05, + "loss": 0.9767, + "step": 5793 + }, + { + "epoch": 0.5177027721312574, + "grad_norm": 0.5645284652709961, + "learning_rate": 4.9558648864110014e-05, + "loss": 0.8838, + "step": 5794 + }, + { + "epoch": 0.5177921236625193, + "grad_norm": 0.4400821924209595, + "learning_rate": 4.954417872959226e-05, + "loss": 0.9193, + "step": 5795 + }, + { + "epoch": 0.5178814751937811, + "grad_norm": 0.4306231439113617, + "learning_rate": 4.95297086332544e-05, + "loss": 0.9235, + "step": 5796 + }, + { + "epoch": 0.517970826725043, + "grad_norm": 0.5578770637512207, + "learning_rate": 4.951523857630845e-05, + "loss": 0.9793, + "step": 5797 + }, + { + "epoch": 0.5180601782563049, + "grad_norm": 0.4247242510318756, + "learning_rate": 4.950076855996643e-05, + "loss": 1.0176, + "step": 5798 + }, + { + "epoch": 0.5181495297875668, + "grad_norm": 0.5067040920257568, + "learning_rate": 4.9486298585440376e-05, + "loss": 0.9595, + "step": 5799 + }, + { + "epoch": 0.5182388813188286, + "grad_norm": 0.5062249898910522, + "learning_rate": 4.9471828653942286e-05, + "loss": 0.8926, + "step": 5800 + }, + { + "epoch": 0.5183282328500904, + "grad_norm": 0.43885526061058044, + "learning_rate": 4.945735876668418e-05, + "loss": 0.9922, + "step": 5801 + }, + { + "epoch": 0.5184175843813523, + "grad_norm": 0.36411014199256897, + "learning_rate": 4.9442888924878054e-05, + "loss": 0.9627, + "step": 5802 + }, + { + "epoch": 0.5185069359126142, + "grad_norm": 0.45766767859458923, + "learning_rate": 4.9428419129735925e-05, + "loss": 0.9285, + "step": 5803 + }, + { + "epoch": 0.5185962874438761, + "grad_norm": 0.4283965826034546, + "learning_rate": 4.9413949382469805e-05, + "loss": 1.0057, + "step": 5804 + }, + { + "epoch": 0.518685638975138, + "grad_norm": 0.4837194085121155, + "learning_rate": 4.939947968429165e-05, + "loss": 0.9082, + "step": 5805 + }, + { + "epoch": 0.5187749905063999, + "grad_norm": 0.459768146276474, + "learning_rate": 4.9385010036413475e-05, + "loss": 0.8664, + "step": 5806 + }, + { + "epoch": 0.5188643420376616, + "grad_norm": 0.39023587107658386, + "learning_rate": 4.9370540440047264e-05, + "loss": 1.0252, + "step": 5807 + }, + { + "epoch": 0.5189536935689235, + "grad_norm": 0.44945186376571655, + "learning_rate": 4.935607089640501e-05, + "loss": 0.919, + "step": 5808 + }, + { + "epoch": 0.5190430451001854, + "grad_norm": 0.4950411915779114, + "learning_rate": 4.934160140669867e-05, + "loss": 0.9833, + "step": 5809 + }, + { + "epoch": 0.5191323966314473, + "grad_norm": 0.42366206645965576, + "learning_rate": 4.932713197214023e-05, + "loss": 0.9545, + "step": 5810 + }, + { + "epoch": 0.5192217481627092, + "grad_norm": 0.5626574158668518, + "learning_rate": 4.931266259394167e-05, + "loss": 0.8383, + "step": 5811 + }, + { + "epoch": 0.519311099693971, + "grad_norm": 0.49013814330101013, + "learning_rate": 4.929819327331493e-05, + "loss": 0.9492, + "step": 5812 + }, + { + "epoch": 0.5194004512252328, + "grad_norm": 0.48132896423339844, + "learning_rate": 4.928372401147199e-05, + "loss": 0.8303, + "step": 5813 + }, + { + "epoch": 0.5194898027564947, + "grad_norm": 0.5105422735214233, + "learning_rate": 4.92692548096248e-05, + "loss": 0.9641, + "step": 5814 + }, + { + "epoch": 0.5195791542877566, + "grad_norm": 0.4313611388206482, + "learning_rate": 4.9254785668985335e-05, + "loss": 0.9816, + "step": 5815 + }, + { + "epoch": 0.5196685058190185, + "grad_norm": 0.3787804841995239, + "learning_rate": 4.924031659076548e-05, + "loss": 0.9836, + "step": 5816 + }, + { + "epoch": 0.5197578573502804, + "grad_norm": 0.519477903842926, + "learning_rate": 4.9225847576177216e-05, + "loss": 0.9174, + "step": 5817 + }, + { + "epoch": 0.5198472088815422, + "grad_norm": 0.40833377838134766, + "learning_rate": 4.921137862643247e-05, + "loss": 0.9769, + "step": 5818 + }, + { + "epoch": 0.5199365604128041, + "grad_norm": 0.5591320991516113, + "learning_rate": 4.919690974274316e-05, + "loss": 1.0388, + "step": 5819 + }, + { + "epoch": 0.5200259119440659, + "grad_norm": 0.44129371643066406, + "learning_rate": 4.918244092632123e-05, + "loss": 0.9731, + "step": 5820 + }, + { + "epoch": 0.5201152634753278, + "grad_norm": 0.3888798654079437, + "learning_rate": 4.916797217837859e-05, + "loss": 0.9446, + "step": 5821 + }, + { + "epoch": 0.5202046150065897, + "grad_norm": 0.4500011205673218, + "learning_rate": 4.915350350012714e-05, + "loss": 1.0109, + "step": 5822 + }, + { + "epoch": 0.5202939665378515, + "grad_norm": 0.5037985444068909, + "learning_rate": 4.913903489277881e-05, + "loss": 0.9591, + "step": 5823 + }, + { + "epoch": 0.5203833180691134, + "grad_norm": 0.49185624718666077, + "learning_rate": 4.9124566357545476e-05, + "loss": 1.0417, + "step": 5824 + }, + { + "epoch": 0.5204726696003753, + "grad_norm": 0.38457438349723816, + "learning_rate": 4.9110097895639046e-05, + "loss": 1.0718, + "step": 5825 + }, + { + "epoch": 0.5205620211316372, + "grad_norm": 0.4443393051624298, + "learning_rate": 4.9095629508271394e-05, + "loss": 1.0176, + "step": 5826 + }, + { + "epoch": 0.520651372662899, + "grad_norm": 0.39499226212501526, + "learning_rate": 4.908116119665443e-05, + "loss": 0.9926, + "step": 5827 + }, + { + "epoch": 0.5207407241941608, + "grad_norm": 0.563543975353241, + "learning_rate": 4.906669296199999e-05, + "loss": 0.87, + "step": 5828 + }, + { + "epoch": 0.5208300757254227, + "grad_norm": 0.5549851655960083, + "learning_rate": 4.905222480551996e-05, + "loss": 0.9655, + "step": 5829 + }, + { + "epoch": 0.5209194272566846, + "grad_norm": 0.5066598653793335, + "learning_rate": 4.903775672842621e-05, + "loss": 0.968, + "step": 5830 + }, + { + "epoch": 0.5210087787879465, + "grad_norm": 0.38776955008506775, + "learning_rate": 4.902328873193059e-05, + "loss": 1.0964, + "step": 5831 + }, + { + "epoch": 0.5210981303192084, + "grad_norm": 0.45412904024124146, + "learning_rate": 4.900882081724495e-05, + "loss": 0.9324, + "step": 5832 + }, + { + "epoch": 0.5211874818504703, + "grad_norm": 0.5597448945045471, + "learning_rate": 4.899435298558113e-05, + "loss": 0.9376, + "step": 5833 + }, + { + "epoch": 0.521276833381732, + "grad_norm": 0.413144588470459, + "learning_rate": 4.897988523815097e-05, + "loss": 0.9871, + "step": 5834 + }, + { + "epoch": 0.5213661849129939, + "grad_norm": 0.44255807995796204, + "learning_rate": 4.8965417576166316e-05, + "loss": 0.9833, + "step": 5835 + }, + { + "epoch": 0.5214555364442558, + "grad_norm": 0.43993690609931946, + "learning_rate": 4.8950950000838945e-05, + "loss": 0.972, + "step": 5836 + }, + { + "epoch": 0.5215448879755177, + "grad_norm": 0.4028208553791046, + "learning_rate": 4.893648251338069e-05, + "loss": 0.9361, + "step": 5837 + }, + { + "epoch": 0.5216342395067796, + "grad_norm": 0.4599561095237732, + "learning_rate": 4.8922015115003375e-05, + "loss": 0.9747, + "step": 5838 + }, + { + "epoch": 0.5217235910380414, + "grad_norm": 0.5245079398155212, + "learning_rate": 4.890754780691877e-05, + "loss": 0.8898, + "step": 5839 + }, + { + "epoch": 0.5218129425693033, + "grad_norm": 0.5127478837966919, + "learning_rate": 4.8893080590338684e-05, + "loss": 0.9127, + "step": 5840 + }, + { + "epoch": 0.5219022941005651, + "grad_norm": 0.37643924355506897, + "learning_rate": 4.887861346647491e-05, + "loss": 0.9822, + "step": 5841 + }, + { + "epoch": 0.521991645631827, + "grad_norm": 0.4069046974182129, + "learning_rate": 4.8864146436539196e-05, + "loss": 0.9837, + "step": 5842 + }, + { + "epoch": 0.5220809971630889, + "grad_norm": 0.4322601854801178, + "learning_rate": 4.884967950174333e-05, + "loss": 0.9865, + "step": 5843 + }, + { + "epoch": 0.5221703486943507, + "grad_norm": 0.4148218333721161, + "learning_rate": 4.883521266329907e-05, + "loss": 0.9269, + "step": 5844 + }, + { + "epoch": 0.5222597002256126, + "grad_norm": 0.3982529640197754, + "learning_rate": 4.882074592241817e-05, + "loss": 0.9791, + "step": 5845 + }, + { + "epoch": 0.5223490517568745, + "grad_norm": 0.48719900846481323, + "learning_rate": 4.880627928031237e-05, + "loss": 0.9728, + "step": 5846 + }, + { + "epoch": 0.5224384032881364, + "grad_norm": 0.5783781409263611, + "learning_rate": 4.87918127381934e-05, + "loss": 0.9212, + "step": 5847 + }, + { + "epoch": 0.5225277548193982, + "grad_norm": 0.4238271713256836, + "learning_rate": 4.8777346297272986e-05, + "loss": 1.0067, + "step": 5848 + }, + { + "epoch": 0.52261710635066, + "grad_norm": 0.433936208486557, + "learning_rate": 4.876287995876285e-05, + "loss": 0.9611, + "step": 5849 + }, + { + "epoch": 0.5227064578819219, + "grad_norm": 0.4108850657939911, + "learning_rate": 4.874841372387471e-05, + "loss": 0.9153, + "step": 5850 + }, + { + "epoch": 0.5227958094131838, + "grad_norm": 0.4847879111766815, + "learning_rate": 4.8733947593820254e-05, + "loss": 0.9725, + "step": 5851 + }, + { + "epoch": 0.5228851609444457, + "grad_norm": 0.508726954460144, + "learning_rate": 4.871948156981119e-05, + "loss": 0.8959, + "step": 5852 + }, + { + "epoch": 0.5229745124757076, + "grad_norm": 0.4954078495502472, + "learning_rate": 4.870501565305919e-05, + "loss": 0.9549, + "step": 5853 + }, + { + "epoch": 0.5230638640069695, + "grad_norm": 0.4881262183189392, + "learning_rate": 4.8690549844775935e-05, + "loss": 1.066, + "step": 5854 + }, + { + "epoch": 0.5231532155382312, + "grad_norm": 0.6973787546157837, + "learning_rate": 4.867608414617311e-05, + "loss": 0.9015, + "step": 5855 + }, + { + "epoch": 0.5232425670694931, + "grad_norm": 0.4565022587776184, + "learning_rate": 4.8661618558462325e-05, + "loss": 0.9478, + "step": 5856 + }, + { + "epoch": 0.523331918600755, + "grad_norm": 0.48407623171806335, + "learning_rate": 4.8647153082855254e-05, + "loss": 0.9787, + "step": 5857 + }, + { + "epoch": 0.5234212701320169, + "grad_norm": 0.4646012783050537, + "learning_rate": 4.863268772056353e-05, + "loss": 0.9203, + "step": 5858 + }, + { + "epoch": 0.5235106216632788, + "grad_norm": 0.4288366436958313, + "learning_rate": 4.8618222472798783e-05, + "loss": 0.9631, + "step": 5859 + }, + { + "epoch": 0.5235999731945407, + "grad_norm": 0.5206550359725952, + "learning_rate": 4.860375734077265e-05, + "loss": 0.9404, + "step": 5860 + }, + { + "epoch": 0.5236893247258025, + "grad_norm": 0.36840370297431946, + "learning_rate": 4.858929232569671e-05, + "loss": 1.0194, + "step": 5861 + }, + { + "epoch": 0.5237786762570643, + "grad_norm": 0.4584810435771942, + "learning_rate": 4.857482742878257e-05, + "loss": 0.9383, + "step": 5862 + }, + { + "epoch": 0.5238680277883262, + "grad_norm": 0.3916776478290558, + "learning_rate": 4.856036265124182e-05, + "loss": 1.0608, + "step": 5863 + }, + { + "epoch": 0.5239573793195881, + "grad_norm": 0.4396377503871918, + "learning_rate": 4.854589799428606e-05, + "loss": 0.9877, + "step": 5864 + }, + { + "epoch": 0.52404673085085, + "grad_norm": 0.5073574781417847, + "learning_rate": 4.853143345912685e-05, + "loss": 0.8528, + "step": 5865 + }, + { + "epoch": 0.5241360823821118, + "grad_norm": 0.5877308249473572, + "learning_rate": 4.851696904697573e-05, + "loss": 0.9014, + "step": 5866 + }, + { + "epoch": 0.5242254339133737, + "grad_norm": 0.470747172832489, + "learning_rate": 4.8502504759044245e-05, + "loss": 0.922, + "step": 5867 + }, + { + "epoch": 0.5243147854446356, + "grad_norm": 0.4053327143192291, + "learning_rate": 4.848804059654396e-05, + "loss": 0.93, + "step": 5868 + }, + { + "epoch": 0.5244041369758974, + "grad_norm": 0.44682514667510986, + "learning_rate": 4.847357656068639e-05, + "loss": 1.0523, + "step": 5869 + }, + { + "epoch": 0.5244934885071593, + "grad_norm": 0.3955923914909363, + "learning_rate": 4.845911265268305e-05, + "loss": 1.0694, + "step": 5870 + }, + { + "epoch": 0.5245828400384211, + "grad_norm": 0.46006014943122864, + "learning_rate": 4.844464887374545e-05, + "loss": 0.9943, + "step": 5871 + }, + { + "epoch": 0.524672191569683, + "grad_norm": 0.4770232141017914, + "learning_rate": 4.8430185225085096e-05, + "loss": 0.9395, + "step": 5872 + }, + { + "epoch": 0.5247615431009449, + "grad_norm": 0.5369167327880859, + "learning_rate": 4.841572170791345e-05, + "loss": 0.9318, + "step": 5873 + }, + { + "epoch": 0.5248508946322068, + "grad_norm": 0.4109400808811188, + "learning_rate": 4.8401258323442016e-05, + "loss": 1.0272, + "step": 5874 + }, + { + "epoch": 0.5249402461634686, + "grad_norm": 0.42871788144111633, + "learning_rate": 4.8386795072882234e-05, + "loss": 0.9692, + "step": 5875 + }, + { + "epoch": 0.5250295976947305, + "grad_norm": 0.4132499098777771, + "learning_rate": 4.837233195744556e-05, + "loss": 0.9554, + "step": 5876 + }, + { + "epoch": 0.5251189492259923, + "grad_norm": 0.40216875076293945, + "learning_rate": 4.835786897834342e-05, + "loss": 0.9684, + "step": 5877 + }, + { + "epoch": 0.5252083007572542, + "grad_norm": 0.3839309513568878, + "learning_rate": 4.8343406136787274e-05, + "loss": 0.9613, + "step": 5878 + }, + { + "epoch": 0.5252976522885161, + "grad_norm": 0.4431716799736023, + "learning_rate": 4.832894343398851e-05, + "loss": 0.9824, + "step": 5879 + }, + { + "epoch": 0.525387003819778, + "grad_norm": 0.5026467442512512, + "learning_rate": 4.831448087115855e-05, + "loss": 1.0332, + "step": 5880 + }, + { + "epoch": 0.5254763553510399, + "grad_norm": 0.431949257850647, + "learning_rate": 4.8300018449508775e-05, + "loss": 0.9471, + "step": 5881 + }, + { + "epoch": 0.5255657068823016, + "grad_norm": 0.5039017796516418, + "learning_rate": 4.8285556170250584e-05, + "loss": 0.9319, + "step": 5882 + }, + { + "epoch": 0.5256550584135635, + "grad_norm": 0.44245362281799316, + "learning_rate": 4.8271094034595336e-05, + "loss": 0.9568, + "step": 5883 + }, + { + "epoch": 0.5257444099448254, + "grad_norm": 0.4410870671272278, + "learning_rate": 4.825663204375439e-05, + "loss": 0.9713, + "step": 5884 + }, + { + "epoch": 0.5258337614760873, + "grad_norm": 0.5179281234741211, + "learning_rate": 4.824217019893912e-05, + "loss": 0.8289, + "step": 5885 + }, + { + "epoch": 0.5259231130073492, + "grad_norm": 0.4670274555683136, + "learning_rate": 4.82277085013608e-05, + "loss": 0.9096, + "step": 5886 + }, + { + "epoch": 0.5260124645386111, + "grad_norm": 0.5021420121192932, + "learning_rate": 4.8213246952230794e-05, + "loss": 0.9992, + "step": 5887 + }, + { + "epoch": 0.5261018160698729, + "grad_norm": 0.4476109445095062, + "learning_rate": 4.81987855527604e-05, + "loss": 0.9169, + "step": 5888 + }, + { + "epoch": 0.5261911676011347, + "grad_norm": 0.4486454725265503, + "learning_rate": 4.818432430416091e-05, + "loss": 1.0424, + "step": 5889 + }, + { + "epoch": 0.5262805191323966, + "grad_norm": 0.5046273469924927, + "learning_rate": 4.8169863207643615e-05, + "loss": 0.9194, + "step": 5890 + }, + { + "epoch": 0.5263698706636585, + "grad_norm": 0.4156638979911804, + "learning_rate": 4.815540226441979e-05, + "loss": 0.9668, + "step": 5891 + }, + { + "epoch": 0.5264592221949204, + "grad_norm": 0.4498765468597412, + "learning_rate": 4.814094147570069e-05, + "loss": 0.9449, + "step": 5892 + }, + { + "epoch": 0.5265485737261822, + "grad_norm": 0.4794931411743164, + "learning_rate": 4.812648084269756e-05, + "loss": 1.0374, + "step": 5893 + }, + { + "epoch": 0.5266379252574441, + "grad_norm": 0.4733235836029053, + "learning_rate": 4.811202036662162e-05, + "loss": 0.9983, + "step": 5894 + }, + { + "epoch": 0.526727276788706, + "grad_norm": 0.46899041533470154, + "learning_rate": 4.8097560048684095e-05, + "loss": 0.9764, + "step": 5895 + }, + { + "epoch": 0.5268166283199678, + "grad_norm": 0.44023454189300537, + "learning_rate": 4.808309989009621e-05, + "loss": 0.935, + "step": 5896 + }, + { + "epoch": 0.5269059798512297, + "grad_norm": 0.43503814935684204, + "learning_rate": 4.806863989206914e-05, + "loss": 0.8848, + "step": 5897 + }, + { + "epoch": 0.5269953313824915, + "grad_norm": 0.4301520884037018, + "learning_rate": 4.805418005581406e-05, + "loss": 0.999, + "step": 5898 + }, + { + "epoch": 0.5270846829137534, + "grad_norm": 0.3994184136390686, + "learning_rate": 4.803972038254214e-05, + "loss": 0.9203, + "step": 5899 + }, + { + "epoch": 0.5271740344450153, + "grad_norm": 0.4339260756969452, + "learning_rate": 4.802526087346453e-05, + "loss": 1.0067, + "step": 5900 + }, + { + "epoch": 0.5272633859762772, + "grad_norm": 0.4898630678653717, + "learning_rate": 4.801080152979237e-05, + "loss": 0.9212, + "step": 5901 + }, + { + "epoch": 0.5273527375075391, + "grad_norm": 0.49684223532676697, + "learning_rate": 4.799634235273679e-05, + "loss": 0.9273, + "step": 5902 + }, + { + "epoch": 0.5274420890388009, + "grad_norm": 0.420746386051178, + "learning_rate": 4.798188334350889e-05, + "loss": 0.9468, + "step": 5903 + }, + { + "epoch": 0.5275314405700627, + "grad_norm": 0.4721129536628723, + "learning_rate": 4.7967424503319774e-05, + "loss": 0.9378, + "step": 5904 + }, + { + "epoch": 0.5276207921013246, + "grad_norm": 0.4775635004043579, + "learning_rate": 4.795296583338052e-05, + "loss": 0.996, + "step": 5905 + }, + { + "epoch": 0.5277101436325865, + "grad_norm": 0.4336440861225128, + "learning_rate": 4.793850733490222e-05, + "loss": 1.0088, + "step": 5906 + }, + { + "epoch": 0.5277994951638484, + "grad_norm": 0.4374808669090271, + "learning_rate": 4.792404900909589e-05, + "loss": 0.9745, + "step": 5907 + }, + { + "epoch": 0.5278888466951103, + "grad_norm": 0.4299390912055969, + "learning_rate": 4.7909590857172574e-05, + "loss": 0.9602, + "step": 5908 + }, + { + "epoch": 0.5279781982263722, + "grad_norm": 0.4343792498111725, + "learning_rate": 4.7895132880343306e-05, + "loss": 0.9381, + "step": 5909 + }, + { + "epoch": 0.5280675497576339, + "grad_norm": 0.536439836025238, + "learning_rate": 4.78806750798191e-05, + "loss": 0.8658, + "step": 5910 + }, + { + "epoch": 0.5281569012888958, + "grad_norm": 0.46042898297309875, + "learning_rate": 4.786621745681096e-05, + "loss": 0.9794, + "step": 5911 + }, + { + "epoch": 0.5282462528201577, + "grad_norm": 0.4773177206516266, + "learning_rate": 4.785176001252983e-05, + "loss": 0.9424, + "step": 5912 + }, + { + "epoch": 0.5283356043514196, + "grad_norm": 0.47220951318740845, + "learning_rate": 4.7837302748186705e-05, + "loss": 0.9412, + "step": 5913 + }, + { + "epoch": 0.5284249558826815, + "grad_norm": 0.46493634581565857, + "learning_rate": 4.782284566499253e-05, + "loss": 0.9811, + "step": 5914 + }, + { + "epoch": 0.5285143074139433, + "grad_norm": 0.5822499990463257, + "learning_rate": 4.780838876415824e-05, + "loss": 0.8253, + "step": 5915 + }, + { + "epoch": 0.5286036589452052, + "grad_norm": 0.46834489703178406, + "learning_rate": 4.779393204689477e-05, + "loss": 0.9326, + "step": 5916 + }, + { + "epoch": 0.528693010476467, + "grad_norm": 0.5302138328552246, + "learning_rate": 4.777947551441298e-05, + "loss": 0.8613, + "step": 5917 + }, + { + "epoch": 0.5287823620077289, + "grad_norm": 0.4419107735157013, + "learning_rate": 4.776501916792379e-05, + "loss": 0.9798, + "step": 5918 + }, + { + "epoch": 0.5288717135389908, + "grad_norm": 0.4270457923412323, + "learning_rate": 4.775056300863806e-05, + "loss": 1.0351, + "step": 5919 + }, + { + "epoch": 0.5289610650702526, + "grad_norm": 0.4577907621860504, + "learning_rate": 4.773610703776666e-05, + "loss": 0.9121, + "step": 5920 + }, + { + "epoch": 0.5290504166015145, + "grad_norm": 0.4939514398574829, + "learning_rate": 4.772165125652043e-05, + "loss": 0.9367, + "step": 5921 + }, + { + "epoch": 0.5291397681327764, + "grad_norm": 0.45236796140670776, + "learning_rate": 4.7707195666110195e-05, + "loss": 0.9506, + "step": 5922 + }, + { + "epoch": 0.5292291196640383, + "grad_norm": 0.4021003842353821, + "learning_rate": 4.7692740267746757e-05, + "loss": 0.9241, + "step": 5923 + }, + { + "epoch": 0.5293184711953001, + "grad_norm": 0.44145897030830383, + "learning_rate": 4.767828506264091e-05, + "loss": 0.937, + "step": 5924 + }, + { + "epoch": 0.529407822726562, + "grad_norm": 0.4989675283432007, + "learning_rate": 4.766383005200345e-05, + "loss": 1.0619, + "step": 5925 + }, + { + "epoch": 0.5294971742578238, + "grad_norm": 0.45392560958862305, + "learning_rate": 4.7649375237045135e-05, + "loss": 0.9564, + "step": 5926 + }, + { + "epoch": 0.5295865257890857, + "grad_norm": 0.40658605098724365, + "learning_rate": 4.763492061897669e-05, + "loss": 0.9595, + "step": 5927 + }, + { + "epoch": 0.5296758773203476, + "grad_norm": 0.47459685802459717, + "learning_rate": 4.762046619900884e-05, + "loss": 0.9083, + "step": 5928 + }, + { + "epoch": 0.5297652288516095, + "grad_norm": 0.5040339827537537, + "learning_rate": 4.7606011978352315e-05, + "loss": 0.9873, + "step": 5929 + }, + { + "epoch": 0.5298545803828714, + "grad_norm": 0.5010223388671875, + "learning_rate": 4.759155795821782e-05, + "loss": 0.8799, + "step": 5930 + }, + { + "epoch": 0.5299439319141331, + "grad_norm": 0.4148063063621521, + "learning_rate": 4.7577104139816e-05, + "loss": 0.9431, + "step": 5931 + }, + { + "epoch": 0.530033283445395, + "grad_norm": 0.469046413898468, + "learning_rate": 4.7562650524357536e-05, + "loss": 0.9503, + "step": 5932 + }, + { + "epoch": 0.5301226349766569, + "grad_norm": 0.5078368186950684, + "learning_rate": 4.754819711305308e-05, + "loss": 0.9369, + "step": 5933 + }, + { + "epoch": 0.5302119865079188, + "grad_norm": 0.5710771083831787, + "learning_rate": 4.753374390711324e-05, + "loss": 0.8195, + "step": 5934 + }, + { + "epoch": 0.5303013380391807, + "grad_norm": 0.4148007035255432, + "learning_rate": 4.751929090774864e-05, + "loss": 0.9669, + "step": 5935 + }, + { + "epoch": 0.5303906895704426, + "grad_norm": 0.4209470748901367, + "learning_rate": 4.7504838116169895e-05, + "loss": 0.9673, + "step": 5936 + }, + { + "epoch": 0.5304800411017044, + "grad_norm": 0.45647767186164856, + "learning_rate": 4.7490385533587525e-05, + "loss": 0.9706, + "step": 5937 + }, + { + "epoch": 0.5305693926329662, + "grad_norm": 0.4158954620361328, + "learning_rate": 4.7475933161212116e-05, + "loss": 1.0093, + "step": 5938 + }, + { + "epoch": 0.5306587441642281, + "grad_norm": 0.4575401842594147, + "learning_rate": 4.746148100025421e-05, + "loss": 1.0107, + "step": 5939 + }, + { + "epoch": 0.53074809569549, + "grad_norm": 0.4388822019100189, + "learning_rate": 4.7447029051924334e-05, + "loss": 0.9565, + "step": 5940 + }, + { + "epoch": 0.5308374472267519, + "grad_norm": 0.5215067267417908, + "learning_rate": 4.7432577317432984e-05, + "loss": 0.8885, + "step": 5941 + }, + { + "epoch": 0.5309267987580137, + "grad_norm": 0.5837841033935547, + "learning_rate": 4.7418125797990655e-05, + "loss": 0.9078, + "step": 5942 + }, + { + "epoch": 0.5310161502892756, + "grad_norm": 0.4431391656398773, + "learning_rate": 4.740367449480781e-05, + "loss": 0.9465, + "step": 5943 + }, + { + "epoch": 0.5311055018205374, + "grad_norm": 0.4966404438018799, + "learning_rate": 4.7389223409094904e-05, + "loss": 0.9855, + "step": 5944 + }, + { + "epoch": 0.5311948533517993, + "grad_norm": 0.4595504403114319, + "learning_rate": 4.737477254206236e-05, + "loss": 0.9012, + "step": 5945 + }, + { + "epoch": 0.5312842048830612, + "grad_norm": 0.4891195595264435, + "learning_rate": 4.736032189492062e-05, + "loss": 0.865, + "step": 5946 + }, + { + "epoch": 0.531373556414323, + "grad_norm": 0.38100045919418335, + "learning_rate": 4.734587146888003e-05, + "loss": 0.9545, + "step": 5947 + }, + { + "epoch": 0.5314629079455849, + "grad_norm": 0.4812198877334595, + "learning_rate": 4.733142126515101e-05, + "loss": 1.0111, + "step": 5948 + }, + { + "epoch": 0.5315522594768468, + "grad_norm": 0.5166088938713074, + "learning_rate": 4.7316971284943905e-05, + "loss": 0.9222, + "step": 5949 + }, + { + "epoch": 0.5316416110081087, + "grad_norm": 0.42743030190467834, + "learning_rate": 4.730252152946905e-05, + "loss": 0.9176, + "step": 5950 + }, + { + "epoch": 0.5317309625393705, + "grad_norm": 0.4856008291244507, + "learning_rate": 4.7288071999936766e-05, + "loss": 0.9253, + "step": 5951 + }, + { + "epoch": 0.5318203140706323, + "grad_norm": 0.40241360664367676, + "learning_rate": 4.7273622697557356e-05, + "loss": 1.0319, + "step": 5952 + }, + { + "epoch": 0.5319096656018942, + "grad_norm": 0.46790629625320435, + "learning_rate": 4.725917362354111e-05, + "loss": 0.9318, + "step": 5953 + }, + { + "epoch": 0.5319990171331561, + "grad_norm": 0.4300851821899414, + "learning_rate": 4.7244724779098293e-05, + "loss": 0.9938, + "step": 5954 + }, + { + "epoch": 0.532088368664418, + "grad_norm": 0.5142315626144409, + "learning_rate": 4.7230276165439136e-05, + "loss": 0.9487, + "step": 5955 + }, + { + "epoch": 0.5321777201956799, + "grad_norm": 0.38115739822387695, + "learning_rate": 4.72158277837739e-05, + "loss": 0.9286, + "step": 5956 + }, + { + "epoch": 0.5322670717269418, + "grad_norm": 0.7321306467056274, + "learning_rate": 4.720137963531274e-05, + "loss": 0.9289, + "step": 5957 + }, + { + "epoch": 0.5323564232582035, + "grad_norm": 0.4938461184501648, + "learning_rate": 4.718693172126587e-05, + "loss": 0.973, + "step": 5958 + }, + { + "epoch": 0.5324457747894654, + "grad_norm": 0.5693181157112122, + "learning_rate": 4.7172484042843454e-05, + "loss": 0.9181, + "step": 5959 + }, + { + "epoch": 0.5325351263207273, + "grad_norm": 0.5643350481987, + "learning_rate": 4.7158036601255634e-05, + "loss": 1.017, + "step": 5960 + }, + { + "epoch": 0.5326244778519892, + "grad_norm": 0.5340102910995483, + "learning_rate": 4.714358939771255e-05, + "loss": 0.8804, + "step": 5961 + }, + { + "epoch": 0.5327138293832511, + "grad_norm": 0.5467627048492432, + "learning_rate": 4.712914243342429e-05, + "loss": 0.8569, + "step": 5962 + }, + { + "epoch": 0.532803180914513, + "grad_norm": 0.5655173659324646, + "learning_rate": 4.711469570960096e-05, + "loss": 0.9432, + "step": 5963 + }, + { + "epoch": 0.5328925324457748, + "grad_norm": 0.41005784273147583, + "learning_rate": 4.7100249227452627e-05, + "loss": 1.0204, + "step": 5964 + }, + { + "epoch": 0.5329818839770366, + "grad_norm": 0.4703254997730255, + "learning_rate": 4.7085802988189315e-05, + "loss": 0.9528, + "step": 5965 + }, + { + "epoch": 0.5330712355082985, + "grad_norm": 0.4015927314758301, + "learning_rate": 4.707135699302108e-05, + "loss": 0.955, + "step": 5966 + }, + { + "epoch": 0.5331605870395604, + "grad_norm": 0.4751605689525604, + "learning_rate": 4.705691124315792e-05, + "loss": 0.9394, + "step": 5967 + }, + { + "epoch": 0.5332499385708223, + "grad_norm": 0.4073632061481476, + "learning_rate": 4.70424657398098e-05, + "loss": 0.943, + "step": 5968 + }, + { + "epoch": 0.5333392901020841, + "grad_norm": 0.43543925881385803, + "learning_rate": 4.70280204841867e-05, + "loss": 0.9864, + "step": 5969 + }, + { + "epoch": 0.533428641633346, + "grad_norm": 0.4593610465526581, + "learning_rate": 4.701357547749856e-05, + "loss": 0.9586, + "step": 5970 + }, + { + "epoch": 0.5335179931646079, + "grad_norm": 0.4604717493057251, + "learning_rate": 4.699913072095531e-05, + "loss": 0.9719, + "step": 5971 + }, + { + "epoch": 0.5336073446958697, + "grad_norm": 0.522756814956665, + "learning_rate": 4.698468621576685e-05, + "loss": 0.9178, + "step": 5972 + }, + { + "epoch": 0.5336966962271316, + "grad_norm": 0.43565860390663147, + "learning_rate": 4.697024196314305e-05, + "loss": 0.9208, + "step": 5973 + }, + { + "epoch": 0.5337860477583934, + "grad_norm": 0.45407941937446594, + "learning_rate": 4.695579796429379e-05, + "loss": 0.9655, + "step": 5974 + }, + { + "epoch": 0.5338753992896553, + "grad_norm": 0.524958074092865, + "learning_rate": 4.69413542204289e-05, + "loss": 0.9391, + "step": 5975 + }, + { + "epoch": 0.5339647508209172, + "grad_norm": 0.4323229193687439, + "learning_rate": 4.6926910732758215e-05, + "loss": 0.937, + "step": 5976 + }, + { + "epoch": 0.5340541023521791, + "grad_norm": 0.48769766092300415, + "learning_rate": 4.69124675024915e-05, + "loss": 0.9422, + "step": 5977 + }, + { + "epoch": 0.534143453883441, + "grad_norm": 0.47193393111228943, + "learning_rate": 4.689802453083853e-05, + "loss": 0.9762, + "step": 5978 + }, + { + "epoch": 0.5342328054147027, + "grad_norm": 0.574012041091919, + "learning_rate": 4.688358181900907e-05, + "loss": 0.867, + "step": 5979 + }, + { + "epoch": 0.5343221569459646, + "grad_norm": 0.43543365597724915, + "learning_rate": 4.686913936821287e-05, + "loss": 0.9602, + "step": 5980 + }, + { + "epoch": 0.5344115084772265, + "grad_norm": 0.45189157128334045, + "learning_rate": 4.6854697179659614e-05, + "loss": 0.9971, + "step": 5981 + }, + { + "epoch": 0.5345008600084884, + "grad_norm": 0.4232499599456787, + "learning_rate": 4.684025525455899e-05, + "loss": 1.0099, + "step": 5982 + }, + { + "epoch": 0.5345902115397503, + "grad_norm": 0.42236101627349854, + "learning_rate": 4.682581359412066e-05, + "loss": 0.9674, + "step": 5983 + }, + { + "epoch": 0.5346795630710122, + "grad_norm": 0.4021371603012085, + "learning_rate": 4.681137219955429e-05, + "loss": 0.932, + "step": 5984 + }, + { + "epoch": 0.534768914602274, + "grad_norm": 0.39467480778694153, + "learning_rate": 4.6796931072069484e-05, + "loss": 0.9422, + "step": 5985 + }, + { + "epoch": 0.5348582661335358, + "grad_norm": 0.40237951278686523, + "learning_rate": 4.678249021287583e-05, + "loss": 0.9538, + "step": 5986 + }, + { + "epoch": 0.5349476176647977, + "grad_norm": 0.5124891996383667, + "learning_rate": 4.6768049623182953e-05, + "loss": 0.9768, + "step": 5987 + }, + { + "epoch": 0.5350369691960596, + "grad_norm": 0.4601878225803375, + "learning_rate": 4.675360930420035e-05, + "loss": 0.9826, + "step": 5988 + }, + { + "epoch": 0.5351263207273215, + "grad_norm": 0.4513419568538666, + "learning_rate": 4.673916925713756e-05, + "loss": 0.9567, + "step": 5989 + }, + { + "epoch": 0.5352156722585834, + "grad_norm": 0.38111451268196106, + "learning_rate": 4.672472948320411e-05, + "loss": 1.0469, + "step": 5990 + }, + { + "epoch": 0.5353050237898452, + "grad_norm": 0.5757367610931396, + "learning_rate": 4.671028998360947e-05, + "loss": 0.9822, + "step": 5991 + }, + { + "epoch": 0.5353943753211071, + "grad_norm": 0.5025779008865356, + "learning_rate": 4.669585075956312e-05, + "loss": 0.9309, + "step": 5992 + }, + { + "epoch": 0.5354837268523689, + "grad_norm": 0.4102911651134491, + "learning_rate": 4.668141181227448e-05, + "loss": 0.9255, + "step": 5993 + }, + { + "epoch": 0.5355730783836308, + "grad_norm": 0.4956420958042145, + "learning_rate": 4.666697314295298e-05, + "loss": 0.8852, + "step": 5994 + }, + { + "epoch": 0.5356624299148927, + "grad_norm": 0.4618532061576843, + "learning_rate": 4.665253475280801e-05, + "loss": 0.9496, + "step": 5995 + }, + { + "epoch": 0.5357517814461545, + "grad_norm": 0.5225943922996521, + "learning_rate": 4.663809664304894e-05, + "loss": 0.9285, + "step": 5996 + }, + { + "epoch": 0.5358411329774164, + "grad_norm": 0.5119801759719849, + "learning_rate": 4.662365881488511e-05, + "loss": 0.9436, + "step": 5997 + }, + { + "epoch": 0.5359304845086783, + "grad_norm": 0.5475390553474426, + "learning_rate": 4.6609221269525835e-05, + "loss": 0.9381, + "step": 5998 + }, + { + "epoch": 0.5360198360399402, + "grad_norm": 0.48015183210372925, + "learning_rate": 4.659478400818043e-05, + "loss": 0.9396, + "step": 5999 + }, + { + "epoch": 0.536109187571202, + "grad_norm": 0.5057094693183899, + "learning_rate": 4.658034703205816e-05, + "loss": 0.9935, + "step": 6000 + }, + { + "epoch": 0.5361985391024638, + "grad_norm": 0.48920387029647827, + "learning_rate": 4.6565910342368266e-05, + "loss": 0.9402, + "step": 6001 + }, + { + "epoch": 0.5362878906337257, + "grad_norm": 0.4365025758743286, + "learning_rate": 4.6551473940319995e-05, + "loss": 0.9472, + "step": 6002 + }, + { + "epoch": 0.5363772421649876, + "grad_norm": 0.408589243888855, + "learning_rate": 4.6537037827122536e-05, + "loss": 0.9435, + "step": 6003 + }, + { + "epoch": 0.5364665936962495, + "grad_norm": 0.5231300592422485, + "learning_rate": 4.652260200398507e-05, + "loss": 0.9997, + "step": 6004 + }, + { + "epoch": 0.5365559452275114, + "grad_norm": 0.435147225856781, + "learning_rate": 4.6508166472116754e-05, + "loss": 0.9694, + "step": 6005 + }, + { + "epoch": 0.5366452967587731, + "grad_norm": 0.42764097452163696, + "learning_rate": 4.649373123272672e-05, + "loss": 0.9844, + "step": 6006 + }, + { + "epoch": 0.536734648290035, + "grad_norm": 0.5183307528495789, + "learning_rate": 4.647929628702408e-05, + "loss": 0.9385, + "step": 6007 + }, + { + "epoch": 0.5368239998212969, + "grad_norm": 0.4516974091529846, + "learning_rate": 4.6464861636217895e-05, + "loss": 0.9214, + "step": 6008 + }, + { + "epoch": 0.5369133513525588, + "grad_norm": 0.39056047797203064, + "learning_rate": 4.645042728151722e-05, + "loss": 0.9394, + "step": 6009 + }, + { + "epoch": 0.5370027028838207, + "grad_norm": 0.3742974102497101, + "learning_rate": 4.64359932241311e-05, + "loss": 0.9829, + "step": 6010 + }, + { + "epoch": 0.5370920544150826, + "grad_norm": 0.491948664188385, + "learning_rate": 4.642155946526854e-05, + "loss": 0.8914, + "step": 6011 + }, + { + "epoch": 0.5371814059463444, + "grad_norm": 0.4805702567100525, + "learning_rate": 4.640712600613851e-05, + "loss": 0.9744, + "step": 6012 + }, + { + "epoch": 0.5372707574776062, + "grad_norm": 0.48853474855422974, + "learning_rate": 4.6392692847949984e-05, + "loss": 0.985, + "step": 6013 + }, + { + "epoch": 0.5373601090088681, + "grad_norm": 0.414133220911026, + "learning_rate": 4.6378259991911886e-05, + "loss": 0.9825, + "step": 6014 + }, + { + "epoch": 0.53744946054013, + "grad_norm": 0.3783036470413208, + "learning_rate": 4.6363827439233114e-05, + "loss": 0.9994, + "step": 6015 + }, + { + "epoch": 0.5375388120713919, + "grad_norm": 0.5566924214363098, + "learning_rate": 4.634939519112255e-05, + "loss": 0.9119, + "step": 6016 + }, + { + "epoch": 0.5376281636026538, + "grad_norm": 0.49489137530326843, + "learning_rate": 4.633496324878906e-05, + "loss": 0.952, + "step": 6017 + }, + { + "epoch": 0.5377175151339156, + "grad_norm": 0.45840591192245483, + "learning_rate": 4.632053161344146e-05, + "loss": 0.8933, + "step": 6018 + }, + { + "epoch": 0.5378068666651775, + "grad_norm": 0.5176952481269836, + "learning_rate": 4.6306100286288565e-05, + "loss": 0.9606, + "step": 6019 + }, + { + "epoch": 0.5378962181964393, + "grad_norm": 0.484392374753952, + "learning_rate": 4.629166926853913e-05, + "loss": 0.9356, + "step": 6020 + }, + { + "epoch": 0.5379855697277012, + "grad_norm": 0.5076186656951904, + "learning_rate": 4.6277238561401927e-05, + "loss": 0.9744, + "step": 6021 + }, + { + "epoch": 0.5380749212589631, + "grad_norm": 0.4623713791370392, + "learning_rate": 4.6262808166085674e-05, + "loss": 0.9705, + "step": 6022 + }, + { + "epoch": 0.5381642727902249, + "grad_norm": 0.419096440076828, + "learning_rate": 4.624837808379907e-05, + "loss": 1.0033, + "step": 6023 + }, + { + "epoch": 0.5382536243214868, + "grad_norm": 0.3825317323207855, + "learning_rate": 4.62339483157508e-05, + "loss": 1.0228, + "step": 6024 + }, + { + "epoch": 0.5383429758527487, + "grad_norm": 0.5461272597312927, + "learning_rate": 4.6219518863149493e-05, + "loss": 0.9623, + "step": 6025 + }, + { + "epoch": 0.5384323273840106, + "grad_norm": 0.4370659589767456, + "learning_rate": 4.6205089727203785e-05, + "loss": 0.9881, + "step": 6026 + }, + { + "epoch": 0.5385216789152724, + "grad_norm": 0.48183608055114746, + "learning_rate": 4.619066090912228e-05, + "loss": 0.867, + "step": 6027 + }, + { + "epoch": 0.5386110304465342, + "grad_norm": 0.4971342980861664, + "learning_rate": 4.6176232410113506e-05, + "loss": 0.9406, + "step": 6028 + }, + { + "epoch": 0.5387003819777961, + "grad_norm": 0.44625434279441833, + "learning_rate": 4.616180423138603e-05, + "loss": 0.9701, + "step": 6029 + }, + { + "epoch": 0.538789733509058, + "grad_norm": 0.4783417880535126, + "learning_rate": 4.614737637414836e-05, + "loss": 0.8333, + "step": 6030 + }, + { + "epoch": 0.5388790850403199, + "grad_norm": 0.39774414896965027, + "learning_rate": 4.613294883960898e-05, + "loss": 0.9892, + "step": 6031 + }, + { + "epoch": 0.5389684365715818, + "grad_norm": 0.449542373418808, + "learning_rate": 4.611852162897636e-05, + "loss": 0.9164, + "step": 6032 + }, + { + "epoch": 0.5390577881028437, + "grad_norm": 0.38516610860824585, + "learning_rate": 4.610409474345894e-05, + "loss": 0.9848, + "step": 6033 + }, + { + "epoch": 0.5391471396341054, + "grad_norm": 0.43511533737182617, + "learning_rate": 4.60896681842651e-05, + "loss": 0.9101, + "step": 6034 + }, + { + "epoch": 0.5392364911653673, + "grad_norm": 0.4192296862602234, + "learning_rate": 4.6075241952603225e-05, + "loss": 1.0098, + "step": 6035 + }, + { + "epoch": 0.5393258426966292, + "grad_norm": 0.38819652795791626, + "learning_rate": 4.6060816049681676e-05, + "loss": 0.9525, + "step": 6036 + }, + { + "epoch": 0.5394151942278911, + "grad_norm": 0.4016992449760437, + "learning_rate": 4.6046390476708794e-05, + "loss": 0.993, + "step": 6037 + }, + { + "epoch": 0.539504545759153, + "grad_norm": 0.4347737431526184, + "learning_rate": 4.6031965234892834e-05, + "loss": 0.9847, + "step": 6038 + }, + { + "epoch": 0.5395938972904148, + "grad_norm": 0.4962137043476105, + "learning_rate": 4.601754032544208e-05, + "loss": 0.908, + "step": 6039 + }, + { + "epoch": 0.5396832488216767, + "grad_norm": 0.4483336806297302, + "learning_rate": 4.6003115749564765e-05, + "loss": 0.9726, + "step": 6040 + }, + { + "epoch": 0.5397726003529385, + "grad_norm": 0.5861220359802246, + "learning_rate": 4.598869150846912e-05, + "loss": 0.8876, + "step": 6041 + }, + { + "epoch": 0.5398619518842004, + "grad_norm": 0.4730405807495117, + "learning_rate": 4.597426760336331e-05, + "loss": 0.9645, + "step": 6042 + }, + { + "epoch": 0.5399513034154623, + "grad_norm": 0.48867109417915344, + "learning_rate": 4.59598440354555e-05, + "loss": 0.9377, + "step": 6043 + }, + { + "epoch": 0.5400406549467242, + "grad_norm": 0.4301815927028656, + "learning_rate": 4.5945420805953825e-05, + "loss": 0.9474, + "step": 6044 + }, + { + "epoch": 0.540130006477986, + "grad_norm": 0.4912492334842682, + "learning_rate": 4.593099791606637e-05, + "loss": 0.955, + "step": 6045 + }, + { + "epoch": 0.5402193580092479, + "grad_norm": 0.5850039124488831, + "learning_rate": 4.5916575367001214e-05, + "loss": 0.904, + "step": 6046 + }, + { + "epoch": 0.5403087095405098, + "grad_norm": 0.4344959557056427, + "learning_rate": 4.590215315996642e-05, + "loss": 0.9921, + "step": 6047 + }, + { + "epoch": 0.5403980610717716, + "grad_norm": 0.5166114568710327, + "learning_rate": 4.588773129616996e-05, + "loss": 0.941, + "step": 6048 + }, + { + "epoch": 0.5404874126030335, + "grad_norm": 0.5347388386726379, + "learning_rate": 4.587330977681983e-05, + "loss": 0.893, + "step": 6049 + }, + { + "epoch": 0.5405767641342953, + "grad_norm": 0.398970365524292, + "learning_rate": 4.585888860312399e-05, + "loss": 0.9573, + "step": 6050 + }, + { + "epoch": 0.5406661156655572, + "grad_norm": 0.4769752621650696, + "learning_rate": 4.584446777629038e-05, + "loss": 0.9554, + "step": 6051 + }, + { + "epoch": 0.5407554671968191, + "grad_norm": 0.5892907381057739, + "learning_rate": 4.5830047297526904e-05, + "loss": 1.025, + "step": 6052 + }, + { + "epoch": 0.540844818728081, + "grad_norm": 0.4411056637763977, + "learning_rate": 4.58156271680414e-05, + "loss": 0.9451, + "step": 6053 + }, + { + "epoch": 0.5409341702593429, + "grad_norm": 0.4067056477069855, + "learning_rate": 4.5801207389041715e-05, + "loss": 0.993, + "step": 6054 + }, + { + "epoch": 0.5410235217906046, + "grad_norm": 0.44847869873046875, + "learning_rate": 4.5786787961735673e-05, + "loss": 0.9894, + "step": 6055 + }, + { + "epoch": 0.5411128733218665, + "grad_norm": 0.43127429485321045, + "learning_rate": 4.577236888733105e-05, + "loss": 0.9412, + "step": 6056 + }, + { + "epoch": 0.5412022248531284, + "grad_norm": 0.4345093369483948, + "learning_rate": 4.575795016703561e-05, + "loss": 1.0033, + "step": 6057 + }, + { + "epoch": 0.5412915763843903, + "grad_norm": 0.44015347957611084, + "learning_rate": 4.574353180205705e-05, + "loss": 0.9976, + "step": 6058 + }, + { + "epoch": 0.5413809279156522, + "grad_norm": 0.45786052942276, + "learning_rate": 4.572911379360307e-05, + "loss": 0.9713, + "step": 6059 + }, + { + "epoch": 0.5414702794469141, + "grad_norm": 0.47783318161964417, + "learning_rate": 4.571469614288133e-05, + "loss": 0.9236, + "step": 6060 + }, + { + "epoch": 0.5415596309781759, + "grad_norm": 0.5037944316864014, + "learning_rate": 4.5700278851099464e-05, + "loss": 0.9076, + "step": 6061 + }, + { + "epoch": 0.5416489825094377, + "grad_norm": 0.402452290058136, + "learning_rate": 4.568586191946508e-05, + "loss": 0.9454, + "step": 6062 + }, + { + "epoch": 0.5417383340406996, + "grad_norm": 0.45303410291671753, + "learning_rate": 4.567144534918574e-05, + "loss": 0.9148, + "step": 6063 + }, + { + "epoch": 0.5418276855719615, + "grad_norm": 0.4717887341976166, + "learning_rate": 4.5657029141468996e-05, + "loss": 0.9045, + "step": 6064 + }, + { + "epoch": 0.5419170371032234, + "grad_norm": 0.4819827079772949, + "learning_rate": 4.564261329752236e-05, + "loss": 0.8857, + "step": 6065 + }, + { + "epoch": 0.5420063886344852, + "grad_norm": 0.4282495975494385, + "learning_rate": 4.562819781855331e-05, + "loss": 0.9649, + "step": 6066 + }, + { + "epoch": 0.5420957401657471, + "grad_norm": 0.43561655282974243, + "learning_rate": 4.561378270576929e-05, + "loss": 0.9829, + "step": 6067 + }, + { + "epoch": 0.5421850916970089, + "grad_norm": 0.47019922733306885, + "learning_rate": 4.559936796037772e-05, + "loss": 0.9926, + "step": 6068 + }, + { + "epoch": 0.5422744432282708, + "grad_norm": 0.5216493606567383, + "learning_rate": 4.5584953583585985e-05, + "loss": 0.8186, + "step": 6069 + }, + { + "epoch": 0.5423637947595327, + "grad_norm": 0.5065385699272156, + "learning_rate": 4.5570539576601463e-05, + "loss": 0.9491, + "step": 6070 + }, + { + "epoch": 0.5424531462907946, + "grad_norm": 0.473476380109787, + "learning_rate": 4.5556125940631454e-05, + "loss": 0.9613, + "step": 6071 + }, + { + "epoch": 0.5425424978220564, + "grad_norm": 0.4875657558441162, + "learning_rate": 4.5541712676883263e-05, + "loss": 0.9939, + "step": 6072 + }, + { + "epoch": 0.5426318493533183, + "grad_norm": 0.5284254550933838, + "learning_rate": 4.552729978656416e-05, + "loss": 0.8902, + "step": 6073 + }, + { + "epoch": 0.5427212008845802, + "grad_norm": 0.5050989985466003, + "learning_rate": 4.5512887270881374e-05, + "loss": 0.8749, + "step": 6074 + }, + { + "epoch": 0.542810552415842, + "grad_norm": 0.5607906579971313, + "learning_rate": 4.5498475131042106e-05, + "loss": 0.9388, + "step": 6075 + }, + { + "epoch": 0.5428999039471039, + "grad_norm": 0.5271878838539124, + "learning_rate": 4.548406336825353e-05, + "loss": 1.0645, + "step": 6076 + }, + { + "epoch": 0.5429892554783657, + "grad_norm": 0.41676685214042664, + "learning_rate": 4.546965198372279e-05, + "loss": 0.9279, + "step": 6077 + }, + { + "epoch": 0.5430786070096276, + "grad_norm": 0.47751128673553467, + "learning_rate": 4.5455240978656996e-05, + "loss": 0.9398, + "step": 6078 + }, + { + "epoch": 0.5431679585408895, + "grad_norm": 0.4766964912414551, + "learning_rate": 4.5440830354263205e-05, + "loss": 0.9324, + "step": 6079 + }, + { + "epoch": 0.5432573100721514, + "grad_norm": 0.4669797718524933, + "learning_rate": 4.542642011174846e-05, + "loss": 0.9492, + "step": 6080 + }, + { + "epoch": 0.5433466616034133, + "grad_norm": 0.4376409947872162, + "learning_rate": 4.5412010252319784e-05, + "loss": 1.0007, + "step": 6081 + }, + { + "epoch": 0.543436013134675, + "grad_norm": 0.42658764123916626, + "learning_rate": 4.539760077718416e-05, + "loss": 0.9542, + "step": 6082 + }, + { + "epoch": 0.5435253646659369, + "grad_norm": 0.4167364835739136, + "learning_rate": 4.5383191687548513e-05, + "loss": 1.0401, + "step": 6083 + }, + { + "epoch": 0.5436147161971988, + "grad_norm": 0.4156143367290497, + "learning_rate": 4.53687829846198e-05, + "loss": 1.0165, + "step": 6084 + }, + { + "epoch": 0.5437040677284607, + "grad_norm": 0.5895379781723022, + "learning_rate": 4.535437466960486e-05, + "loss": 0.9548, + "step": 6085 + }, + { + "epoch": 0.5437934192597226, + "grad_norm": 0.44905561208724976, + "learning_rate": 4.533996674371056e-05, + "loss": 0.9401, + "step": 6086 + }, + { + "epoch": 0.5438827707909845, + "grad_norm": 0.43985962867736816, + "learning_rate": 4.5325559208143717e-05, + "loss": 0.9516, + "step": 6087 + }, + { + "epoch": 0.5439721223222463, + "grad_norm": 0.5113573670387268, + "learning_rate": 4.5311152064111134e-05, + "loss": 0.9881, + "step": 6088 + }, + { + "epoch": 0.5440614738535081, + "grad_norm": 0.37436649203300476, + "learning_rate": 4.529674531281954e-05, + "loss": 0.9438, + "step": 6089 + }, + { + "epoch": 0.54415082538477, + "grad_norm": 0.5028565526008606, + "learning_rate": 4.5282338955475644e-05, + "loss": 1.0298, + "step": 6090 + }, + { + "epoch": 0.5442401769160319, + "grad_norm": 0.4592767059803009, + "learning_rate": 4.526793299328616e-05, + "loss": 0.9478, + "step": 6091 + }, + { + "epoch": 0.5443295284472938, + "grad_norm": 0.4168972671031952, + "learning_rate": 4.5253527427457715e-05, + "loss": 0.9624, + "step": 6092 + }, + { + "epoch": 0.5444188799785556, + "grad_norm": 0.4067929685115814, + "learning_rate": 4.523912225919694e-05, + "loss": 0.8968, + "step": 6093 + }, + { + "epoch": 0.5445082315098175, + "grad_norm": 0.5613819360733032, + "learning_rate": 4.522471748971043e-05, + "loss": 0.8793, + "step": 6094 + }, + { + "epoch": 0.5445975830410794, + "grad_norm": 0.4839697778224945, + "learning_rate": 4.5210313120204735e-05, + "loss": 0.9269, + "step": 6095 + }, + { + "epoch": 0.5446869345723412, + "grad_norm": 0.42698168754577637, + "learning_rate": 4.519590915188637e-05, + "loss": 0.9873, + "step": 6096 + }, + { + "epoch": 0.5447762861036031, + "grad_norm": 0.5424548983573914, + "learning_rate": 4.5181505585961816e-05, + "loss": 0.9847, + "step": 6097 + }, + { + "epoch": 0.544865637634865, + "grad_norm": 0.4275175929069519, + "learning_rate": 4.5167102423637554e-05, + "loss": 1.0299, + "step": 6098 + }, + { + "epoch": 0.5449549891661268, + "grad_norm": 0.4584430158138275, + "learning_rate": 4.515269966611996e-05, + "loss": 0.9439, + "step": 6099 + }, + { + "epoch": 0.5450443406973887, + "grad_norm": 0.4883447289466858, + "learning_rate": 4.513829731461543e-05, + "loss": 0.9921, + "step": 6100 + }, + { + "epoch": 0.5451336922286506, + "grad_norm": 0.44181644916534424, + "learning_rate": 4.512389537033033e-05, + "loss": 0.9704, + "step": 6101 + }, + { + "epoch": 0.5452230437599125, + "grad_norm": 0.47616517543792725, + "learning_rate": 4.510949383447096e-05, + "loss": 0.9071, + "step": 6102 + }, + { + "epoch": 0.5453123952911743, + "grad_norm": 0.6123970746994019, + "learning_rate": 4.5095092708243623e-05, + "loss": 0.8817, + "step": 6103 + }, + { + "epoch": 0.5454017468224361, + "grad_norm": 0.4854317009449005, + "learning_rate": 4.5080691992854554e-05, + "loss": 0.8724, + "step": 6104 + }, + { + "epoch": 0.545491098353698, + "grad_norm": 0.47287046909332275, + "learning_rate": 4.5066291689509953e-05, + "loss": 0.9679, + "step": 6105 + }, + { + "epoch": 0.5455804498849599, + "grad_norm": 0.4474678933620453, + "learning_rate": 4.5051891799416025e-05, + "loss": 0.943, + "step": 6106 + }, + { + "epoch": 0.5456698014162218, + "grad_norm": 0.46921437978744507, + "learning_rate": 4.503749232377889e-05, + "loss": 0.9584, + "step": 6107 + }, + { + "epoch": 0.5457591529474837, + "grad_norm": 0.5251668691635132, + "learning_rate": 4.50230932638047e-05, + "loss": 0.9205, + "step": 6108 + }, + { + "epoch": 0.5458485044787456, + "grad_norm": 0.41507506370544434, + "learning_rate": 4.5008694620699474e-05, + "loss": 0.9704, + "step": 6109 + }, + { + "epoch": 0.5459378560100073, + "grad_norm": 0.43505141139030457, + "learning_rate": 4.4994296395669276e-05, + "loss": 0.926, + "step": 6110 + }, + { + "epoch": 0.5460272075412692, + "grad_norm": 0.5086851716041565, + "learning_rate": 4.497989858992011e-05, + "loss": 0.905, + "step": 6111 + }, + { + "epoch": 0.5461165590725311, + "grad_norm": 0.4543953537940979, + "learning_rate": 4.496550120465795e-05, + "loss": 0.9689, + "step": 6112 + }, + { + "epoch": 0.546205910603793, + "grad_norm": 0.4054141938686371, + "learning_rate": 4.495110424108873e-05, + "loss": 1.0004, + "step": 6113 + }, + { + "epoch": 0.5462952621350549, + "grad_norm": 0.39775604009628296, + "learning_rate": 4.4936707700418346e-05, + "loss": 1.021, + "step": 6114 + }, + { + "epoch": 0.5463846136663167, + "grad_norm": 0.5066169500350952, + "learning_rate": 4.492231158385266e-05, + "loss": 0.9796, + "step": 6115 + }, + { + "epoch": 0.5464739651975786, + "grad_norm": 0.43312814831733704, + "learning_rate": 4.4907915892597504e-05, + "loss": 0.9808, + "step": 6116 + }, + { + "epoch": 0.5465633167288404, + "grad_norm": 0.4440118372440338, + "learning_rate": 4.489352062785869e-05, + "loss": 0.9781, + "step": 6117 + }, + { + "epoch": 0.5466526682601023, + "grad_norm": 0.45043429732322693, + "learning_rate": 4.4879125790841944e-05, + "loss": 1.0155, + "step": 6118 + }, + { + "epoch": 0.5467420197913642, + "grad_norm": 0.44489559531211853, + "learning_rate": 4.486473138275299e-05, + "loss": 0.9426, + "step": 6119 + }, + { + "epoch": 0.546831371322626, + "grad_norm": 0.45286840200424194, + "learning_rate": 4.485033740479752e-05, + "loss": 0.982, + "step": 6120 + }, + { + "epoch": 0.5469207228538879, + "grad_norm": 0.4958401322364807, + "learning_rate": 4.483594385818118e-05, + "loss": 0.9419, + "step": 6121 + }, + { + "epoch": 0.5470100743851498, + "grad_norm": 0.4510517120361328, + "learning_rate": 4.482155074410961e-05, + "loss": 0.9684, + "step": 6122 + }, + { + "epoch": 0.5470994259164117, + "grad_norm": 0.3880564272403717, + "learning_rate": 4.480715806378834e-05, + "loss": 1.0172, + "step": 6123 + }, + { + "epoch": 0.5471887774476735, + "grad_norm": 0.5004435181617737, + "learning_rate": 4.479276581842294e-05, + "loss": 1.0378, + "step": 6124 + }, + { + "epoch": 0.5472781289789354, + "grad_norm": 0.43363669514656067, + "learning_rate": 4.4778374009218904e-05, + "loss": 1.0019, + "step": 6125 + }, + { + "epoch": 0.5473674805101972, + "grad_norm": 0.4868524670600891, + "learning_rate": 4.4763982637381706e-05, + "loss": 0.9947, + "step": 6126 + }, + { + "epoch": 0.5474568320414591, + "grad_norm": 0.43527916073799133, + "learning_rate": 4.474959170411677e-05, + "loss": 1.0288, + "step": 6127 + }, + { + "epoch": 0.547546183572721, + "grad_norm": 0.5454886555671692, + "learning_rate": 4.473520121062952e-05, + "loss": 0.9882, + "step": 6128 + }, + { + "epoch": 0.5476355351039829, + "grad_norm": 0.4397745132446289, + "learning_rate": 4.4720811158125267e-05, + "loss": 1.0003, + "step": 6129 + }, + { + "epoch": 0.5477248866352447, + "grad_norm": 0.5059059262275696, + "learning_rate": 4.470642154780935e-05, + "loss": 0.9265, + "step": 6130 + }, + { + "epoch": 0.5478142381665065, + "grad_norm": 0.5163365006446838, + "learning_rate": 4.469203238088705e-05, + "loss": 0.9582, + "step": 6131 + }, + { + "epoch": 0.5479035896977684, + "grad_norm": 0.48326799273490906, + "learning_rate": 4.467764365856362e-05, + "loss": 0.9396, + "step": 6132 + }, + { + "epoch": 0.5479929412290303, + "grad_norm": 0.46925976872444153, + "learning_rate": 4.466325538204427e-05, + "loss": 1.0036, + "step": 6133 + }, + { + "epoch": 0.5480822927602922, + "grad_norm": 0.4270949363708496, + "learning_rate": 4.464886755253416e-05, + "loss": 1.0472, + "step": 6134 + }, + { + "epoch": 0.5481716442915541, + "grad_norm": 0.49136438965797424, + "learning_rate": 4.463448017123844e-05, + "loss": 0.8925, + "step": 6135 + }, + { + "epoch": 0.548260995822816, + "grad_norm": 0.41291165351867676, + "learning_rate": 4.4620093239362204e-05, + "loss": 0.9923, + "step": 6136 + }, + { + "epoch": 0.5483503473540777, + "grad_norm": 0.40838754177093506, + "learning_rate": 4.460570675811049e-05, + "loss": 0.9903, + "step": 6137 + }, + { + "epoch": 0.5484396988853396, + "grad_norm": 0.5193032622337341, + "learning_rate": 4.459132072868835e-05, + "loss": 0.9049, + "step": 6138 + }, + { + "epoch": 0.5485290504166015, + "grad_norm": 0.4100017547607422, + "learning_rate": 4.457693515230074e-05, + "loss": 1.0034, + "step": 6139 + }, + { + "epoch": 0.5486184019478634, + "grad_norm": 0.5860205888748169, + "learning_rate": 4.456255003015263e-05, + "loss": 0.9951, + "step": 6140 + }, + { + "epoch": 0.5487077534791253, + "grad_norm": 0.49297282099723816, + "learning_rate": 4.4548165363448894e-05, + "loss": 0.9113, + "step": 6141 + }, + { + "epoch": 0.5487971050103871, + "grad_norm": 0.41039666533470154, + "learning_rate": 4.4533781153394426e-05, + "loss": 0.9713, + "step": 6142 + }, + { + "epoch": 0.548886456541649, + "grad_norm": 0.40572866797447205, + "learning_rate": 4.4519397401194056e-05, + "loss": 0.9805, + "step": 6143 + }, + { + "epoch": 0.5489758080729108, + "grad_norm": 0.43796059489250183, + "learning_rate": 4.4505014108052564e-05, + "loss": 1.0003, + "step": 6144 + }, + { + "epoch": 0.5490651596041727, + "grad_norm": 0.4492766857147217, + "learning_rate": 4.449063127517472e-05, + "loss": 0.9586, + "step": 6145 + }, + { + "epoch": 0.5491545111354346, + "grad_norm": 0.3965035676956177, + "learning_rate": 4.447624890376523e-05, + "loss": 0.9954, + "step": 6146 + }, + { + "epoch": 0.5492438626666964, + "grad_norm": 0.45780253410339355, + "learning_rate": 4.4461866995028776e-05, + "loss": 0.9275, + "step": 6147 + }, + { + "epoch": 0.5493332141979583, + "grad_norm": 0.4715741276741028, + "learning_rate": 4.4447485550170013e-05, + "loss": 0.9706, + "step": 6148 + }, + { + "epoch": 0.5494225657292202, + "grad_norm": 0.43292608857154846, + "learning_rate": 4.44331045703935e-05, + "loss": 0.9596, + "step": 6149 + }, + { + "epoch": 0.5495119172604821, + "grad_norm": 0.4346010982990265, + "learning_rate": 4.4418724056903824e-05, + "loss": 1.0092, + "step": 6150 + }, + { + "epoch": 0.5496012687917439, + "grad_norm": 0.5378443002700806, + "learning_rate": 4.440434401090549e-05, + "loss": 0.9605, + "step": 6151 + }, + { + "epoch": 0.5496906203230058, + "grad_norm": 0.5118531584739685, + "learning_rate": 4.438996443360299e-05, + "loss": 0.8896, + "step": 6152 + }, + { + "epoch": 0.5497799718542676, + "grad_norm": 0.4279499351978302, + "learning_rate": 4.437558532620077e-05, + "loss": 0.8621, + "step": 6153 + }, + { + "epoch": 0.5498693233855295, + "grad_norm": 0.4221118092536926, + "learning_rate": 4.436120668990324e-05, + "loss": 0.9983, + "step": 6154 + }, + { + "epoch": 0.5499586749167914, + "grad_norm": 0.42579370737075806, + "learning_rate": 4.434682852591476e-05, + "loss": 0.9634, + "step": 6155 + }, + { + "epoch": 0.5500480264480533, + "grad_norm": 0.47894287109375, + "learning_rate": 4.4332450835439634e-05, + "loss": 0.9609, + "step": 6156 + }, + { + "epoch": 0.5501373779793152, + "grad_norm": 0.5039592385292053, + "learning_rate": 4.431807361968217e-05, + "loss": 0.9188, + "step": 6157 + }, + { + "epoch": 0.5502267295105769, + "grad_norm": 0.4221189618110657, + "learning_rate": 4.4303696879846593e-05, + "loss": 0.9687, + "step": 6158 + }, + { + "epoch": 0.5503160810418388, + "grad_norm": 0.454010009765625, + "learning_rate": 4.428932061713715e-05, + "loss": 0.9647, + "step": 6159 + }, + { + "epoch": 0.5504054325731007, + "grad_norm": 0.409974604845047, + "learning_rate": 4.427494483275796e-05, + "loss": 0.9301, + "step": 6160 + }, + { + "epoch": 0.5504947841043626, + "grad_norm": 0.4791439473628998, + "learning_rate": 4.426056952791316e-05, + "loss": 0.9908, + "step": 6161 + }, + { + "epoch": 0.5505841356356245, + "grad_norm": 0.4021007716655731, + "learning_rate": 4.424619470380684e-05, + "loss": 1.0427, + "step": 6162 + }, + { + "epoch": 0.5506734871668864, + "grad_norm": 0.48248547315597534, + "learning_rate": 4.423182036164304e-05, + "loss": 0.9028, + "step": 6163 + }, + { + "epoch": 0.5507628386981482, + "grad_norm": 0.4851128160953522, + "learning_rate": 4.4217446502625773e-05, + "loss": 0.9881, + "step": 6164 + }, + { + "epoch": 0.55085219022941, + "grad_norm": 0.42037060856819153, + "learning_rate": 4.420307312795901e-05, + "loss": 0.9585, + "step": 6165 + }, + { + "epoch": 0.5509415417606719, + "grad_norm": 0.47032657265663147, + "learning_rate": 4.418870023884665e-05, + "loss": 0.9322, + "step": 6166 + }, + { + "epoch": 0.5510308932919338, + "grad_norm": 0.6283010244369507, + "learning_rate": 4.4174327836492587e-05, + "loss": 0.9798, + "step": 6167 + }, + { + "epoch": 0.5511202448231957, + "grad_norm": 0.45526832342147827, + "learning_rate": 4.4159955922100674e-05, + "loss": 0.9473, + "step": 6168 + }, + { + "epoch": 0.5512095963544575, + "grad_norm": 0.4486526548862457, + "learning_rate": 4.414558449687471e-05, + "loss": 0.9134, + "step": 6169 + }, + { + "epoch": 0.5512989478857194, + "grad_norm": 0.38991108536720276, + "learning_rate": 4.413121356201844e-05, + "loss": 0.9947, + "step": 6170 + }, + { + "epoch": 0.5513882994169813, + "grad_norm": 0.4334878623485565, + "learning_rate": 4.411684311873559e-05, + "loss": 0.9613, + "step": 6171 + }, + { + "epoch": 0.5514776509482431, + "grad_norm": 0.49451541900634766, + "learning_rate": 4.4102473168229837e-05, + "loss": 0.9073, + "step": 6172 + }, + { + "epoch": 0.551567002479505, + "grad_norm": 0.41775211691856384, + "learning_rate": 4.408810371170483e-05, + "loss": 0.9729, + "step": 6173 + }, + { + "epoch": 0.5516563540107668, + "grad_norm": 0.4632229506969452, + "learning_rate": 4.4073734750364144e-05, + "loss": 0.9096, + "step": 6174 + }, + { + "epoch": 0.5517457055420287, + "grad_norm": 0.5613375306129456, + "learning_rate": 4.4059366285411344e-05, + "loss": 0.8467, + "step": 6175 + }, + { + "epoch": 0.5518350570732906, + "grad_norm": 0.4471941590309143, + "learning_rate": 4.404499831804993e-05, + "loss": 0.9103, + "step": 6176 + }, + { + "epoch": 0.5519244086045525, + "grad_norm": 0.4910937547683716, + "learning_rate": 4.403063084948339e-05, + "loss": 1.0049, + "step": 6177 + }, + { + "epoch": 0.5520137601358144, + "grad_norm": 0.551129162311554, + "learning_rate": 4.4016263880915146e-05, + "loss": 0.9435, + "step": 6178 + }, + { + "epoch": 0.5521031116670762, + "grad_norm": 0.44475892186164856, + "learning_rate": 4.4001897413548605e-05, + "loss": 0.9798, + "step": 6179 + }, + { + "epoch": 0.552192463198338, + "grad_norm": 0.549535870552063, + "learning_rate": 4.398753144858707e-05, + "loss": 0.878, + "step": 6180 + }, + { + "epoch": 0.5522818147295999, + "grad_norm": 0.4066050052642822, + "learning_rate": 4.397316598723385e-05, + "loss": 1.008, + "step": 6181 + }, + { + "epoch": 0.5523711662608618, + "grad_norm": 0.4381326138973236, + "learning_rate": 4.3958801030692245e-05, + "loss": 0.9418, + "step": 6182 + }, + { + "epoch": 0.5524605177921237, + "grad_norm": 0.4462524354457855, + "learning_rate": 4.394443658016543e-05, + "loss": 0.932, + "step": 6183 + }, + { + "epoch": 0.5525498693233856, + "grad_norm": 0.4263027608394623, + "learning_rate": 4.393007263685661e-05, + "loss": 0.968, + "step": 6184 + }, + { + "epoch": 0.5526392208546475, + "grad_norm": 0.3910346031188965, + "learning_rate": 4.3915709201968896e-05, + "loss": 1.014, + "step": 6185 + }, + { + "epoch": 0.5527285723859092, + "grad_norm": 0.45934584736824036, + "learning_rate": 4.39013462767054e-05, + "loss": 0.9602, + "step": 6186 + }, + { + "epoch": 0.5528179239171711, + "grad_norm": 0.48381033539772034, + "learning_rate": 4.388698386226917e-05, + "loss": 0.9204, + "step": 6187 + }, + { + "epoch": 0.552907275448433, + "grad_norm": 0.4025076627731323, + "learning_rate": 4.3872621959863185e-05, + "loss": 0.9068, + "step": 6188 + }, + { + "epoch": 0.5529966269796949, + "grad_norm": 0.5276347398757935, + "learning_rate": 4.385826057069044e-05, + "loss": 0.9264, + "step": 6189 + }, + { + "epoch": 0.5530859785109568, + "grad_norm": 0.42190733551979065, + "learning_rate": 4.3843899695953826e-05, + "loss": 0.941, + "step": 6190 + }, + { + "epoch": 0.5531753300422186, + "grad_norm": 0.45186877250671387, + "learning_rate": 4.382953933685623e-05, + "loss": 0.9799, + "step": 6191 + }, + { + "epoch": 0.5532646815734804, + "grad_norm": 0.4708520472049713, + "learning_rate": 4.38151794946005e-05, + "loss": 0.9633, + "step": 6192 + }, + { + "epoch": 0.5533540331047423, + "grad_norm": 0.44095146656036377, + "learning_rate": 4.38008201703894e-05, + "loss": 0.9377, + "step": 6193 + }, + { + "epoch": 0.5534433846360042, + "grad_norm": 0.4277689754962921, + "learning_rate": 4.378646136542569e-05, + "loss": 0.9737, + "step": 6194 + }, + { + "epoch": 0.5535327361672661, + "grad_norm": 0.4015730321407318, + "learning_rate": 4.377210308091207e-05, + "loss": 0.9792, + "step": 6195 + }, + { + "epoch": 0.5536220876985279, + "grad_norm": 0.4145139753818512, + "learning_rate": 4.375774531805121e-05, + "loss": 0.9336, + "step": 6196 + }, + { + "epoch": 0.5537114392297898, + "grad_norm": 0.42907387018203735, + "learning_rate": 4.374338807804571e-05, + "loss": 0.9789, + "step": 6197 + }, + { + "epoch": 0.5538007907610517, + "grad_norm": 0.4848146140575409, + "learning_rate": 4.372903136209815e-05, + "loss": 0.9244, + "step": 6198 + }, + { + "epoch": 0.5538901422923135, + "grad_norm": 0.4261986017227173, + "learning_rate": 4.371467517141108e-05, + "loss": 0.9835, + "step": 6199 + }, + { + "epoch": 0.5539794938235754, + "grad_norm": 0.430756539106369, + "learning_rate": 4.3700319507186935e-05, + "loss": 1.0051, + "step": 6200 + }, + { + "epoch": 0.5540688453548372, + "grad_norm": 0.3806210160255432, + "learning_rate": 4.3685964370628193e-05, + "loss": 0.9518, + "step": 6201 + }, + { + "epoch": 0.5541581968860991, + "grad_norm": 0.44627901911735535, + "learning_rate": 4.367160976293723e-05, + "loss": 0.9854, + "step": 6202 + }, + { + "epoch": 0.554247548417361, + "grad_norm": 0.492567241191864, + "learning_rate": 4.3657255685316404e-05, + "loss": 0.9096, + "step": 6203 + }, + { + "epoch": 0.5543368999486229, + "grad_norm": 0.3693414330482483, + "learning_rate": 4.364290213896802e-05, + "loss": 0.9909, + "step": 6204 + }, + { + "epoch": 0.5544262514798848, + "grad_norm": 0.4952232539653778, + "learning_rate": 4.362854912509435e-05, + "loss": 0.9532, + "step": 6205 + }, + { + "epoch": 0.5545156030111466, + "grad_norm": 0.44463467597961426, + "learning_rate": 4.361419664489762e-05, + "loss": 0.9138, + "step": 6206 + }, + { + "epoch": 0.5546049545424084, + "grad_norm": 0.4538500905036926, + "learning_rate": 4.3599844699579964e-05, + "loss": 0.9869, + "step": 6207 + }, + { + "epoch": 0.5546943060736703, + "grad_norm": 0.4345034658908844, + "learning_rate": 4.358549329034355e-05, + "loss": 0.8805, + "step": 6208 + }, + { + "epoch": 0.5547836576049322, + "grad_norm": 0.48444312810897827, + "learning_rate": 4.357114241839045e-05, + "loss": 0.9715, + "step": 6209 + }, + { + "epoch": 0.5548730091361941, + "grad_norm": 0.46041545271873474, + "learning_rate": 4.35567920849227e-05, + "loss": 0.9387, + "step": 6210 + }, + { + "epoch": 0.554962360667456, + "grad_norm": 0.49429547786712646, + "learning_rate": 4.354244229114228e-05, + "loss": 0.9641, + "step": 6211 + }, + { + "epoch": 0.5550517121987179, + "grad_norm": 0.41394224762916565, + "learning_rate": 4.352809303825115e-05, + "loss": 0.9933, + "step": 6212 + }, + { + "epoch": 0.5551410637299796, + "grad_norm": 0.4146006405353546, + "learning_rate": 4.351374432745122e-05, + "loss": 0.8814, + "step": 6213 + }, + { + "epoch": 0.5552304152612415, + "grad_norm": 0.5136590600013733, + "learning_rate": 4.349939615994433e-05, + "loss": 0.9119, + "step": 6214 + }, + { + "epoch": 0.5553197667925034, + "grad_norm": 0.5504260659217834, + "learning_rate": 4.3485048536932314e-05, + "loss": 0.9742, + "step": 6215 + }, + { + "epoch": 0.5554091183237653, + "grad_norm": 0.5039506554603577, + "learning_rate": 4.347070145961692e-05, + "loss": 0.9296, + "step": 6216 + }, + { + "epoch": 0.5554984698550272, + "grad_norm": 0.46146827936172485, + "learning_rate": 4.345635492919988e-05, + "loss": 0.9909, + "step": 6217 + }, + { + "epoch": 0.555587821386289, + "grad_norm": 0.4409083127975464, + "learning_rate": 4.344200894688287e-05, + "loss": 0.9877, + "step": 6218 + }, + { + "epoch": 0.5556771729175509, + "grad_norm": 0.49056732654571533, + "learning_rate": 4.342766351386753e-05, + "loss": 0.9094, + "step": 6219 + }, + { + "epoch": 0.5557665244488127, + "grad_norm": 0.4934116303920746, + "learning_rate": 4.3413318631355403e-05, + "loss": 0.9019, + "step": 6220 + }, + { + "epoch": 0.5558558759800746, + "grad_norm": 0.4715670943260193, + "learning_rate": 4.339897430054806e-05, + "loss": 0.9102, + "step": 6221 + }, + { + "epoch": 0.5559452275113365, + "grad_norm": 0.41046231985092163, + "learning_rate": 4.338463052264697e-05, + "loss": 0.9674, + "step": 6222 + }, + { + "epoch": 0.5560345790425983, + "grad_norm": 0.39487752318382263, + "learning_rate": 4.3370287298853585e-05, + "loss": 1.0071, + "step": 6223 + }, + { + "epoch": 0.5561239305738602, + "grad_norm": 0.510352611541748, + "learning_rate": 4.3355944630369315e-05, + "loss": 0.9072, + "step": 6224 + }, + { + "epoch": 0.5562132821051221, + "grad_norm": 0.409078985452652, + "learning_rate": 4.334160251839551e-05, + "loss": 0.9709, + "step": 6225 + }, + { + "epoch": 0.556302633636384, + "grad_norm": 0.4232335388660431, + "learning_rate": 4.332726096413346e-05, + "loss": 0.9738, + "step": 6226 + }, + { + "epoch": 0.5563919851676458, + "grad_norm": 0.44725194573402405, + "learning_rate": 4.331291996878443e-05, + "loss": 0.8612, + "step": 6227 + }, + { + "epoch": 0.5564813366989076, + "grad_norm": 0.44963935017585754, + "learning_rate": 4.329857953354963e-05, + "loss": 0.9369, + "step": 6228 + }, + { + "epoch": 0.5565706882301695, + "grad_norm": 0.48645251989364624, + "learning_rate": 4.328423965963025e-05, + "loss": 0.9099, + "step": 6229 + }, + { + "epoch": 0.5566600397614314, + "grad_norm": 0.5652446746826172, + "learning_rate": 4.326990034822736e-05, + "loss": 0.8716, + "step": 6230 + }, + { + "epoch": 0.5567493912926933, + "grad_norm": 0.3996671438217163, + "learning_rate": 4.325556160054205e-05, + "loss": 0.9745, + "step": 6231 + }, + { + "epoch": 0.5568387428239552, + "grad_norm": 0.5085249543190002, + "learning_rate": 4.324122341777535e-05, + "loss": 1.0347, + "step": 6232 + }, + { + "epoch": 0.5569280943552171, + "grad_norm": 0.3927776515483856, + "learning_rate": 4.322688580112824e-05, + "loss": 1.0099, + "step": 6233 + }, + { + "epoch": 0.5570174458864788, + "grad_norm": 0.5127935409545898, + "learning_rate": 4.321254875180163e-05, + "loss": 1.0354, + "step": 6234 + }, + { + "epoch": 0.5571067974177407, + "grad_norm": 0.4555366039276123, + "learning_rate": 4.319821227099641e-05, + "loss": 0.9839, + "step": 6235 + }, + { + "epoch": 0.5571961489490026, + "grad_norm": 0.4541626274585724, + "learning_rate": 4.318387635991342e-05, + "loss": 0.9915, + "step": 6236 + }, + { + "epoch": 0.5572855004802645, + "grad_norm": 0.4412585198879242, + "learning_rate": 4.316954101975343e-05, + "loss": 1.0239, + "step": 6237 + }, + { + "epoch": 0.5573748520115264, + "grad_norm": 0.42731690406799316, + "learning_rate": 4.31552062517172e-05, + "loss": 0.9534, + "step": 6238 + }, + { + "epoch": 0.5574642035427883, + "grad_norm": 0.4002136290073395, + "learning_rate": 4.314087205700542e-05, + "loss": 0.9607, + "step": 6239 + }, + { + "epoch": 0.5575535550740501, + "grad_norm": 0.4313288629055023, + "learning_rate": 4.3126538436818704e-05, + "loss": 0.926, + "step": 6240 + }, + { + "epoch": 0.5576429066053119, + "grad_norm": 0.4170110821723938, + "learning_rate": 4.311220539235765e-05, + "loss": 0.9685, + "step": 6241 + }, + { + "epoch": 0.5577322581365738, + "grad_norm": 0.4707728624343872, + "learning_rate": 4.3097872924822816e-05, + "loss": 0.9372, + "step": 6242 + }, + { + "epoch": 0.5578216096678357, + "grad_norm": 0.5320422649383545, + "learning_rate": 4.308354103541471e-05, + "loss": 0.99, + "step": 6243 + }, + { + "epoch": 0.5579109611990976, + "grad_norm": 0.44399821758270264, + "learning_rate": 4.3069209725333756e-05, + "loss": 0.9685, + "step": 6244 + }, + { + "epoch": 0.5580003127303594, + "grad_norm": 0.5755261182785034, + "learning_rate": 4.305487899578036e-05, + "loss": 0.9169, + "step": 6245 + }, + { + "epoch": 0.5580896642616213, + "grad_norm": 0.4340269863605499, + "learning_rate": 4.3040548847954885e-05, + "loss": 0.9954, + "step": 6246 + }, + { + "epoch": 0.5581790157928832, + "grad_norm": 0.4525034725666046, + "learning_rate": 4.3026219283057625e-05, + "loss": 0.9127, + "step": 6247 + }, + { + "epoch": 0.558268367324145, + "grad_norm": 0.45277920365333557, + "learning_rate": 4.301189030228883e-05, + "loss": 0.9261, + "step": 6248 + }, + { + "epoch": 0.5583577188554069, + "grad_norm": 0.5205567479133606, + "learning_rate": 4.299756190684871e-05, + "loss": 0.9047, + "step": 6249 + }, + { + "epoch": 0.5584470703866687, + "grad_norm": 0.5122022032737732, + "learning_rate": 4.2983234097937444e-05, + "loss": 0.9429, + "step": 6250 + }, + { + "epoch": 0.5585364219179306, + "grad_norm": 0.5380150675773621, + "learning_rate": 4.29689068767551e-05, + "loss": 0.974, + "step": 6251 + }, + { + "epoch": 0.5586257734491925, + "grad_norm": 0.45227405428886414, + "learning_rate": 4.295458024450174e-05, + "loss": 0.9372, + "step": 6252 + }, + { + "epoch": 0.5587151249804544, + "grad_norm": 0.47980716824531555, + "learning_rate": 4.2940254202377395e-05, + "loss": 0.9399, + "step": 6253 + }, + { + "epoch": 0.5588044765117163, + "grad_norm": 0.5136576890945435, + "learning_rate": 4.2925928751582e-05, + "loss": 0.8619, + "step": 6254 + }, + { + "epoch": 0.558893828042978, + "grad_norm": 0.4292888045310974, + "learning_rate": 4.291160389331549e-05, + "loss": 0.9736, + "step": 6255 + }, + { + "epoch": 0.5589831795742399, + "grad_norm": 0.4159403443336487, + "learning_rate": 4.28972796287777e-05, + "loss": 0.9077, + "step": 6256 + }, + { + "epoch": 0.5590725311055018, + "grad_norm": 0.43436843156814575, + "learning_rate": 4.2882955959168454e-05, + "loss": 0.965, + "step": 6257 + }, + { + "epoch": 0.5591618826367637, + "grad_norm": 0.4158000349998474, + "learning_rate": 4.286863288568752e-05, + "loss": 1.0099, + "step": 6258 + }, + { + "epoch": 0.5592512341680256, + "grad_norm": 0.4041937589645386, + "learning_rate": 4.2854310409534583e-05, + "loss": 0.9982, + "step": 6259 + }, + { + "epoch": 0.5593405856992875, + "grad_norm": 0.5290785431861877, + "learning_rate": 4.283998853190933e-05, + "loss": 0.9658, + "step": 6260 + }, + { + "epoch": 0.5594299372305492, + "grad_norm": 0.4550890326499939, + "learning_rate": 4.2825667254011346e-05, + "loss": 0.9857, + "step": 6261 + }, + { + "epoch": 0.5595192887618111, + "grad_norm": 0.5154798030853271, + "learning_rate": 4.281134657704022e-05, + "loss": 0.8585, + "step": 6262 + }, + { + "epoch": 0.559608640293073, + "grad_norm": 0.46254241466522217, + "learning_rate": 4.279702650219543e-05, + "loss": 0.9572, + "step": 6263 + }, + { + "epoch": 0.5596979918243349, + "grad_norm": 0.41614094376564026, + "learning_rate": 4.278270703067644e-05, + "loss": 0.9475, + "step": 6264 + }, + { + "epoch": 0.5597873433555968, + "grad_norm": 0.39605072140693665, + "learning_rate": 4.276838816368267e-05, + "loss": 0.984, + "step": 6265 + }, + { + "epoch": 0.5598766948868587, + "grad_norm": 0.4425526261329651, + "learning_rate": 4.275406990241348e-05, + "loss": 1.0881, + "step": 6266 + }, + { + "epoch": 0.5599660464181205, + "grad_norm": 0.43981900811195374, + "learning_rate": 4.273975224806816e-05, + "loss": 0.9565, + "step": 6267 + }, + { + "epoch": 0.5600553979493823, + "grad_norm": 0.48525822162628174, + "learning_rate": 4.272543520184599e-05, + "loss": 0.8835, + "step": 6268 + }, + { + "epoch": 0.5601447494806442, + "grad_norm": 0.43978825211524963, + "learning_rate": 4.271111876494616e-05, + "loss": 0.8794, + "step": 6269 + }, + { + "epoch": 0.5602341010119061, + "grad_norm": 0.4795457124710083, + "learning_rate": 4.2696802938567854e-05, + "loss": 0.9744, + "step": 6270 + }, + { + "epoch": 0.560323452543168, + "grad_norm": 0.46985960006713867, + "learning_rate": 4.2682487723910116e-05, + "loss": 0.9602, + "step": 6271 + }, + { + "epoch": 0.5604128040744298, + "grad_norm": 0.4838345944881439, + "learning_rate": 4.266817312217204e-05, + "loss": 1.0128, + "step": 6272 + }, + { + "epoch": 0.5605021556056917, + "grad_norm": 0.4208124279975891, + "learning_rate": 4.2653859134552616e-05, + "loss": 0.9531, + "step": 6273 + }, + { + "epoch": 0.5605915071369536, + "grad_norm": 0.36559927463531494, + "learning_rate": 4.263954576225079e-05, + "loss": 0.9299, + "step": 6274 + }, + { + "epoch": 0.5606808586682154, + "grad_norm": 0.6015887260437012, + "learning_rate": 4.262523300646546e-05, + "loss": 0.8584, + "step": 6275 + }, + { + "epoch": 0.5607702101994773, + "grad_norm": 0.6308769583702087, + "learning_rate": 4.261092086839549e-05, + "loss": 0.8135, + "step": 6276 + }, + { + "epoch": 0.5608595617307391, + "grad_norm": 0.46543291211128235, + "learning_rate": 4.259660934923965e-05, + "loss": 0.9407, + "step": 6277 + }, + { + "epoch": 0.560948913262001, + "grad_norm": 0.4775448739528656, + "learning_rate": 4.258229845019669e-05, + "loss": 0.9693, + "step": 6278 + }, + { + "epoch": 0.5610382647932629, + "grad_norm": 0.5649442672729492, + "learning_rate": 4.2567988172465304e-05, + "loss": 1.0061, + "step": 6279 + }, + { + "epoch": 0.5611276163245248, + "grad_norm": 0.5074711441993713, + "learning_rate": 4.2553678517244144e-05, + "loss": 0.9523, + "step": 6280 + }, + { + "epoch": 0.5612169678557867, + "grad_norm": 0.49485981464385986, + "learning_rate": 4.253936948573176e-05, + "loss": 0.9723, + "step": 6281 + }, + { + "epoch": 0.5613063193870484, + "grad_norm": 0.41250553727149963, + "learning_rate": 4.2525061079126705e-05, + "loss": 0.9542, + "step": 6282 + }, + { + "epoch": 0.5613956709183103, + "grad_norm": 0.42612266540527344, + "learning_rate": 4.251075329862747e-05, + "loss": 1.0109, + "step": 6283 + }, + { + "epoch": 0.5614850224495722, + "grad_norm": 0.41712063550949097, + "learning_rate": 4.249644614543247e-05, + "loss": 0.9561, + "step": 6284 + }, + { + "epoch": 0.5615743739808341, + "grad_norm": 0.42624518275260925, + "learning_rate": 4.2482139620740084e-05, + "loss": 0.9082, + "step": 6285 + }, + { + "epoch": 0.561663725512096, + "grad_norm": 0.4930400848388672, + "learning_rate": 4.246783372574864e-05, + "loss": 0.9108, + "step": 6286 + }, + { + "epoch": 0.5617530770433579, + "grad_norm": 0.46004951000213623, + "learning_rate": 4.245352846165641e-05, + "loss": 1.0204, + "step": 6287 + }, + { + "epoch": 0.5618424285746197, + "grad_norm": 0.46289774775505066, + "learning_rate": 4.243922382966162e-05, + "loss": 0.9439, + "step": 6288 + }, + { + "epoch": 0.5619317801058815, + "grad_norm": 0.39655157923698425, + "learning_rate": 4.2424919830962414e-05, + "loss": 0.9299, + "step": 6289 + }, + { + "epoch": 0.5620211316371434, + "grad_norm": 0.53416508436203, + "learning_rate": 4.241061646675695e-05, + "loss": 1.0126, + "step": 6290 + }, + { + "epoch": 0.5621104831684053, + "grad_norm": 0.5086248517036438, + "learning_rate": 4.239631373824322e-05, + "loss": 0.9606, + "step": 6291 + }, + { + "epoch": 0.5621998346996672, + "grad_norm": 0.4238837659358978, + "learning_rate": 4.2382011646619265e-05, + "loss": 1.0183, + "step": 6292 + }, + { + "epoch": 0.562289186230929, + "grad_norm": 0.4114820063114166, + "learning_rate": 4.236771019308304e-05, + "loss": 0.9271, + "step": 6293 + }, + { + "epoch": 0.5623785377621909, + "grad_norm": 0.5694311261177063, + "learning_rate": 4.235340937883245e-05, + "loss": 0.9144, + "step": 6294 + }, + { + "epoch": 0.5624678892934528, + "grad_norm": 0.41458985209465027, + "learning_rate": 4.233910920506533e-05, + "loss": 0.9604, + "step": 6295 + }, + { + "epoch": 0.5625572408247146, + "grad_norm": 0.5260997414588928, + "learning_rate": 4.232480967297947e-05, + "loss": 0.9441, + "step": 6296 + }, + { + "epoch": 0.5626465923559765, + "grad_norm": 0.4400598406791687, + "learning_rate": 4.2310510783772605e-05, + "loss": 0.9635, + "step": 6297 + }, + { + "epoch": 0.5627359438872384, + "grad_norm": 0.6165459156036377, + "learning_rate": 4.229621253864243e-05, + "loss": 0.8765, + "step": 6298 + }, + { + "epoch": 0.5628252954185002, + "grad_norm": 0.535810649394989, + "learning_rate": 4.228191493878657e-05, + "loss": 0.8623, + "step": 6299 + }, + { + "epoch": 0.5629146469497621, + "grad_norm": 0.556769073009491, + "learning_rate": 4.2267617985402625e-05, + "loss": 0.9318, + "step": 6300 + }, + { + "epoch": 0.563003998481024, + "grad_norm": 0.44776806235313416, + "learning_rate": 4.225332167968807e-05, + "loss": 0.9505, + "step": 6301 + }, + { + "epoch": 0.5630933500122859, + "grad_norm": 0.5459929704666138, + "learning_rate": 4.2239026022840404e-05, + "loss": 0.9728, + "step": 6302 + }, + { + "epoch": 0.5631827015435477, + "grad_norm": 0.4663999378681183, + "learning_rate": 4.222473101605703e-05, + "loss": 0.9274, + "step": 6303 + }, + { + "epoch": 0.5632720530748095, + "grad_norm": 0.4985668957233429, + "learning_rate": 4.221043666053531e-05, + "loss": 0.9777, + "step": 6304 + }, + { + "epoch": 0.5633614046060714, + "grad_norm": 0.4315873086452484, + "learning_rate": 4.2196142957472554e-05, + "loss": 0.9759, + "step": 6305 + }, + { + "epoch": 0.5634507561373333, + "grad_norm": 0.45682254433631897, + "learning_rate": 4.218184990806601e-05, + "loss": 0.9806, + "step": 6306 + }, + { + "epoch": 0.5635401076685952, + "grad_norm": 0.5467448234558105, + "learning_rate": 4.216755751351287e-05, + "loss": 0.8838, + "step": 6307 + }, + { + "epoch": 0.5636294591998571, + "grad_norm": 0.4684566557407379, + "learning_rate": 4.215326577501028e-05, + "loss": 0.8951, + "step": 6308 + }, + { + "epoch": 0.563718810731119, + "grad_norm": 0.4865800142288208, + "learning_rate": 4.213897469375533e-05, + "loss": 0.9647, + "step": 6309 + }, + { + "epoch": 0.5638081622623807, + "grad_norm": 0.5342085361480713, + "learning_rate": 4.212468427094503e-05, + "loss": 0.9468, + "step": 6310 + }, + { + "epoch": 0.5638975137936426, + "grad_norm": 0.38983017206192017, + "learning_rate": 4.2110394507776377e-05, + "loss": 1.0299, + "step": 6311 + }, + { + "epoch": 0.5639868653249045, + "grad_norm": 0.42452338337898254, + "learning_rate": 4.2096105405446264e-05, + "loss": 0.9127, + "step": 6312 + }, + { + "epoch": 0.5640762168561664, + "grad_norm": 0.5090858936309814, + "learning_rate": 4.2081816965151595e-05, + "loss": 0.858, + "step": 6313 + }, + { + "epoch": 0.5641655683874283, + "grad_norm": 0.4234965443611145, + "learning_rate": 4.206752918808914e-05, + "loss": 0.9866, + "step": 6314 + }, + { + "epoch": 0.5642549199186901, + "grad_norm": 0.5019885897636414, + "learning_rate": 4.205324207545567e-05, + "loss": 0.9501, + "step": 6315 + }, + { + "epoch": 0.564344271449952, + "grad_norm": 0.5135182738304138, + "learning_rate": 4.203895562844789e-05, + "loss": 1.013, + "step": 6316 + }, + { + "epoch": 0.5644336229812138, + "grad_norm": 0.42812684178352356, + "learning_rate": 4.202466984826242e-05, + "loss": 1.0301, + "step": 6317 + }, + { + "epoch": 0.5645229745124757, + "grad_norm": 0.5810215473175049, + "learning_rate": 4.201038473609587e-05, + "loss": 0.938, + "step": 6318 + }, + { + "epoch": 0.5646123260437376, + "grad_norm": 0.4419316351413727, + "learning_rate": 4.199610029314476e-05, + "loss": 0.9318, + "step": 6319 + }, + { + "epoch": 0.5647016775749994, + "grad_norm": 0.48427048325538635, + "learning_rate": 4.198181652060559e-05, + "loss": 0.9813, + "step": 6320 + }, + { + "epoch": 0.5647910291062613, + "grad_norm": 0.4527822732925415, + "learning_rate": 4.196753341967473e-05, + "loss": 0.9851, + "step": 6321 + }, + { + "epoch": 0.5648803806375232, + "grad_norm": 0.45797282457351685, + "learning_rate": 4.195325099154857e-05, + "loss": 0.9875, + "step": 6322 + }, + { + "epoch": 0.564969732168785, + "grad_norm": 0.5200628042221069, + "learning_rate": 4.193896923742341e-05, + "loss": 0.9328, + "step": 6323 + }, + { + "epoch": 0.5650590837000469, + "grad_norm": 0.4386134147644043, + "learning_rate": 4.192468815849549e-05, + "loss": 0.9706, + "step": 6324 + }, + { + "epoch": 0.5651484352313088, + "grad_norm": 0.4257854223251343, + "learning_rate": 4.1910407755961025e-05, + "loss": 0.9642, + "step": 6325 + }, + { + "epoch": 0.5652377867625706, + "grad_norm": 0.42326584458351135, + "learning_rate": 4.189612803101614e-05, + "loss": 0.9783, + "step": 6326 + }, + { + "epoch": 0.5653271382938325, + "grad_norm": 0.38882023096084595, + "learning_rate": 4.188184898485691e-05, + "loss": 0.9474, + "step": 6327 + }, + { + "epoch": 0.5654164898250944, + "grad_norm": 0.4262528717517853, + "learning_rate": 4.186757061867937e-05, + "loss": 1.0289, + "step": 6328 + }, + { + "epoch": 0.5655058413563563, + "grad_norm": 0.5065363049507141, + "learning_rate": 4.185329293367947e-05, + "loss": 0.8959, + "step": 6329 + }, + { + "epoch": 0.5655951928876181, + "grad_norm": 0.4666295051574707, + "learning_rate": 4.1839015931053125e-05, + "loss": 0.9883, + "step": 6330 + }, + { + "epoch": 0.5656845444188799, + "grad_norm": 0.4304862916469574, + "learning_rate": 4.182473961199619e-05, + "loss": 0.9752, + "step": 6331 + }, + { + "epoch": 0.5657738959501418, + "grad_norm": 0.4546220600605011, + "learning_rate": 4.1810463977704464e-05, + "loss": 0.9584, + "step": 6332 + }, + { + "epoch": 0.5658632474814037, + "grad_norm": 0.4094926714897156, + "learning_rate": 4.179618902937365e-05, + "loss": 0.957, + "step": 6333 + }, + { + "epoch": 0.5659525990126656, + "grad_norm": 0.41204431653022766, + "learning_rate": 4.178191476819946e-05, + "loss": 1.0065, + "step": 6334 + }, + { + "epoch": 0.5660419505439275, + "grad_norm": 0.5892501473426819, + "learning_rate": 4.17676411953775e-05, + "loss": 1.0111, + "step": 6335 + }, + { + "epoch": 0.5661313020751894, + "grad_norm": 0.5121837854385376, + "learning_rate": 4.175336831210335e-05, + "loss": 0.8916, + "step": 6336 + }, + { + "epoch": 0.5662206536064511, + "grad_norm": 0.5115182995796204, + "learning_rate": 4.17390961195725e-05, + "loss": 0.9181, + "step": 6337 + }, + { + "epoch": 0.566310005137713, + "grad_norm": 0.4587537348270416, + "learning_rate": 4.172482461898041e-05, + "loss": 0.9743, + "step": 6338 + }, + { + "epoch": 0.5663993566689749, + "grad_norm": 0.533208966255188, + "learning_rate": 4.171055381152246e-05, + "loss": 0.9295, + "step": 6339 + }, + { + "epoch": 0.5664887082002368, + "grad_norm": 0.4383775591850281, + "learning_rate": 4.169628369839399e-05, + "loss": 0.9257, + "step": 6340 + }, + { + "epoch": 0.5665780597314987, + "grad_norm": 0.572481632232666, + "learning_rate": 4.1682014280790294e-05, + "loss": 0.9358, + "step": 6341 + }, + { + "epoch": 0.5666674112627605, + "grad_norm": 0.4642484486103058, + "learning_rate": 4.166774555990654e-05, + "loss": 0.9556, + "step": 6342 + }, + { + "epoch": 0.5667567627940224, + "grad_norm": 0.4034402668476105, + "learning_rate": 4.165347753693791e-05, + "loss": 1.0468, + "step": 6343 + }, + { + "epoch": 0.5668461143252842, + "grad_norm": 0.5274239778518677, + "learning_rate": 4.1639210213079513e-05, + "loss": 0.9687, + "step": 6344 + }, + { + "epoch": 0.5669354658565461, + "grad_norm": 0.4413708746433258, + "learning_rate": 4.162494358952637e-05, + "loss": 0.9749, + "step": 6345 + }, + { + "epoch": 0.567024817387808, + "grad_norm": 0.4011813700199127, + "learning_rate": 4.161067766747349e-05, + "loss": 1.012, + "step": 6346 + }, + { + "epoch": 0.5671141689190698, + "grad_norm": 0.4661290943622589, + "learning_rate": 4.159641244811577e-05, + "loss": 0.9554, + "step": 6347 + }, + { + "epoch": 0.5672035204503317, + "grad_norm": 0.40306004881858826, + "learning_rate": 4.1582147932648074e-05, + "loss": 0.9564, + "step": 6348 + }, + { + "epoch": 0.5672928719815936, + "grad_norm": 0.4345554709434509, + "learning_rate": 4.156788412226522e-05, + "loss": 0.9711, + "step": 6349 + }, + { + "epoch": 0.5673822235128555, + "grad_norm": 0.4540269076824188, + "learning_rate": 4.155362101816196e-05, + "loss": 0.91, + "step": 6350 + }, + { + "epoch": 0.5674715750441173, + "grad_norm": 0.44446346163749695, + "learning_rate": 4.153935862153298e-05, + "loss": 0.9575, + "step": 6351 + }, + { + "epoch": 0.5675609265753792, + "grad_norm": 0.5166485905647278, + "learning_rate": 4.152509693357289e-05, + "loss": 0.9601, + "step": 6352 + }, + { + "epoch": 0.567650278106641, + "grad_norm": 0.39662766456604004, + "learning_rate": 4.1510835955476256e-05, + "loss": 1.0281, + "step": 6353 + }, + { + "epoch": 0.5677396296379029, + "grad_norm": 0.4909988045692444, + "learning_rate": 4.1496575688437605e-05, + "loss": 0.9623, + "step": 6354 + }, + { + "epoch": 0.5678289811691648, + "grad_norm": 0.5233615040779114, + "learning_rate": 4.1482316133651375e-05, + "loss": 0.9088, + "step": 6355 + }, + { + "epoch": 0.5679183327004267, + "grad_norm": 0.48598557710647583, + "learning_rate": 4.146805729231197e-05, + "loss": 0.9457, + "step": 6356 + }, + { + "epoch": 0.5680076842316886, + "grad_norm": 0.4184345304965973, + "learning_rate": 4.145379916561371e-05, + "loss": 1.0264, + "step": 6357 + }, + { + "epoch": 0.5680970357629503, + "grad_norm": 0.48690488934516907, + "learning_rate": 4.143954175475086e-05, + "loss": 0.9859, + "step": 6358 + }, + { + "epoch": 0.5681863872942122, + "grad_norm": 0.5679425001144409, + "learning_rate": 4.142528506091764e-05, + "loss": 1.0039, + "step": 6359 + }, + { + "epoch": 0.5682757388254741, + "grad_norm": 0.44501417875289917, + "learning_rate": 4.141102908530819e-05, + "loss": 0.974, + "step": 6360 + }, + { + "epoch": 0.568365090356736, + "grad_norm": 0.4163120687007904, + "learning_rate": 4.139677382911663e-05, + "loss": 0.8965, + "step": 6361 + }, + { + "epoch": 0.5684544418879979, + "grad_norm": 0.46128639578819275, + "learning_rate": 4.138251929353695e-05, + "loss": 0.9975, + "step": 6362 + }, + { + "epoch": 0.5685437934192598, + "grad_norm": 0.42836207151412964, + "learning_rate": 4.1368265479763127e-05, + "loss": 0.9683, + "step": 6363 + }, + { + "epoch": 0.5686331449505216, + "grad_norm": 0.4289848208427429, + "learning_rate": 4.135401238898908e-05, + "loss": 0.9996, + "step": 6364 + }, + { + "epoch": 0.5687224964817834, + "grad_norm": 0.5396338105201721, + "learning_rate": 4.133976002240867e-05, + "loss": 0.929, + "step": 6365 + }, + { + "epoch": 0.5688118480130453, + "grad_norm": 0.5357837080955505, + "learning_rate": 4.132550838121565e-05, + "loss": 0.9172, + "step": 6366 + }, + { + "epoch": 0.5689011995443072, + "grad_norm": 0.48109525442123413, + "learning_rate": 4.1311257466603774e-05, + "loss": 0.9564, + "step": 6367 + }, + { + "epoch": 0.5689905510755691, + "grad_norm": 0.5908039212226868, + "learning_rate": 4.12970072797667e-05, + "loss": 0.9478, + "step": 6368 + }, + { + "epoch": 0.569079902606831, + "grad_norm": 0.4044991731643677, + "learning_rate": 4.128275782189803e-05, + "loss": 0.9789, + "step": 6369 + }, + { + "epoch": 0.5691692541380928, + "grad_norm": 0.44325581192970276, + "learning_rate": 4.1268509094191315e-05, + "loss": 0.9043, + "step": 6370 + }, + { + "epoch": 0.5692586056693547, + "grad_norm": 0.37263184785842896, + "learning_rate": 4.125426109784006e-05, + "loss": 1.0072, + "step": 6371 + }, + { + "epoch": 0.5693479572006165, + "grad_norm": 0.4318254888057709, + "learning_rate": 4.1240013834037626e-05, + "loss": 0.8824, + "step": 6372 + }, + { + "epoch": 0.5694373087318784, + "grad_norm": 0.49909618496894836, + "learning_rate": 4.122576730397742e-05, + "loss": 0.9341, + "step": 6373 + }, + { + "epoch": 0.5695266602631402, + "grad_norm": 0.6334441304206848, + "learning_rate": 4.1211521508852726e-05, + "loss": 0.8383, + "step": 6374 + }, + { + "epoch": 0.5696160117944021, + "grad_norm": 0.5850768685340881, + "learning_rate": 4.119727644985678e-05, + "loss": 0.88, + "step": 6375 + }, + { + "epoch": 0.569705363325664, + "grad_norm": 0.48001495003700256, + "learning_rate": 4.1183032128182766e-05, + "loss": 0.9366, + "step": 6376 + }, + { + "epoch": 0.5697947148569259, + "grad_norm": 0.504117488861084, + "learning_rate": 4.1168788545023796e-05, + "loss": 0.9801, + "step": 6377 + }, + { + "epoch": 0.5698840663881878, + "grad_norm": 0.4841177761554718, + "learning_rate": 4.115454570157291e-05, + "loss": 0.9103, + "step": 6378 + }, + { + "epoch": 0.5699734179194496, + "grad_norm": 0.475758820772171, + "learning_rate": 4.114030359902313e-05, + "loss": 0.9221, + "step": 6379 + }, + { + "epoch": 0.5700627694507114, + "grad_norm": 0.4464789927005768, + "learning_rate": 4.112606223856734e-05, + "loss": 1.0044, + "step": 6380 + }, + { + "epoch": 0.5701521209819733, + "grad_norm": 0.4405708312988281, + "learning_rate": 4.1111821621398446e-05, + "loss": 0.9311, + "step": 6381 + }, + { + "epoch": 0.5702414725132352, + "grad_norm": 0.49968060851097107, + "learning_rate": 4.109758174870921e-05, + "loss": 0.898, + "step": 6382 + }, + { + "epoch": 0.5703308240444971, + "grad_norm": 0.4581187963485718, + "learning_rate": 4.10833426216924e-05, + "loss": 0.95, + "step": 6383 + }, + { + "epoch": 0.570420175575759, + "grad_norm": 0.46304047107696533, + "learning_rate": 4.1069104241540715e-05, + "loss": 0.9737, + "step": 6384 + }, + { + "epoch": 0.5705095271070207, + "grad_norm": 0.5021114349365234, + "learning_rate": 4.105486660944672e-05, + "loss": 0.9034, + "step": 6385 + }, + { + "epoch": 0.5705988786382826, + "grad_norm": 0.5141803622245789, + "learning_rate": 4.1040629726602996e-05, + "loss": 0.9224, + "step": 6386 + }, + { + "epoch": 0.5706882301695445, + "grad_norm": 0.4306526482105255, + "learning_rate": 4.102639359420204e-05, + "loss": 0.9798, + "step": 6387 + }, + { + "epoch": 0.5707775817008064, + "grad_norm": 0.5190479755401611, + "learning_rate": 4.101215821343626e-05, + "loss": 0.9899, + "step": 6388 + }, + { + "epoch": 0.5708669332320683, + "grad_norm": 0.5050314664840698, + "learning_rate": 4.0997923585498046e-05, + "loss": 0.935, + "step": 6389 + }, + { + "epoch": 0.5709562847633302, + "grad_norm": 0.47217750549316406, + "learning_rate": 4.098368971157968e-05, + "loss": 0.9683, + "step": 6390 + }, + { + "epoch": 0.571045636294592, + "grad_norm": 0.43386927247047424, + "learning_rate": 4.0969456592873435e-05, + "loss": 0.925, + "step": 6391 + }, + { + "epoch": 0.5711349878258538, + "grad_norm": 0.577601969242096, + "learning_rate": 4.095522423057143e-05, + "loss": 0.9045, + "step": 6392 + }, + { + "epoch": 0.5712243393571157, + "grad_norm": 0.38935279846191406, + "learning_rate": 4.094099262586581e-05, + "loss": 0.9945, + "step": 6393 + }, + { + "epoch": 0.5713136908883776, + "grad_norm": 0.4525635838508606, + "learning_rate": 4.092676177994862e-05, + "loss": 1.0153, + "step": 6394 + }, + { + "epoch": 0.5714030424196395, + "grad_norm": 0.4267630875110626, + "learning_rate": 4.091253169401184e-05, + "loss": 1.0024, + "step": 6395 + }, + { + "epoch": 0.5714923939509013, + "grad_norm": 0.47713613510131836, + "learning_rate": 4.0898302369247405e-05, + "loss": 0.9021, + "step": 6396 + }, + { + "epoch": 0.5715817454821632, + "grad_norm": 0.4627811908721924, + "learning_rate": 4.088407380684715e-05, + "loss": 0.9642, + "step": 6397 + }, + { + "epoch": 0.5716710970134251, + "grad_norm": 0.5233446955680847, + "learning_rate": 4.086984600800291e-05, + "loss": 0.9229, + "step": 6398 + }, + { + "epoch": 0.5717604485446869, + "grad_norm": 0.4403021037578583, + "learning_rate": 4.0855618973906365e-05, + "loss": 0.9546, + "step": 6399 + }, + { + "epoch": 0.5718498000759488, + "grad_norm": 0.42531147599220276, + "learning_rate": 4.08413927057492e-05, + "loss": 1.0129, + "step": 6400 + }, + { + "epoch": 0.5719391516072106, + "grad_norm": 0.46862322092056274, + "learning_rate": 4.082716720472304e-05, + "loss": 0.9922, + "step": 6401 + }, + { + "epoch": 0.5720285031384725, + "grad_norm": 0.43281012773513794, + "learning_rate": 4.08129424720194e-05, + "loss": 0.9253, + "step": 6402 + }, + { + "epoch": 0.5721178546697344, + "grad_norm": 0.44874048233032227, + "learning_rate": 4.079871850882975e-05, + "loss": 0.9576, + "step": 6403 + }, + { + "epoch": 0.5722072062009963, + "grad_norm": 0.3677184283733368, + "learning_rate": 4.0784495316345496e-05, + "loss": 1.0069, + "step": 6404 + }, + { + "epoch": 0.5722965577322582, + "grad_norm": 0.5108903050422668, + "learning_rate": 4.077027289575799e-05, + "loss": 0.9036, + "step": 6405 + }, + { + "epoch": 0.57238590926352, + "grad_norm": 0.5099184513092041, + "learning_rate": 4.0756051248258506e-05, + "loss": 0.9907, + "step": 6406 + }, + { + "epoch": 0.5724752607947818, + "grad_norm": 0.506807804107666, + "learning_rate": 4.074183037503827e-05, + "loss": 0.8992, + "step": 6407 + }, + { + "epoch": 0.5725646123260437, + "grad_norm": 0.4727827310562134, + "learning_rate": 4.072761027728842e-05, + "loss": 0.9714, + "step": 6408 + }, + { + "epoch": 0.5726539638573056, + "grad_norm": 0.41330844163894653, + "learning_rate": 4.0713390956200046e-05, + "loss": 0.9732, + "step": 6409 + }, + { + "epoch": 0.5727433153885675, + "grad_norm": 0.5296694040298462, + "learning_rate": 4.069917241296417e-05, + "loss": 1.0646, + "step": 6410 + }, + { + "epoch": 0.5728326669198294, + "grad_norm": 0.5255220532417297, + "learning_rate": 4.068495464877177e-05, + "loss": 0.967, + "step": 6411 + }, + { + "epoch": 0.5729220184510913, + "grad_norm": 0.5186451077461243, + "learning_rate": 4.067073766481369e-05, + "loss": 0.9345, + "step": 6412 + }, + { + "epoch": 0.573011369982353, + "grad_norm": 0.5088133215904236, + "learning_rate": 4.0656521462280764e-05, + "loss": 0.8783, + "step": 6413 + }, + { + "epoch": 0.5731007215136149, + "grad_norm": 0.4002193510532379, + "learning_rate": 4.064230604236376e-05, + "loss": 0.975, + "step": 6414 + }, + { + "epoch": 0.5731900730448768, + "grad_norm": 0.4866524040699005, + "learning_rate": 4.062809140625338e-05, + "loss": 0.8875, + "step": 6415 + }, + { + "epoch": 0.5732794245761387, + "grad_norm": 0.45396822690963745, + "learning_rate": 4.061387755514024e-05, + "loss": 0.9482, + "step": 6416 + }, + { + "epoch": 0.5733687761074006, + "grad_norm": 0.5568546056747437, + "learning_rate": 4.05996644902149e-05, + "loss": 0.9384, + "step": 6417 + }, + { + "epoch": 0.5734581276386624, + "grad_norm": 0.4871857464313507, + "learning_rate": 4.0585452212667864e-05, + "loss": 0.9945, + "step": 6418 + }, + { + "epoch": 0.5735474791699243, + "grad_norm": 0.4038003385066986, + "learning_rate": 4.0571240723689546e-05, + "loss": 0.9829, + "step": 6419 + }, + { + "epoch": 0.5736368307011861, + "grad_norm": 0.39206719398498535, + "learning_rate": 4.055703002447033e-05, + "loss": 1.0198, + "step": 6420 + }, + { + "epoch": 0.573726182232448, + "grad_norm": 0.37577176094055176, + "learning_rate": 4.0542820116200495e-05, + "loss": 0.9572, + "step": 6421 + }, + { + "epoch": 0.5738155337637099, + "grad_norm": 0.3899545669555664, + "learning_rate": 4.052861100007032e-05, + "loss": 1.035, + "step": 6422 + }, + { + "epoch": 0.5739048852949717, + "grad_norm": 0.6146190762519836, + "learning_rate": 4.051440267726989e-05, + "loss": 0.91, + "step": 6423 + }, + { + "epoch": 0.5739942368262336, + "grad_norm": 0.4568648338317871, + "learning_rate": 4.050019514898936e-05, + "loss": 0.9665, + "step": 6424 + }, + { + "epoch": 0.5740835883574955, + "grad_norm": 0.4437832534313202, + "learning_rate": 4.048598841641874e-05, + "loss": 0.9733, + "step": 6425 + }, + { + "epoch": 0.5741729398887574, + "grad_norm": 0.4725315272808075, + "learning_rate": 4.0471782480748e-05, + "loss": 0.947, + "step": 6426 + }, + { + "epoch": 0.5742622914200192, + "grad_norm": 0.5983218550682068, + "learning_rate": 4.0457577343167044e-05, + "loss": 0.9638, + "step": 6427 + }, + { + "epoch": 0.574351642951281, + "grad_norm": 0.42528533935546875, + "learning_rate": 4.04433730048657e-05, + "loss": 0.9599, + "step": 6428 + }, + { + "epoch": 0.5744409944825429, + "grad_norm": 0.4161036014556885, + "learning_rate": 4.042916946703373e-05, + "loss": 0.9122, + "step": 6429 + }, + { + "epoch": 0.5745303460138048, + "grad_norm": 0.418058305978775, + "learning_rate": 4.0414966730860846e-05, + "loss": 0.9734, + "step": 6430 + }, + { + "epoch": 0.5746196975450667, + "grad_norm": 0.5074496865272522, + "learning_rate": 4.0400764797536675e-05, + "loss": 0.8858, + "step": 6431 + }, + { + "epoch": 0.5747090490763286, + "grad_norm": 0.5379347801208496, + "learning_rate": 4.038656366825076e-05, + "loss": 0.9721, + "step": 6432 + }, + { + "epoch": 0.5747984006075905, + "grad_norm": 0.5281144380569458, + "learning_rate": 4.037236334419261e-05, + "loss": 1.0213, + "step": 6433 + }, + { + "epoch": 0.5748877521388522, + "grad_norm": 0.4760182201862335, + "learning_rate": 4.035816382655165e-05, + "loss": 0.8549, + "step": 6434 + }, + { + "epoch": 0.5749771036701141, + "grad_norm": 0.4302056133747101, + "learning_rate": 4.034396511651726e-05, + "loss": 0.9301, + "step": 6435 + }, + { + "epoch": 0.575066455201376, + "grad_norm": 0.4959314167499542, + "learning_rate": 4.032976721527869e-05, + "loss": 0.889, + "step": 6436 + }, + { + "epoch": 0.5751558067326379, + "grad_norm": 0.5147891044616699, + "learning_rate": 4.0315570124025216e-05, + "loss": 0.9727, + "step": 6437 + }, + { + "epoch": 0.5752451582638998, + "grad_norm": 0.42699047923088074, + "learning_rate": 4.030137384394595e-05, + "loss": 1.0242, + "step": 6438 + }, + { + "epoch": 0.5753345097951617, + "grad_norm": 0.5570961833000183, + "learning_rate": 4.028717837623002e-05, + "loss": 0.8377, + "step": 6439 + }, + { + "epoch": 0.5754238613264235, + "grad_norm": 0.5209365487098694, + "learning_rate": 4.0272983722066435e-05, + "loss": 0.9127, + "step": 6440 + }, + { + "epoch": 0.5755132128576853, + "grad_norm": 0.4663841128349304, + "learning_rate": 4.0258789882644135e-05, + "loss": 0.9139, + "step": 6441 + }, + { + "epoch": 0.5756025643889472, + "grad_norm": 0.5318526029586792, + "learning_rate": 4.024459685915204e-05, + "loss": 0.9117, + "step": 6442 + }, + { + "epoch": 0.5756919159202091, + "grad_norm": 0.455567866563797, + "learning_rate": 4.023040465277892e-05, + "loss": 0.8577, + "step": 6443 + }, + { + "epoch": 0.575781267451471, + "grad_norm": 0.4229848086833954, + "learning_rate": 4.0216213264713556e-05, + "loss": 0.9288, + "step": 6444 + }, + { + "epoch": 0.5758706189827328, + "grad_norm": 0.38377058506011963, + "learning_rate": 4.020202269614461e-05, + "loss": 0.9909, + "step": 6445 + }, + { + "epoch": 0.5759599705139947, + "grad_norm": 0.48710566759109497, + "learning_rate": 4.0187832948260705e-05, + "loss": 0.9682, + "step": 6446 + }, + { + "epoch": 0.5760493220452565, + "grad_norm": 0.3837445378303528, + "learning_rate": 4.017364402225038e-05, + "loss": 0.9965, + "step": 6447 + }, + { + "epoch": 0.5761386735765184, + "grad_norm": 0.4128133952617645, + "learning_rate": 4.0159455919302114e-05, + "loss": 0.9991, + "step": 6448 + }, + { + "epoch": 0.5762280251077803, + "grad_norm": 0.44364961981773376, + "learning_rate": 4.014526864060432e-05, + "loss": 1.0147, + "step": 6449 + }, + { + "epoch": 0.5763173766390421, + "grad_norm": 0.47115516662597656, + "learning_rate": 4.01310821873453e-05, + "loss": 0.9597, + "step": 6450 + }, + { + "epoch": 0.576406728170304, + "grad_norm": 0.4687146842479706, + "learning_rate": 4.0116896560713346e-05, + "loss": 0.9166, + "step": 6451 + }, + { + "epoch": 0.5764960797015659, + "grad_norm": 0.46051499247550964, + "learning_rate": 4.010271176189666e-05, + "loss": 0.9735, + "step": 6452 + }, + { + "epoch": 0.5765854312328278, + "grad_norm": 0.45504701137542725, + "learning_rate": 4.008852779208336e-05, + "loss": 0.9643, + "step": 6453 + }, + { + "epoch": 0.5766747827640896, + "grad_norm": 0.5997002124786377, + "learning_rate": 4.007434465246151e-05, + "loss": 0.9214, + "step": 6454 + }, + { + "epoch": 0.5767641342953514, + "grad_norm": 0.5700611472129822, + "learning_rate": 4.006016234421908e-05, + "loss": 1.0074, + "step": 6455 + }, + { + "epoch": 0.5768534858266133, + "grad_norm": 0.5041797757148743, + "learning_rate": 4.0045980868544014e-05, + "loss": 0.9945, + "step": 6456 + }, + { + "epoch": 0.5769428373578752, + "grad_norm": 0.39958614110946655, + "learning_rate": 4.003180022662415e-05, + "loss": 1.0317, + "step": 6457 + }, + { + "epoch": 0.5770321888891371, + "grad_norm": 0.5589715242385864, + "learning_rate": 4.001762041964727e-05, + "loss": 0.8559, + "step": 6458 + }, + { + "epoch": 0.577121540420399, + "grad_norm": 0.4533728361129761, + "learning_rate": 4.000344144880108e-05, + "loss": 0.9256, + "step": 6459 + }, + { + "epoch": 0.5772108919516609, + "grad_norm": 0.5269187688827515, + "learning_rate": 3.998926331527323e-05, + "loss": 0.9339, + "step": 6460 + }, + { + "epoch": 0.5773002434829226, + "grad_norm": 0.4693765938282013, + "learning_rate": 3.997508602025128e-05, + "loss": 0.9535, + "step": 6461 + }, + { + "epoch": 0.5773895950141845, + "grad_norm": 0.42026007175445557, + "learning_rate": 3.996090956492275e-05, + "loss": 0.9109, + "step": 6462 + }, + { + "epoch": 0.5774789465454464, + "grad_norm": 0.41388946771621704, + "learning_rate": 3.994673395047505e-05, + "loss": 0.9696, + "step": 6463 + }, + { + "epoch": 0.5775682980767083, + "grad_norm": 0.3982580304145813, + "learning_rate": 3.993255917809553e-05, + "loss": 1.0228, + "step": 6464 + }, + { + "epoch": 0.5776576496079702, + "grad_norm": 0.4337417483329773, + "learning_rate": 3.9918385248971484e-05, + "loss": 0.996, + "step": 6465 + }, + { + "epoch": 0.577747001139232, + "grad_norm": 0.5209522843360901, + "learning_rate": 3.990421216429014e-05, + "loss": 0.8832, + "step": 6466 + }, + { + "epoch": 0.5778363526704939, + "grad_norm": 0.4163426160812378, + "learning_rate": 3.9890039925238645e-05, + "loss": 0.9445, + "step": 6467 + }, + { + "epoch": 0.5779257042017557, + "grad_norm": 0.4440252482891083, + "learning_rate": 3.987586853300408e-05, + "loss": 0.9476, + "step": 6468 + }, + { + "epoch": 0.5780150557330176, + "grad_norm": 0.41356053948402405, + "learning_rate": 3.9861697988773425e-05, + "loss": 0.9586, + "step": 6469 + }, + { + "epoch": 0.5781044072642795, + "grad_norm": 0.46509677171707153, + "learning_rate": 3.9847528293733636e-05, + "loss": 0.956, + "step": 6470 + }, + { + "epoch": 0.5781937587955414, + "grad_norm": 0.4374508261680603, + "learning_rate": 3.9833359449071564e-05, + "loss": 0.9895, + "step": 6471 + }, + { + "epoch": 0.5782831103268032, + "grad_norm": 0.4301753044128418, + "learning_rate": 3.981919145597404e-05, + "loss": 1.0006, + "step": 6472 + }, + { + "epoch": 0.5783724618580651, + "grad_norm": 0.4043073356151581, + "learning_rate": 3.9805024315627714e-05, + "loss": 0.9939, + "step": 6473 + }, + { + "epoch": 0.578461813389327, + "grad_norm": 0.4929652512073517, + "learning_rate": 3.979085802921928e-05, + "loss": 0.808, + "step": 6474 + }, + { + "epoch": 0.5785511649205888, + "grad_norm": 0.3793524205684662, + "learning_rate": 3.977669259793531e-05, + "loss": 0.9753, + "step": 6475 + }, + { + "epoch": 0.5786405164518507, + "grad_norm": 0.4306744635105133, + "learning_rate": 3.9762528022962305e-05, + "loss": 0.9424, + "step": 6476 + }, + { + "epoch": 0.5787298679831125, + "grad_norm": 0.5743783712387085, + "learning_rate": 3.9748364305486703e-05, + "loss": 0.8385, + "step": 6477 + }, + { + "epoch": 0.5788192195143744, + "grad_norm": 0.46976256370544434, + "learning_rate": 3.9734201446694865e-05, + "loss": 0.9779, + "step": 6478 + }, + { + "epoch": 0.5789085710456363, + "grad_norm": 0.3878982365131378, + "learning_rate": 3.972003944777308e-05, + "loss": 1.0064, + "step": 6479 + }, + { + "epoch": 0.5789979225768982, + "grad_norm": 0.3860369324684143, + "learning_rate": 3.9705878309907565e-05, + "loss": 0.9581, + "step": 6480 + }, + { + "epoch": 0.5790872741081601, + "grad_norm": 0.582404375076294, + "learning_rate": 3.969171803428447e-05, + "loss": 0.8623, + "step": 6481 + }, + { + "epoch": 0.5791766256394218, + "grad_norm": 0.5933336019515991, + "learning_rate": 3.96775586220899e-05, + "loss": 0.8301, + "step": 6482 + }, + { + "epoch": 0.5792659771706837, + "grad_norm": 0.4246273934841156, + "learning_rate": 3.9663400074509786e-05, + "loss": 0.9505, + "step": 6483 + }, + { + "epoch": 0.5793553287019456, + "grad_norm": 0.4642765522003174, + "learning_rate": 3.964924239273011e-05, + "loss": 0.9685, + "step": 6484 + }, + { + "epoch": 0.5794446802332075, + "grad_norm": 0.4278883635997772, + "learning_rate": 3.96350855779367e-05, + "loss": 1.0149, + "step": 6485 + }, + { + "epoch": 0.5795340317644694, + "grad_norm": 0.427097350358963, + "learning_rate": 3.962092963131537e-05, + "loss": 0.9337, + "step": 6486 + }, + { + "epoch": 0.5796233832957313, + "grad_norm": 0.5278881788253784, + "learning_rate": 3.9606774554051824e-05, + "loss": 0.9693, + "step": 6487 + }, + { + "epoch": 0.5797127348269931, + "grad_norm": 0.42778128385543823, + "learning_rate": 3.959262034733168e-05, + "loss": 0.984, + "step": 6488 + }, + { + "epoch": 0.5798020863582549, + "grad_norm": 0.6493330597877502, + "learning_rate": 3.9578467012340515e-05, + "loss": 0.8847, + "step": 6489 + }, + { + "epoch": 0.5798914378895168, + "grad_norm": 0.4391600787639618, + "learning_rate": 3.956431455026382e-05, + "loss": 1.0071, + "step": 6490 + }, + { + "epoch": 0.5799807894207787, + "grad_norm": 0.4158203899860382, + "learning_rate": 3.955016296228702e-05, + "loss": 0.967, + "step": 6491 + }, + { + "epoch": 0.5800701409520406, + "grad_norm": 0.41016045212745667, + "learning_rate": 3.953601224959549e-05, + "loss": 0.9616, + "step": 6492 + }, + { + "epoch": 0.5801594924833025, + "grad_norm": 0.420639306306839, + "learning_rate": 3.952186241337444e-05, + "loss": 0.9555, + "step": 6493 + }, + { + "epoch": 0.5802488440145643, + "grad_norm": 0.4078323543071747, + "learning_rate": 3.9507713454809106e-05, + "loss": 0.9645, + "step": 6494 + }, + { + "epoch": 0.5803381955458262, + "grad_norm": 0.4655108153820038, + "learning_rate": 3.949356537508461e-05, + "loss": 0.8071, + "step": 6495 + }, + { + "epoch": 0.580427547077088, + "grad_norm": 0.4758930504322052, + "learning_rate": 3.947941817538601e-05, + "loss": 0.8597, + "step": 6496 + }, + { + "epoch": 0.5805168986083499, + "grad_norm": 0.41579219698905945, + "learning_rate": 3.946527185689827e-05, + "loss": 0.9475, + "step": 6497 + }, + { + "epoch": 0.5806062501396118, + "grad_norm": 0.4709773659706116, + "learning_rate": 3.9451126420806304e-05, + "loss": 0.9533, + "step": 6498 + }, + { + "epoch": 0.5806956016708736, + "grad_norm": 0.572155773639679, + "learning_rate": 3.943698186829495e-05, + "loss": 0.932, + "step": 6499 + }, + { + "epoch": 0.5807849532021355, + "grad_norm": 0.4728442132472992, + "learning_rate": 3.942283820054895e-05, + "loss": 1.0003, + "step": 6500 + }, + { + "epoch": 0.5808743047333974, + "grad_norm": 0.504787266254425, + "learning_rate": 3.940869541875301e-05, + "loss": 0.9066, + "step": 6501 + }, + { + "epoch": 0.5809636562646593, + "grad_norm": 0.4003005921840668, + "learning_rate": 3.939455352409172e-05, + "loss": 0.9711, + "step": 6502 + }, + { + "epoch": 0.5810530077959211, + "grad_norm": 0.4955715537071228, + "learning_rate": 3.9380412517749613e-05, + "loss": 0.9386, + "step": 6503 + }, + { + "epoch": 0.581142359327183, + "grad_norm": 0.4974922835826874, + "learning_rate": 3.9366272400911156e-05, + "loss": 0.9126, + "step": 6504 + }, + { + "epoch": 0.5812317108584448, + "grad_norm": 0.444976806640625, + "learning_rate": 3.935213317476074e-05, + "loss": 1.0139, + "step": 6505 + }, + { + "epoch": 0.5813210623897067, + "grad_norm": 0.47083789110183716, + "learning_rate": 3.9337994840482664e-05, + "loss": 0.9698, + "step": 6506 + }, + { + "epoch": 0.5814104139209686, + "grad_norm": 0.3796704411506653, + "learning_rate": 3.932385739926116e-05, + "loss": 0.9525, + "step": 6507 + }, + { + "epoch": 0.5814997654522305, + "grad_norm": 0.4334143400192261, + "learning_rate": 3.93097208522804e-05, + "loss": 0.9909, + "step": 6508 + }, + { + "epoch": 0.5815891169834922, + "grad_norm": 0.47649437189102173, + "learning_rate": 3.929558520072447e-05, + "loss": 0.9077, + "step": 6509 + }, + { + "epoch": 0.5816784685147541, + "grad_norm": 0.6175134181976318, + "learning_rate": 3.928145044577738e-05, + "loss": 0.9106, + "step": 6510 + }, + { + "epoch": 0.581767820046016, + "grad_norm": 0.4393305480480194, + "learning_rate": 3.926731658862307e-05, + "loss": 0.9112, + "step": 6511 + }, + { + "epoch": 0.5818571715772779, + "grad_norm": 0.46963876485824585, + "learning_rate": 3.9253183630445395e-05, + "loss": 0.9079, + "step": 6512 + }, + { + "epoch": 0.5819465231085398, + "grad_norm": 0.4847874343395233, + "learning_rate": 3.923905157242817e-05, + "loss": 0.9349, + "step": 6513 + }, + { + "epoch": 0.5820358746398017, + "grad_norm": 0.40541452169418335, + "learning_rate": 3.922492041575505e-05, + "loss": 0.9665, + "step": 6514 + }, + { + "epoch": 0.5821252261710635, + "grad_norm": 0.4827271103858948, + "learning_rate": 3.92107901616097e-05, + "loss": 0.9835, + "step": 6515 + }, + { + "epoch": 0.5822145777023253, + "grad_norm": 0.5301873087882996, + "learning_rate": 3.9196660811175685e-05, + "loss": 0.8416, + "step": 6516 + }, + { + "epoch": 0.5823039292335872, + "grad_norm": 0.557282030582428, + "learning_rate": 3.918253236563648e-05, + "loss": 0.8892, + "step": 6517 + }, + { + "epoch": 0.5823932807648491, + "grad_norm": 0.584327220916748, + "learning_rate": 3.9168404826175486e-05, + "loss": 0.9241, + "step": 6518 + }, + { + "epoch": 0.582482632296111, + "grad_norm": 0.42827340960502625, + "learning_rate": 3.9154278193976066e-05, + "loss": 0.92, + "step": 6519 + }, + { + "epoch": 0.5825719838273729, + "grad_norm": 0.43465957045555115, + "learning_rate": 3.914015247022144e-05, + "loss": 0.9636, + "step": 6520 + }, + { + "epoch": 0.5826613353586347, + "grad_norm": 0.4111988842487335, + "learning_rate": 3.9126027656094806e-05, + "loss": 0.9871, + "step": 6521 + }, + { + "epoch": 0.5827506868898966, + "grad_norm": 0.4078396260738373, + "learning_rate": 3.9111903752779263e-05, + "loss": 0.9392, + "step": 6522 + }, + { + "epoch": 0.5828400384211584, + "grad_norm": 0.49271008372306824, + "learning_rate": 3.909778076145785e-05, + "loss": 0.9823, + "step": 6523 + }, + { + "epoch": 0.5829293899524203, + "grad_norm": 0.4981904625892639, + "learning_rate": 3.90836586833135e-05, + "loss": 0.9286, + "step": 6524 + }, + { + "epoch": 0.5830187414836822, + "grad_norm": 0.5224156379699707, + "learning_rate": 3.906953751952909e-05, + "loss": 0.9063, + "step": 6525 + }, + { + "epoch": 0.583108093014944, + "grad_norm": 0.5273329615592957, + "learning_rate": 3.9055417271287426e-05, + "loss": 0.999, + "step": 6526 + }, + { + "epoch": 0.5831974445462059, + "grad_norm": 0.5131478905677795, + "learning_rate": 3.9041297939771224e-05, + "loss": 0.9669, + "step": 6527 + }, + { + "epoch": 0.5832867960774678, + "grad_norm": 0.5106601119041443, + "learning_rate": 3.9027179526163125e-05, + "loss": 0.8704, + "step": 6528 + }, + { + "epoch": 0.5833761476087297, + "grad_norm": 0.5162898302078247, + "learning_rate": 3.901306203164571e-05, + "loss": 0.9613, + "step": 6529 + }, + { + "epoch": 0.5834654991399915, + "grad_norm": 0.42621272802352905, + "learning_rate": 3.899894545740146e-05, + "loss": 0.9468, + "step": 6530 + }, + { + "epoch": 0.5835548506712533, + "grad_norm": 0.47468671202659607, + "learning_rate": 3.898482980461279e-05, + "loss": 0.9432, + "step": 6531 + }, + { + "epoch": 0.5836442022025152, + "grad_norm": 0.4372991919517517, + "learning_rate": 3.897071507446204e-05, + "loss": 0.9773, + "step": 6532 + }, + { + "epoch": 0.5837335537337771, + "grad_norm": 0.4610428810119629, + "learning_rate": 3.8956601268131486e-05, + "loss": 0.9766, + "step": 6533 + }, + { + "epoch": 0.583822905265039, + "grad_norm": 0.4634798765182495, + "learning_rate": 3.894248838680327e-05, + "loss": 1.0252, + "step": 6534 + }, + { + "epoch": 0.5839122567963009, + "grad_norm": 0.46993133425712585, + "learning_rate": 3.8928376431659516e-05, + "loss": 0.8756, + "step": 6535 + }, + { + "epoch": 0.5840016083275628, + "grad_norm": 0.4452194273471832, + "learning_rate": 3.891426540388224e-05, + "loss": 0.9596, + "step": 6536 + }, + { + "epoch": 0.5840909598588245, + "grad_norm": 0.47519785165786743, + "learning_rate": 3.890015530465342e-05, + "loss": 0.9268, + "step": 6537 + }, + { + "epoch": 0.5841803113900864, + "grad_norm": 0.42138373851776123, + "learning_rate": 3.888604613515491e-05, + "loss": 0.9785, + "step": 6538 + }, + { + "epoch": 0.5842696629213483, + "grad_norm": 0.429235577583313, + "learning_rate": 3.887193789656849e-05, + "loss": 0.9077, + "step": 6539 + }, + { + "epoch": 0.5843590144526102, + "grad_norm": 0.4701876640319824, + "learning_rate": 3.8857830590075895e-05, + "loss": 0.9449, + "step": 6540 + }, + { + "epoch": 0.5844483659838721, + "grad_norm": 0.45711401104927063, + "learning_rate": 3.8843724216858745e-05, + "loss": 1.0787, + "step": 6541 + }, + { + "epoch": 0.584537717515134, + "grad_norm": 0.4365895390510559, + "learning_rate": 3.882961877809862e-05, + "loss": 0.9814, + "step": 6542 + }, + { + "epoch": 0.5846270690463958, + "grad_norm": 0.5129074454307556, + "learning_rate": 3.881551427497701e-05, + "loss": 0.9376, + "step": 6543 + }, + { + "epoch": 0.5847164205776576, + "grad_norm": 0.45288604497909546, + "learning_rate": 3.880141070867527e-05, + "loss": 0.9004, + "step": 6544 + }, + { + "epoch": 0.5848057721089195, + "grad_norm": 0.42734140157699585, + "learning_rate": 3.878730808037475e-05, + "loss": 1.0315, + "step": 6545 + }, + { + "epoch": 0.5848951236401814, + "grad_norm": 0.47614336013793945, + "learning_rate": 3.87732063912567e-05, + "loss": 1.0126, + "step": 6546 + }, + { + "epoch": 0.5849844751714433, + "grad_norm": 0.475925475358963, + "learning_rate": 3.875910564250229e-05, + "loss": 0.9627, + "step": 6547 + }, + { + "epoch": 0.5850738267027051, + "grad_norm": 0.4257443845272064, + "learning_rate": 3.874500583529259e-05, + "loss": 0.9535, + "step": 6548 + }, + { + "epoch": 0.585163178233967, + "grad_norm": 0.4719519317150116, + "learning_rate": 3.873090697080863e-05, + "loss": 0.9882, + "step": 6549 + }, + { + "epoch": 0.5852525297652289, + "grad_norm": 0.5359987020492554, + "learning_rate": 3.871680905023133e-05, + "loss": 0.9496, + "step": 6550 + }, + { + "epoch": 0.5853418812964907, + "grad_norm": 0.5468908548355103, + "learning_rate": 3.8702712074741534e-05, + "loss": 0.9608, + "step": 6551 + }, + { + "epoch": 0.5854312328277526, + "grad_norm": 0.5251081585884094, + "learning_rate": 3.868861604552004e-05, + "loss": 0.8878, + "step": 6552 + }, + { + "epoch": 0.5855205843590144, + "grad_norm": 0.4280673861503601, + "learning_rate": 3.8674520963747526e-05, + "loss": 0.916, + "step": 6553 + }, + { + "epoch": 0.5856099358902763, + "grad_norm": 0.4354718029499054, + "learning_rate": 3.866042683060459e-05, + "loss": 0.952, + "step": 6554 + }, + { + "epoch": 0.5856992874215382, + "grad_norm": 0.43345123529434204, + "learning_rate": 3.864633364727177e-05, + "loss": 0.9935, + "step": 6555 + }, + { + "epoch": 0.5857886389528001, + "grad_norm": 0.3917457163333893, + "learning_rate": 3.8632241414929536e-05, + "loss": 0.9337, + "step": 6556 + }, + { + "epoch": 0.585877990484062, + "grad_norm": 0.45168983936309814, + "learning_rate": 3.861815013475827e-05, + "loss": 0.9561, + "step": 6557 + }, + { + "epoch": 0.5859673420153237, + "grad_norm": 0.47507405281066895, + "learning_rate": 3.860405980793823e-05, + "loss": 0.9946, + "step": 6558 + }, + { + "epoch": 0.5860566935465856, + "grad_norm": 0.5725569725036621, + "learning_rate": 3.858997043564966e-05, + "loss": 0.9315, + "step": 6559 + }, + { + "epoch": 0.5861460450778475, + "grad_norm": 0.4958851635456085, + "learning_rate": 3.8575882019072683e-05, + "loss": 0.8849, + "step": 6560 + }, + { + "epoch": 0.5862353966091094, + "grad_norm": 0.4789509177207947, + "learning_rate": 3.8561794559387366e-05, + "loss": 0.9942, + "step": 6561 + }, + { + "epoch": 0.5863247481403713, + "grad_norm": 0.4200628697872162, + "learning_rate": 3.854770805777368e-05, + "loss": 0.9415, + "step": 6562 + }, + { + "epoch": 0.5864140996716332, + "grad_norm": 0.4809480607509613, + "learning_rate": 3.8533622515411525e-05, + "loss": 0.9314, + "step": 6563 + }, + { + "epoch": 0.586503451202895, + "grad_norm": 0.5276997685432434, + "learning_rate": 3.85195379334807e-05, + "loss": 0.9348, + "step": 6564 + }, + { + "epoch": 0.5865928027341568, + "grad_norm": 0.5005677938461304, + "learning_rate": 3.8505454313160935e-05, + "loss": 0.8954, + "step": 6565 + }, + { + "epoch": 0.5866821542654187, + "grad_norm": 0.6186678409576416, + "learning_rate": 3.8491371655631897e-05, + "loss": 0.9265, + "step": 6566 + }, + { + "epoch": 0.5867715057966806, + "grad_norm": 0.45182543992996216, + "learning_rate": 3.847728996207316e-05, + "loss": 1.0398, + "step": 6567 + }, + { + "epoch": 0.5868608573279425, + "grad_norm": 0.5082495212554932, + "learning_rate": 3.846320923366421e-05, + "loss": 0.9212, + "step": 6568 + }, + { + "epoch": 0.5869502088592043, + "grad_norm": 0.4350298345088959, + "learning_rate": 3.844912947158446e-05, + "loss": 0.9523, + "step": 6569 + }, + { + "epoch": 0.5870395603904662, + "grad_norm": 0.5526973605155945, + "learning_rate": 3.843505067701324e-05, + "loss": 0.8776, + "step": 6570 + }, + { + "epoch": 0.5871289119217281, + "grad_norm": 0.4889020323753357, + "learning_rate": 3.84209728511298e-05, + "loss": 1.0305, + "step": 6571 + }, + { + "epoch": 0.5872182634529899, + "grad_norm": 0.5451194643974304, + "learning_rate": 3.840689599511331e-05, + "loss": 0.9137, + "step": 6572 + }, + { + "epoch": 0.5873076149842518, + "grad_norm": 0.44288796186447144, + "learning_rate": 3.839282011014286e-05, + "loss": 0.8818, + "step": 6573 + }, + { + "epoch": 0.5873969665155137, + "grad_norm": 0.42640823125839233, + "learning_rate": 3.837874519739744e-05, + "loss": 0.9603, + "step": 6574 + }, + { + "epoch": 0.5874863180467755, + "grad_norm": 0.43930867314338684, + "learning_rate": 3.8364671258056e-05, + "loss": 1.0335, + "step": 6575 + }, + { + "epoch": 0.5875756695780374, + "grad_norm": 0.5153297185897827, + "learning_rate": 3.835059829329735e-05, + "loss": 1.0377, + "step": 6576 + }, + { + "epoch": 0.5876650211092993, + "grad_norm": 0.47636160254478455, + "learning_rate": 3.8336526304300265e-05, + "loss": 0.9774, + "step": 6577 + }, + { + "epoch": 0.5877543726405611, + "grad_norm": 0.44680356979370117, + "learning_rate": 3.832245529224342e-05, + "loss": 1.0178, + "step": 6578 + }, + { + "epoch": 0.587843724171823, + "grad_norm": 0.44048798084259033, + "learning_rate": 3.830838525830542e-05, + "loss": 1.0009, + "step": 6579 + }, + { + "epoch": 0.5879330757030848, + "grad_norm": 0.4684334993362427, + "learning_rate": 3.829431620366479e-05, + "loss": 0.8922, + "step": 6580 + }, + { + "epoch": 0.5880224272343467, + "grad_norm": 0.5170097351074219, + "learning_rate": 3.828024812949994e-05, + "loss": 0.9479, + "step": 6581 + }, + { + "epoch": 0.5881117787656086, + "grad_norm": 0.4309813976287842, + "learning_rate": 3.826618103698924e-05, + "loss": 0.9732, + "step": 6582 + }, + { + "epoch": 0.5882011302968705, + "grad_norm": 0.4656025469303131, + "learning_rate": 3.825211492731097e-05, + "loss": 0.9387, + "step": 6583 + }, + { + "epoch": 0.5882904818281324, + "grad_norm": 0.4610069990158081, + "learning_rate": 3.823804980164328e-05, + "loss": 1.0072, + "step": 6584 + }, + { + "epoch": 0.5883798333593941, + "grad_norm": 0.49006903171539307, + "learning_rate": 3.8223985661164284e-05, + "loss": 0.9517, + "step": 6585 + }, + { + "epoch": 0.588469184890656, + "grad_norm": 0.5057728290557861, + "learning_rate": 3.820992250705202e-05, + "loss": 0.9777, + "step": 6586 + }, + { + "epoch": 0.5885585364219179, + "grad_norm": 0.45757678151130676, + "learning_rate": 3.819586034048441e-05, + "loss": 0.9111, + "step": 6587 + }, + { + "epoch": 0.5886478879531798, + "grad_norm": 0.4634867012500763, + "learning_rate": 3.818179916263933e-05, + "loss": 0.9293, + "step": 6588 + }, + { + "epoch": 0.5887372394844417, + "grad_norm": 0.42738625407218933, + "learning_rate": 3.816773897469454e-05, + "loss": 1.005, + "step": 6589 + }, + { + "epoch": 0.5888265910157036, + "grad_norm": 0.496150404214859, + "learning_rate": 3.815367977782774e-05, + "loss": 0.878, + "step": 6590 + }, + { + "epoch": 0.5889159425469654, + "grad_norm": 0.4651472866535187, + "learning_rate": 3.813962157321653e-05, + "loss": 0.991, + "step": 6591 + }, + { + "epoch": 0.5890052940782272, + "grad_norm": 0.49362361431121826, + "learning_rate": 3.812556436203843e-05, + "loss": 0.9222, + "step": 6592 + }, + { + "epoch": 0.5890946456094891, + "grad_norm": 0.4723232686519623, + "learning_rate": 3.8111508145470886e-05, + "loss": 0.9071, + "step": 6593 + }, + { + "epoch": 0.589183997140751, + "grad_norm": 0.4556770622730255, + "learning_rate": 3.809745292469128e-05, + "loss": 0.9447, + "step": 6594 + }, + { + "epoch": 0.5892733486720129, + "grad_norm": 0.49390918016433716, + "learning_rate": 3.808339870087684e-05, + "loss": 0.8855, + "step": 6595 + }, + { + "epoch": 0.5893627002032747, + "grad_norm": 0.4614836275577545, + "learning_rate": 3.8069345475204784e-05, + "loss": 0.9299, + "step": 6596 + }, + { + "epoch": 0.5894520517345366, + "grad_norm": 0.42079588770866394, + "learning_rate": 3.805529324885222e-05, + "loss": 0.9729, + "step": 6597 + }, + { + "epoch": 0.5895414032657985, + "grad_norm": 0.4124428629875183, + "learning_rate": 3.804124202299615e-05, + "loss": 0.9736, + "step": 6598 + }, + { + "epoch": 0.5896307547970603, + "grad_norm": 0.45447784662246704, + "learning_rate": 3.8027191798813546e-05, + "loss": 0.937, + "step": 6599 + }, + { + "epoch": 0.5897201063283222, + "grad_norm": 0.5072098970413208, + "learning_rate": 3.801314257748125e-05, + "loss": 0.9483, + "step": 6600 + }, + { + "epoch": 0.589809457859584, + "grad_norm": 0.42272502183914185, + "learning_rate": 3.799909436017604e-05, + "loss": 1.0475, + "step": 6601 + }, + { + "epoch": 0.5898988093908459, + "grad_norm": 0.44008708000183105, + "learning_rate": 3.7985047148074585e-05, + "loss": 0.9706, + "step": 6602 + }, + { + "epoch": 0.5899881609221078, + "grad_norm": 0.4524151086807251, + "learning_rate": 3.797100094235351e-05, + "loss": 0.9213, + "step": 6603 + }, + { + "epoch": 0.5900775124533697, + "grad_norm": 0.4076539874076843, + "learning_rate": 3.795695574418934e-05, + "loss": 1.0978, + "step": 6604 + }, + { + "epoch": 0.5901668639846316, + "grad_norm": 0.46805551648139954, + "learning_rate": 3.794291155475848e-05, + "loss": 0.8995, + "step": 6605 + }, + { + "epoch": 0.5902562155158934, + "grad_norm": 0.4466945230960846, + "learning_rate": 3.792886837523729e-05, + "loss": 0.9412, + "step": 6606 + }, + { + "epoch": 0.5903455670471552, + "grad_norm": 0.4632209837436676, + "learning_rate": 3.7914826206802047e-05, + "loss": 0.9332, + "step": 6607 + }, + { + "epoch": 0.5904349185784171, + "grad_norm": 0.49044013023376465, + "learning_rate": 3.790078505062894e-05, + "loss": 1.0105, + "step": 6608 + }, + { + "epoch": 0.590524270109679, + "grad_norm": 0.45329976081848145, + "learning_rate": 3.788674490789404e-05, + "loss": 1.0149, + "step": 6609 + }, + { + "epoch": 0.5906136216409409, + "grad_norm": 0.45417705178260803, + "learning_rate": 3.7872705779773376e-05, + "loss": 0.9508, + "step": 6610 + }, + { + "epoch": 0.5907029731722028, + "grad_norm": 0.4554515480995178, + "learning_rate": 3.785866766744287e-05, + "loss": 0.9459, + "step": 6611 + }, + { + "epoch": 0.5907923247034647, + "grad_norm": 0.4931671619415283, + "learning_rate": 3.784463057207836e-05, + "loss": 0.9422, + "step": 6612 + }, + { + "epoch": 0.5908816762347264, + "grad_norm": 0.5159240961074829, + "learning_rate": 3.783059449485561e-05, + "loss": 0.9961, + "step": 6613 + }, + { + "epoch": 0.5909710277659883, + "grad_norm": 0.4163471758365631, + "learning_rate": 3.78165594369503e-05, + "loss": 1.0271, + "step": 6614 + }, + { + "epoch": 0.5910603792972502, + "grad_norm": 0.4183323383331299, + "learning_rate": 3.7802525399538e-05, + "loss": 1.0124, + "step": 6615 + }, + { + "epoch": 0.5911497308285121, + "grad_norm": 0.46458837389945984, + "learning_rate": 3.77884923837942e-05, + "loss": 0.952, + "step": 6616 + }, + { + "epoch": 0.591239082359774, + "grad_norm": 0.3890257775783539, + "learning_rate": 3.777446039089433e-05, + "loss": 0.9898, + "step": 6617 + }, + { + "epoch": 0.5913284338910358, + "grad_norm": 0.4664503335952759, + "learning_rate": 3.776042942201372e-05, + "loss": 0.9048, + "step": 6618 + }, + { + "epoch": 0.5914177854222977, + "grad_norm": 0.5847722887992859, + "learning_rate": 3.774639947832761e-05, + "loss": 0.8766, + "step": 6619 + }, + { + "epoch": 0.5915071369535595, + "grad_norm": 0.4752833843231201, + "learning_rate": 3.7732370561011154e-05, + "loss": 0.9276, + "step": 6620 + }, + { + "epoch": 0.5915964884848214, + "grad_norm": 0.4764302670955658, + "learning_rate": 3.771834267123943e-05, + "loss": 0.9594, + "step": 6621 + }, + { + "epoch": 0.5916858400160833, + "grad_norm": 0.40147051215171814, + "learning_rate": 3.770431581018743e-05, + "loss": 0.9262, + "step": 6622 + }, + { + "epoch": 0.5917751915473451, + "grad_norm": 0.4563705623149872, + "learning_rate": 3.769028997903003e-05, + "loss": 0.9904, + "step": 6623 + }, + { + "epoch": 0.591864543078607, + "grad_norm": 0.4160234034061432, + "learning_rate": 3.7676265178942074e-05, + "loss": 1.0111, + "step": 6624 + }, + { + "epoch": 0.5919538946098689, + "grad_norm": 0.426800400018692, + "learning_rate": 3.766224141109825e-05, + "loss": 0.9869, + "step": 6625 + }, + { + "epoch": 0.5920432461411308, + "grad_norm": 0.43789252638816833, + "learning_rate": 3.764821867667323e-05, + "loss": 0.9818, + "step": 6626 + }, + { + "epoch": 0.5921325976723926, + "grad_norm": 0.5048993229866028, + "learning_rate": 3.763419697684156e-05, + "loss": 0.8763, + "step": 6627 + }, + { + "epoch": 0.5922219492036545, + "grad_norm": 0.5642315745353699, + "learning_rate": 3.76201763127777e-05, + "loss": 0.9, + "step": 6628 + }, + { + "epoch": 0.5923113007349163, + "grad_norm": 0.4938600957393646, + "learning_rate": 3.7606156685656026e-05, + "loss": 0.924, + "step": 6629 + }, + { + "epoch": 0.5924006522661782, + "grad_norm": 0.6127682328224182, + "learning_rate": 3.759213809665084e-05, + "loss": 0.9511, + "step": 6630 + }, + { + "epoch": 0.5924900037974401, + "grad_norm": 0.5626466870307922, + "learning_rate": 3.757812054693634e-05, + "loss": 0.9138, + "step": 6631 + }, + { + "epoch": 0.592579355328702, + "grad_norm": 0.4270266592502594, + "learning_rate": 3.756410403768667e-05, + "loss": 0.9399, + "step": 6632 + }, + { + "epoch": 0.5926687068599639, + "grad_norm": 0.4422089159488678, + "learning_rate": 3.755008857007583e-05, + "loss": 0.9363, + "step": 6633 + }, + { + "epoch": 0.5927580583912256, + "grad_norm": 0.4473762512207031, + "learning_rate": 3.75360741452778e-05, + "loss": 0.9541, + "step": 6634 + }, + { + "epoch": 0.5928474099224875, + "grad_norm": 0.44996803998947144, + "learning_rate": 3.752206076446641e-05, + "loss": 0.9283, + "step": 6635 + }, + { + "epoch": 0.5929367614537494, + "grad_norm": 0.544922411441803, + "learning_rate": 3.7508048428815416e-05, + "loss": 0.893, + "step": 6636 + }, + { + "epoch": 0.5930261129850113, + "grad_norm": 0.38203221559524536, + "learning_rate": 3.7494037139498525e-05, + "loss": 0.9555, + "step": 6637 + }, + { + "epoch": 0.5931154645162732, + "grad_norm": 0.47850242257118225, + "learning_rate": 3.748002689768934e-05, + "loss": 0.939, + "step": 6638 + }, + { + "epoch": 0.593204816047535, + "grad_norm": 0.43377867341041565, + "learning_rate": 3.7466017704561345e-05, + "loss": 0.9497, + "step": 6639 + }, + { + "epoch": 0.5932941675787968, + "grad_norm": 0.4623333215713501, + "learning_rate": 3.745200956128797e-05, + "loss": 0.8625, + "step": 6640 + }, + { + "epoch": 0.5933835191100587, + "grad_norm": 0.4614277482032776, + "learning_rate": 3.7438002469042565e-05, + "loss": 0.9202, + "step": 6641 + }, + { + "epoch": 0.5934728706413206, + "grad_norm": 0.4852299988269806, + "learning_rate": 3.742399642899833e-05, + "loss": 0.9428, + "step": 6642 + }, + { + "epoch": 0.5935622221725825, + "grad_norm": 0.46920087933540344, + "learning_rate": 3.740999144232846e-05, + "loss": 0.9912, + "step": 6643 + }, + { + "epoch": 0.5936515737038444, + "grad_norm": 0.4522823393344879, + "learning_rate": 3.739598751020601e-05, + "loss": 0.9731, + "step": 6644 + }, + { + "epoch": 0.5937409252351062, + "grad_norm": 0.4409060478210449, + "learning_rate": 3.7381984633803955e-05, + "loss": 0.9657, + "step": 6645 + }, + { + "epoch": 0.5938302767663681, + "grad_norm": 0.45765984058380127, + "learning_rate": 3.7367982814295174e-05, + "loss": 0.9724, + "step": 6646 + }, + { + "epoch": 0.5939196282976299, + "grad_norm": 0.5184260606765747, + "learning_rate": 3.735398205285248e-05, + "loss": 0.9308, + "step": 6647 + }, + { + "epoch": 0.5940089798288918, + "grad_norm": 0.44960641860961914, + "learning_rate": 3.733998235064858e-05, + "loss": 0.9715, + "step": 6648 + }, + { + "epoch": 0.5940983313601537, + "grad_norm": 0.416469931602478, + "learning_rate": 3.732598370885612e-05, + "loss": 0.9493, + "step": 6649 + }, + { + "epoch": 0.5941876828914155, + "grad_norm": 0.4398280382156372, + "learning_rate": 3.73119861286476e-05, + "loss": 0.996, + "step": 6650 + }, + { + "epoch": 0.5942770344226774, + "grad_norm": 0.4765319526195526, + "learning_rate": 3.7297989611195506e-05, + "loss": 0.9283, + "step": 6651 + }, + { + "epoch": 0.5943663859539393, + "grad_norm": 0.4116184115409851, + "learning_rate": 3.728399415767216e-05, + "loss": 0.9913, + "step": 6652 + }, + { + "epoch": 0.5944557374852012, + "grad_norm": 0.43389296531677246, + "learning_rate": 3.7269999769249855e-05, + "loss": 0.9189, + "step": 6653 + }, + { + "epoch": 0.594545089016463, + "grad_norm": 0.46604517102241516, + "learning_rate": 3.725600644710078e-05, + "loss": 0.8957, + "step": 6654 + }, + { + "epoch": 0.5946344405477249, + "grad_norm": 0.509748101234436, + "learning_rate": 3.724201419239699e-05, + "loss": 0.895, + "step": 6655 + }, + { + "epoch": 0.5947237920789867, + "grad_norm": 0.4493167996406555, + "learning_rate": 3.722802300631049e-05, + "loss": 0.9533, + "step": 6656 + }, + { + "epoch": 0.5948131436102486, + "grad_norm": 0.5172920823097229, + "learning_rate": 3.721403289001321e-05, + "loss": 0.9407, + "step": 6657 + }, + { + "epoch": 0.5949024951415105, + "grad_norm": 0.45403075218200684, + "learning_rate": 3.720004384467697e-05, + "loss": 0.98, + "step": 6658 + }, + { + "epoch": 0.5949918466727724, + "grad_norm": 0.5167436003684998, + "learning_rate": 3.718605587147348e-05, + "loss": 0.972, + "step": 6659 + }, + { + "epoch": 0.5950811982040343, + "grad_norm": 0.4543740153312683, + "learning_rate": 3.7172068971574426e-05, + "loss": 0.9288, + "step": 6660 + }, + { + "epoch": 0.595170549735296, + "grad_norm": 0.401083767414093, + "learning_rate": 3.715808314615131e-05, + "loss": 0.9727, + "step": 6661 + }, + { + "epoch": 0.5952599012665579, + "grad_norm": 0.4421873986721039, + "learning_rate": 3.714409839637562e-05, + "loss": 0.96, + "step": 6662 + }, + { + "epoch": 0.5953492527978198, + "grad_norm": 0.436882883310318, + "learning_rate": 3.713011472341872e-05, + "loss": 0.947, + "step": 6663 + }, + { + "epoch": 0.5954386043290817, + "grad_norm": 0.5699417591094971, + "learning_rate": 3.711613212845192e-05, + "loss": 0.9443, + "step": 6664 + }, + { + "epoch": 0.5955279558603436, + "grad_norm": 0.48340317606925964, + "learning_rate": 3.7102150612646356e-05, + "loss": 0.9616, + "step": 6665 + }, + { + "epoch": 0.5956173073916055, + "grad_norm": 0.6746427416801453, + "learning_rate": 3.708817017717317e-05, + "loss": 0.8092, + "step": 6666 + }, + { + "epoch": 0.5957066589228673, + "grad_norm": 0.5548288822174072, + "learning_rate": 3.707419082320336e-05, + "loss": 0.9948, + "step": 6667 + }, + { + "epoch": 0.5957960104541291, + "grad_norm": 0.4507887363433838, + "learning_rate": 3.7060212551907845e-05, + "loss": 0.9182, + "step": 6668 + }, + { + "epoch": 0.595885361985391, + "grad_norm": 0.43728700280189514, + "learning_rate": 3.704623536445746e-05, + "loss": 0.9271, + "step": 6669 + }, + { + "epoch": 0.5959747135166529, + "grad_norm": 0.43481287360191345, + "learning_rate": 3.7032259262022936e-05, + "loss": 1.0612, + "step": 6670 + }, + { + "epoch": 0.5960640650479148, + "grad_norm": 0.4465053081512451, + "learning_rate": 3.7018284245774925e-05, + "loss": 0.9987, + "step": 6671 + }, + { + "epoch": 0.5961534165791766, + "grad_norm": 0.50724196434021, + "learning_rate": 3.700431031688399e-05, + "loss": 0.9271, + "step": 6672 + }, + { + "epoch": 0.5962427681104385, + "grad_norm": 0.3783160150051117, + "learning_rate": 3.699033747652059e-05, + "loss": 0.9373, + "step": 6673 + }, + { + "epoch": 0.5963321196417004, + "grad_norm": 0.4265550971031189, + "learning_rate": 3.697636572585511e-05, + "loss": 0.923, + "step": 6674 + }, + { + "epoch": 0.5964214711729622, + "grad_norm": 0.4511755406856537, + "learning_rate": 3.6962395066057806e-05, + "loss": 0.9411, + "step": 6675 + }, + { + "epoch": 0.5965108227042241, + "grad_norm": 0.398965984582901, + "learning_rate": 3.694842549829889e-05, + "loss": 1.0101, + "step": 6676 + }, + { + "epoch": 0.596600174235486, + "grad_norm": 0.48364296555519104, + "learning_rate": 3.693445702374846e-05, + "loss": 0.9362, + "step": 6677 + }, + { + "epoch": 0.5966895257667478, + "grad_norm": 0.4517565071582794, + "learning_rate": 3.692048964357653e-05, + "loss": 0.9031, + "step": 6678 + }, + { + "epoch": 0.5967788772980097, + "grad_norm": 0.42774462699890137, + "learning_rate": 3.690652335895299e-05, + "loss": 0.9305, + "step": 6679 + }, + { + "epoch": 0.5968682288292716, + "grad_norm": 0.5115528702735901, + "learning_rate": 3.68925581710477e-05, + "loss": 0.8811, + "step": 6680 + }, + { + "epoch": 0.5969575803605335, + "grad_norm": 0.4454442262649536, + "learning_rate": 3.687859408103037e-05, + "loss": 0.9036, + "step": 6681 + }, + { + "epoch": 0.5970469318917953, + "grad_norm": 0.5473392009735107, + "learning_rate": 3.6864631090070655e-05, + "loss": 0.8625, + "step": 6682 + }, + { + "epoch": 0.5971362834230571, + "grad_norm": 0.4324687719345093, + "learning_rate": 3.6850669199338096e-05, + "loss": 0.9759, + "step": 6683 + }, + { + "epoch": 0.597225634954319, + "grad_norm": 0.4886661767959595, + "learning_rate": 3.683670841000215e-05, + "loss": 0.9272, + "step": 6684 + }, + { + "epoch": 0.5973149864855809, + "grad_norm": 0.4877077341079712, + "learning_rate": 3.682274872323221e-05, + "loss": 0.8533, + "step": 6685 + }, + { + "epoch": 0.5974043380168428, + "grad_norm": 0.42951878905296326, + "learning_rate": 3.680879014019751e-05, + "loss": 0.9555, + "step": 6686 + }, + { + "epoch": 0.5974936895481047, + "grad_norm": 0.5074982047080994, + "learning_rate": 3.679483266206723e-05, + "loss": 0.9338, + "step": 6687 + }, + { + "epoch": 0.5975830410793666, + "grad_norm": 0.4952092170715332, + "learning_rate": 3.678087629001048e-05, + "loss": 1.0184, + "step": 6688 + }, + { + "epoch": 0.5976723926106283, + "grad_norm": 0.45204105973243713, + "learning_rate": 3.676692102519625e-05, + "loss": 0.9575, + "step": 6689 + }, + { + "epoch": 0.5977617441418902, + "grad_norm": 0.5234025120735168, + "learning_rate": 3.675296686879343e-05, + "loss": 0.9418, + "step": 6690 + }, + { + "epoch": 0.5978510956731521, + "grad_norm": 0.4052625298500061, + "learning_rate": 3.6739013821970846e-05, + "loss": 0.949, + "step": 6691 + }, + { + "epoch": 0.597940447204414, + "grad_norm": 0.5295931696891785, + "learning_rate": 3.67250618858972e-05, + "loss": 0.9215, + "step": 6692 + }, + { + "epoch": 0.5980297987356759, + "grad_norm": 0.4193294942378998, + "learning_rate": 3.671111106174113e-05, + "loss": 0.9675, + "step": 6693 + }, + { + "epoch": 0.5981191502669377, + "grad_norm": 0.4514252841472626, + "learning_rate": 3.669716135067116e-05, + "loss": 0.9491, + "step": 6694 + }, + { + "epoch": 0.5982085017981996, + "grad_norm": 0.38525697588920593, + "learning_rate": 3.6683212753855726e-05, + "loss": 0.962, + "step": 6695 + }, + { + "epoch": 0.5982978533294614, + "grad_norm": 0.49608129262924194, + "learning_rate": 3.666926527246316e-05, + "loss": 0.9742, + "step": 6696 + }, + { + "epoch": 0.5983872048607233, + "grad_norm": 0.5682451128959656, + "learning_rate": 3.6655318907661726e-05, + "loss": 0.879, + "step": 6697 + }, + { + "epoch": 0.5984765563919852, + "grad_norm": 0.3935823142528534, + "learning_rate": 3.664137366061958e-05, + "loss": 0.9591, + "step": 6698 + }, + { + "epoch": 0.598565907923247, + "grad_norm": 0.4220222234725952, + "learning_rate": 3.662742953250478e-05, + "loss": 1.0057, + "step": 6699 + }, + { + "epoch": 0.5986552594545089, + "grad_norm": 0.4496472179889679, + "learning_rate": 3.6613486524485294e-05, + "loss": 0.9168, + "step": 6700 + }, + { + "epoch": 0.5987446109857708, + "grad_norm": 0.40350624918937683, + "learning_rate": 3.6599544637729007e-05, + "loss": 1.0385, + "step": 6701 + }, + { + "epoch": 0.5988339625170326, + "grad_norm": 0.42951273918151855, + "learning_rate": 3.65856038734037e-05, + "loss": 0.9669, + "step": 6702 + }, + { + "epoch": 0.5989233140482945, + "grad_norm": 0.4338366687297821, + "learning_rate": 3.657166423267704e-05, + "loss": 0.9676, + "step": 6703 + }, + { + "epoch": 0.5990126655795563, + "grad_norm": 0.5033963918685913, + "learning_rate": 3.655772571671664e-05, + "loss": 0.9289, + "step": 6704 + }, + { + "epoch": 0.5991020171108182, + "grad_norm": 0.48832231760025024, + "learning_rate": 3.654378832669002e-05, + "loss": 0.8811, + "step": 6705 + }, + { + "epoch": 0.5991913686420801, + "grad_norm": 0.5550376176834106, + "learning_rate": 3.6529852063764545e-05, + "loss": 0.8869, + "step": 6706 + }, + { + "epoch": 0.599280720173342, + "grad_norm": 0.46830353140830994, + "learning_rate": 3.651591692910754e-05, + "loss": 0.9608, + "step": 6707 + }, + { + "epoch": 0.5993700717046039, + "grad_norm": 0.4385646879673004, + "learning_rate": 3.650198292388621e-05, + "loss": 1.0007, + "step": 6708 + }, + { + "epoch": 0.5994594232358657, + "grad_norm": 0.5016947984695435, + "learning_rate": 3.64880500492677e-05, + "loss": 1.0079, + "step": 6709 + }, + { + "epoch": 0.5995487747671275, + "grad_norm": 0.38246116042137146, + "learning_rate": 3.647411830641903e-05, + "loss": 0.9701, + "step": 6710 + }, + { + "epoch": 0.5996381262983894, + "grad_norm": 0.6153517365455627, + "learning_rate": 3.646018769650713e-05, + "loss": 0.8733, + "step": 6711 + }, + { + "epoch": 0.5997274778296513, + "grad_norm": 0.5294399857521057, + "learning_rate": 3.6446258220698814e-05, + "loss": 0.9023, + "step": 6712 + }, + { + "epoch": 0.5998168293609132, + "grad_norm": 0.4735589623451233, + "learning_rate": 3.643232988016086e-05, + "loss": 0.9632, + "step": 6713 + }, + { + "epoch": 0.5999061808921751, + "grad_norm": 0.4517965018749237, + "learning_rate": 3.641840267605989e-05, + "loss": 0.95, + "step": 6714 + }, + { + "epoch": 0.599995532423437, + "grad_norm": 0.5445354580879211, + "learning_rate": 3.640447660956249e-05, + "loss": 0.8805, + "step": 6715 + }, + { + "epoch": 0.6000848839546987, + "grad_norm": 0.4683813452720642, + "learning_rate": 3.639055168183507e-05, + "loss": 0.9601, + "step": 6716 + }, + { + "epoch": 0.6001742354859606, + "grad_norm": 0.5692836046218872, + "learning_rate": 3.637662789404402e-05, + "loss": 0.9754, + "step": 6717 + }, + { + "epoch": 0.6002635870172225, + "grad_norm": 0.47742897272109985, + "learning_rate": 3.636270524735559e-05, + "loss": 0.956, + "step": 6718 + }, + { + "epoch": 0.6003529385484844, + "grad_norm": 0.5341329574584961, + "learning_rate": 3.6348783742935966e-05, + "loss": 0.9581, + "step": 6719 + }, + { + "epoch": 0.6004422900797463, + "grad_norm": 0.45011308789253235, + "learning_rate": 3.6334863381951214e-05, + "loss": 0.9189, + "step": 6720 + }, + { + "epoch": 0.6005316416110081, + "grad_norm": 0.4406468868255615, + "learning_rate": 3.632094416556731e-05, + "loss": 0.9414, + "step": 6721 + }, + { + "epoch": 0.60062099314227, + "grad_norm": 0.46647706627845764, + "learning_rate": 3.630702609495014e-05, + "loss": 0.893, + "step": 6722 + }, + { + "epoch": 0.6007103446735318, + "grad_norm": 0.3957505226135254, + "learning_rate": 3.6293109171265486e-05, + "loss": 0.9984, + "step": 6723 + }, + { + "epoch": 0.6007996962047937, + "grad_norm": 0.5789626836776733, + "learning_rate": 3.627919339567906e-05, + "loss": 0.9168, + "step": 6724 + }, + { + "epoch": 0.6008890477360556, + "grad_norm": 0.4705089032649994, + "learning_rate": 3.626527876935645e-05, + "loss": 0.9538, + "step": 6725 + }, + { + "epoch": 0.6009783992673174, + "grad_norm": 0.4237965941429138, + "learning_rate": 3.625136529346312e-05, + "loss": 1.0059, + "step": 6726 + }, + { + "epoch": 0.6010677507985793, + "grad_norm": 0.423532634973526, + "learning_rate": 3.6237452969164495e-05, + "loss": 0.9481, + "step": 6727 + }, + { + "epoch": 0.6011571023298412, + "grad_norm": 0.5018135905265808, + "learning_rate": 3.622354179762589e-05, + "loss": 0.9048, + "step": 6728 + }, + { + "epoch": 0.6012464538611031, + "grad_norm": 0.4970499873161316, + "learning_rate": 3.620963178001251e-05, + "loss": 0.8817, + "step": 6729 + }, + { + "epoch": 0.6013358053923649, + "grad_norm": 0.5816099047660828, + "learning_rate": 3.619572291748947e-05, + "loss": 0.9101, + "step": 6730 + }, + { + "epoch": 0.6014251569236267, + "grad_norm": 0.43934929370880127, + "learning_rate": 3.618181521122176e-05, + "loss": 0.9431, + "step": 6731 + }, + { + "epoch": 0.6015145084548886, + "grad_norm": 0.503680408000946, + "learning_rate": 3.616790866237433e-05, + "loss": 0.9065, + "step": 6732 + }, + { + "epoch": 0.6016038599861505, + "grad_norm": 0.6071944832801819, + "learning_rate": 3.615400327211198e-05, + "loss": 0.9336, + "step": 6733 + }, + { + "epoch": 0.6016932115174124, + "grad_norm": 0.4700120687484741, + "learning_rate": 3.614009904159945e-05, + "loss": 0.9705, + "step": 6734 + }, + { + "epoch": 0.6017825630486743, + "grad_norm": 0.4359018802642822, + "learning_rate": 3.6126195972001376e-05, + "loss": 0.968, + "step": 6735 + }, + { + "epoch": 0.6018719145799362, + "grad_norm": 0.3945593237876892, + "learning_rate": 3.6112294064482253e-05, + "loss": 1.0053, + "step": 6736 + }, + { + "epoch": 0.6019612661111979, + "grad_norm": 0.3968038558959961, + "learning_rate": 3.6098393320206536e-05, + "loss": 0.9179, + "step": 6737 + }, + { + "epoch": 0.6020506176424598, + "grad_norm": 0.4622810184955597, + "learning_rate": 3.608449374033856e-05, + "loss": 0.9345, + "step": 6738 + }, + { + "epoch": 0.6021399691737217, + "grad_norm": 0.45781803131103516, + "learning_rate": 3.607059532604256e-05, + "loss": 0.9804, + "step": 6739 + }, + { + "epoch": 0.6022293207049836, + "grad_norm": 0.43766266107559204, + "learning_rate": 3.6056698078482676e-05, + "loss": 0.9481, + "step": 6740 + }, + { + "epoch": 0.6023186722362455, + "grad_norm": 0.3953046202659607, + "learning_rate": 3.604280199882296e-05, + "loss": 0.9496, + "step": 6741 + }, + { + "epoch": 0.6024080237675074, + "grad_norm": 0.5054026246070862, + "learning_rate": 3.602890708822735e-05, + "loss": 0.9, + "step": 6742 + }, + { + "epoch": 0.6024973752987692, + "grad_norm": 0.43824490904808044, + "learning_rate": 3.601501334785968e-05, + "loss": 0.9556, + "step": 6743 + }, + { + "epoch": 0.602586726830031, + "grad_norm": 0.4914816915988922, + "learning_rate": 3.600112077888374e-05, + "loss": 1.0054, + "step": 6744 + }, + { + "epoch": 0.6026760783612929, + "grad_norm": 0.4403778910636902, + "learning_rate": 3.598722938246314e-05, + "loss": 0.9259, + "step": 6745 + }, + { + "epoch": 0.6027654298925548, + "grad_norm": 0.4326126277446747, + "learning_rate": 3.5973339159761435e-05, + "loss": 0.9524, + "step": 6746 + }, + { + "epoch": 0.6028547814238167, + "grad_norm": 0.4220033586025238, + "learning_rate": 3.595945011194208e-05, + "loss": 1.0042, + "step": 6747 + }, + { + "epoch": 0.6029441329550785, + "grad_norm": 0.5944996476173401, + "learning_rate": 3.594556224016847e-05, + "loss": 0.9395, + "step": 6748 + }, + { + "epoch": 0.6030334844863404, + "grad_norm": 0.4896973967552185, + "learning_rate": 3.593167554560381e-05, + "loss": 0.9024, + "step": 6749 + }, + { + "epoch": 0.6031228360176023, + "grad_norm": 0.5966628789901733, + "learning_rate": 3.591779002941128e-05, + "loss": 0.9577, + "step": 6750 + }, + { + "epoch": 0.6032121875488641, + "grad_norm": 0.4684157073497772, + "learning_rate": 3.590390569275395e-05, + "loss": 0.9349, + "step": 6751 + }, + { + "epoch": 0.603301539080126, + "grad_norm": 0.5347898006439209, + "learning_rate": 3.589002253679476e-05, + "loss": 0.9619, + "step": 6752 + }, + { + "epoch": 0.6033908906113878, + "grad_norm": 0.4841513931751251, + "learning_rate": 3.5876140562696594e-05, + "loss": 0.8933, + "step": 6753 + }, + { + "epoch": 0.6034802421426497, + "grad_norm": 0.48916760087013245, + "learning_rate": 3.58622597716222e-05, + "loss": 0.9148, + "step": 6754 + }, + { + "epoch": 0.6035695936739116, + "grad_norm": 0.440901517868042, + "learning_rate": 3.584838016473426e-05, + "loss": 0.9739, + "step": 6755 + }, + { + "epoch": 0.6036589452051735, + "grad_norm": 0.39379745721817017, + "learning_rate": 3.5834501743195314e-05, + "loss": 1.0049, + "step": 6756 + }, + { + "epoch": 0.6037482967364354, + "grad_norm": 0.6075735688209534, + "learning_rate": 3.582062450816784e-05, + "loss": 0.8999, + "step": 6757 + }, + { + "epoch": 0.6038376482676971, + "grad_norm": 0.4395672082901001, + "learning_rate": 3.580674846081421e-05, + "loss": 0.894, + "step": 6758 + }, + { + "epoch": 0.603926999798959, + "grad_norm": 0.5487232208251953, + "learning_rate": 3.579287360229668e-05, + "loss": 0.9693, + "step": 6759 + }, + { + "epoch": 0.6040163513302209, + "grad_norm": 0.44997963309288025, + "learning_rate": 3.5778999933777423e-05, + "loss": 0.9385, + "step": 6760 + }, + { + "epoch": 0.6041057028614828, + "grad_norm": 0.49523553252220154, + "learning_rate": 3.5765127456418514e-05, + "loss": 0.8897, + "step": 6761 + }, + { + "epoch": 0.6041950543927447, + "grad_norm": 0.4439547657966614, + "learning_rate": 3.57512561713819e-05, + "loss": 0.9414, + "step": 6762 + }, + { + "epoch": 0.6042844059240066, + "grad_norm": 0.5505682826042175, + "learning_rate": 3.5737386079829484e-05, + "loss": 0.8705, + "step": 6763 + }, + { + "epoch": 0.6043737574552683, + "grad_norm": 0.42850804328918457, + "learning_rate": 3.5723517182923e-05, + "loss": 0.9114, + "step": 6764 + }, + { + "epoch": 0.6044631089865302, + "grad_norm": 0.40083956718444824, + "learning_rate": 3.570964948182412e-05, + "loss": 0.9795, + "step": 6765 + }, + { + "epoch": 0.6045524605177921, + "grad_norm": 0.4469483196735382, + "learning_rate": 3.5695782977694436e-05, + "loss": 1.0354, + "step": 6766 + }, + { + "epoch": 0.604641812049054, + "grad_norm": 0.5799636840820312, + "learning_rate": 3.56819176716954e-05, + "loss": 0.8358, + "step": 6767 + }, + { + "epoch": 0.6047311635803159, + "grad_norm": 0.4821333587169647, + "learning_rate": 3.566805356498837e-05, + "loss": 0.8875, + "step": 6768 + }, + { + "epoch": 0.6048205151115778, + "grad_norm": 0.4493705928325653, + "learning_rate": 3.5654190658734624e-05, + "loss": 0.971, + "step": 6769 + }, + { + "epoch": 0.6049098666428396, + "grad_norm": 0.4657124876976013, + "learning_rate": 3.564032895409532e-05, + "loss": 0.9372, + "step": 6770 + }, + { + "epoch": 0.6049992181741014, + "grad_norm": 0.4851822555065155, + "learning_rate": 3.562646845223153e-05, + "loss": 0.9399, + "step": 6771 + }, + { + "epoch": 0.6050885697053633, + "grad_norm": 0.4866061210632324, + "learning_rate": 3.561260915430422e-05, + "loss": 0.9057, + "step": 6772 + }, + { + "epoch": 0.6051779212366252, + "grad_norm": 0.4425218403339386, + "learning_rate": 3.559875106147425e-05, + "loss": 0.9446, + "step": 6773 + }, + { + "epoch": 0.605267272767887, + "grad_norm": 0.49788007140159607, + "learning_rate": 3.5584894174902386e-05, + "loss": 0.934, + "step": 6774 + }, + { + "epoch": 0.6053566242991489, + "grad_norm": 0.4979552626609802, + "learning_rate": 3.557103849574929e-05, + "loss": 0.8783, + "step": 6775 + }, + { + "epoch": 0.6054459758304108, + "grad_norm": 0.4978952407836914, + "learning_rate": 3.5557184025175536e-05, + "loss": 0.9696, + "step": 6776 + }, + { + "epoch": 0.6055353273616727, + "grad_norm": 0.5217999219894409, + "learning_rate": 3.554333076434156e-05, + "loss": 0.8745, + "step": 6777 + }, + { + "epoch": 0.6056246788929345, + "grad_norm": 0.4879658818244934, + "learning_rate": 3.552947871440772e-05, + "loss": 0.9278, + "step": 6778 + }, + { + "epoch": 0.6057140304241964, + "grad_norm": 0.3954866826534271, + "learning_rate": 3.551562787653429e-05, + "loss": 0.943, + "step": 6779 + }, + { + "epoch": 0.6058033819554582, + "grad_norm": 0.41415131092071533, + "learning_rate": 3.550177825188141e-05, + "loss": 0.9986, + "step": 6780 + }, + { + "epoch": 0.6058927334867201, + "grad_norm": 0.4643884003162384, + "learning_rate": 3.5487929841609154e-05, + "loss": 0.8917, + "step": 6781 + }, + { + "epoch": 0.605982085017982, + "grad_norm": 0.5743858218193054, + "learning_rate": 3.5474082646877446e-05, + "loss": 0.9231, + "step": 6782 + }, + { + "epoch": 0.6060714365492439, + "grad_norm": 0.46073269844055176, + "learning_rate": 3.546023666884616e-05, + "loss": 0.9074, + "step": 6783 + }, + { + "epoch": 0.6061607880805058, + "grad_norm": 0.3921644389629364, + "learning_rate": 3.5446391908675033e-05, + "loss": 0.9552, + "step": 6784 + }, + { + "epoch": 0.6062501396117675, + "grad_norm": 0.4658121168613434, + "learning_rate": 3.543254836752371e-05, + "loss": 0.9736, + "step": 6785 + }, + { + "epoch": 0.6063394911430294, + "grad_norm": 0.44751229882240295, + "learning_rate": 3.541870604655176e-05, + "loss": 1.0575, + "step": 6786 + }, + { + "epoch": 0.6064288426742913, + "grad_norm": 0.534199595451355, + "learning_rate": 3.5404864946918595e-05, + "loss": 0.8935, + "step": 6787 + }, + { + "epoch": 0.6065181942055532, + "grad_norm": 0.41165226697921753, + "learning_rate": 3.539102506978356e-05, + "loss": 0.9994, + "step": 6788 + }, + { + "epoch": 0.6066075457368151, + "grad_norm": 0.5048467516899109, + "learning_rate": 3.5377186416305884e-05, + "loss": 0.9057, + "step": 6789 + }, + { + "epoch": 0.606696897268077, + "grad_norm": 0.44976770877838135, + "learning_rate": 3.5363348987644725e-05, + "loss": 0.9591, + "step": 6790 + }, + { + "epoch": 0.6067862487993388, + "grad_norm": 0.4342833161354065, + "learning_rate": 3.534951278495909e-05, + "loss": 0.945, + "step": 6791 + }, + { + "epoch": 0.6068756003306006, + "grad_norm": 0.4819537401199341, + "learning_rate": 3.533567780940794e-05, + "loss": 0.9135, + "step": 6792 + }, + { + "epoch": 0.6069649518618625, + "grad_norm": 0.4745539426803589, + "learning_rate": 3.532184406215008e-05, + "loss": 0.9931, + "step": 6793 + }, + { + "epoch": 0.6070543033931244, + "grad_norm": 0.4145744740962982, + "learning_rate": 3.5308011544344224e-05, + "loss": 1.0892, + "step": 6794 + }, + { + "epoch": 0.6071436549243863, + "grad_norm": 0.5059308409690857, + "learning_rate": 3.529418025714902e-05, + "loss": 0.9136, + "step": 6795 + }, + { + "epoch": 0.6072330064556481, + "grad_norm": 0.4676026403903961, + "learning_rate": 3.528035020172299e-05, + "loss": 0.9068, + "step": 6796 + }, + { + "epoch": 0.60732235798691, + "grad_norm": 0.4627551734447479, + "learning_rate": 3.526652137922451e-05, + "loss": 0.9546, + "step": 6797 + }, + { + "epoch": 0.6074117095181719, + "grad_norm": 0.5230789184570312, + "learning_rate": 3.525269379081191e-05, + "loss": 0.959, + "step": 6798 + }, + { + "epoch": 0.6075010610494337, + "grad_norm": 0.46442854404449463, + "learning_rate": 3.52388674376434e-05, + "loss": 0.9959, + "step": 6799 + }, + { + "epoch": 0.6075904125806956, + "grad_norm": 0.4610006511211395, + "learning_rate": 3.52250423208771e-05, + "loss": 0.8957, + "step": 6800 + }, + { + "epoch": 0.6076797641119575, + "grad_norm": 0.48097458481788635, + "learning_rate": 3.521121844167098e-05, + "loss": 0.9721, + "step": 6801 + }, + { + "epoch": 0.6077691156432193, + "grad_norm": 0.539371132850647, + "learning_rate": 3.5197395801182955e-05, + "loss": 0.8974, + "step": 6802 + }, + { + "epoch": 0.6078584671744812, + "grad_norm": 0.4365449845790863, + "learning_rate": 3.518357440057081e-05, + "loss": 0.9525, + "step": 6803 + }, + { + "epoch": 0.6079478187057431, + "grad_norm": 0.4736482501029968, + "learning_rate": 3.516975424099225e-05, + "loss": 0.8908, + "step": 6804 + }, + { + "epoch": 0.608037170237005, + "grad_norm": 0.4505004286766052, + "learning_rate": 3.515593532360484e-05, + "loss": 0.9166, + "step": 6805 + }, + { + "epoch": 0.6081265217682668, + "grad_norm": 0.5180654525756836, + "learning_rate": 3.5142117649566104e-05, + "loss": 0.9138, + "step": 6806 + }, + { + "epoch": 0.6082158732995286, + "grad_norm": 0.3982846140861511, + "learning_rate": 3.5128301220033366e-05, + "loss": 0.9941, + "step": 6807 + }, + { + "epoch": 0.6083052248307905, + "grad_norm": 0.4852418005466461, + "learning_rate": 3.511448603616392e-05, + "loss": 0.9798, + "step": 6808 + }, + { + "epoch": 0.6083945763620524, + "grad_norm": 0.4647953510284424, + "learning_rate": 3.510067209911493e-05, + "loss": 0.8897, + "step": 6809 + }, + { + "epoch": 0.6084839278933143, + "grad_norm": 0.4955390691757202, + "learning_rate": 3.508685941004348e-05, + "loss": 0.9398, + "step": 6810 + }, + { + "epoch": 0.6085732794245762, + "grad_norm": 0.45322465896606445, + "learning_rate": 3.507304797010651e-05, + "loss": 0.9526, + "step": 6811 + }, + { + "epoch": 0.6086626309558381, + "grad_norm": 0.47306427359580994, + "learning_rate": 3.505923778046088e-05, + "loss": 0.9454, + "step": 6812 + }, + { + "epoch": 0.6087519824870998, + "grad_norm": 0.5509787201881409, + "learning_rate": 3.5045428842263344e-05, + "loss": 0.87, + "step": 6813 + }, + { + "epoch": 0.6088413340183617, + "grad_norm": 0.4736871123313904, + "learning_rate": 3.503162115667056e-05, + "loss": 0.9705, + "step": 6814 + }, + { + "epoch": 0.6089306855496236, + "grad_norm": 0.6895001530647278, + "learning_rate": 3.5017814724839046e-05, + "loss": 0.9558, + "step": 6815 + }, + { + "epoch": 0.6090200370808855, + "grad_norm": 0.47932013869285583, + "learning_rate": 3.5004009547925255e-05, + "loss": 0.9453, + "step": 6816 + }, + { + "epoch": 0.6091093886121474, + "grad_norm": 0.49553731083869934, + "learning_rate": 3.4990205627085504e-05, + "loss": 0.9632, + "step": 6817 + }, + { + "epoch": 0.6091987401434092, + "grad_norm": 0.49811407923698425, + "learning_rate": 3.497640296347603e-05, + "loss": 0.898, + "step": 6818 + }, + { + "epoch": 0.6092880916746711, + "grad_norm": 0.42561087012290955, + "learning_rate": 3.496260155825294e-05, + "loss": 0.9824, + "step": 6819 + }, + { + "epoch": 0.6093774432059329, + "grad_norm": 0.4683232009410858, + "learning_rate": 3.4948801412572255e-05, + "loss": 0.9188, + "step": 6820 + }, + { + "epoch": 0.6094667947371948, + "grad_norm": 0.4638214409351349, + "learning_rate": 3.493500252758989e-05, + "loss": 0.9847, + "step": 6821 + }, + { + "epoch": 0.6095561462684567, + "grad_norm": 0.47168654203414917, + "learning_rate": 3.492120490446164e-05, + "loss": 0.9237, + "step": 6822 + }, + { + "epoch": 0.6096454977997185, + "grad_norm": 0.5087332725524902, + "learning_rate": 3.490740854434321e-05, + "loss": 0.9413, + "step": 6823 + }, + { + "epoch": 0.6097348493309804, + "grad_norm": 0.5125972628593445, + "learning_rate": 3.489361344839018e-05, + "loss": 1.0025, + "step": 6824 + }, + { + "epoch": 0.6098242008622423, + "grad_norm": 0.4349963068962097, + "learning_rate": 3.487981961775806e-05, + "loss": 0.9474, + "step": 6825 + }, + { + "epoch": 0.6099135523935042, + "grad_norm": 0.5097174644470215, + "learning_rate": 3.486602705360223e-05, + "loss": 0.9926, + "step": 6826 + }, + { + "epoch": 0.610002903924766, + "grad_norm": 0.43980687856674194, + "learning_rate": 3.4852235757077934e-05, + "loss": 0.9322, + "step": 6827 + }, + { + "epoch": 0.6100922554560279, + "grad_norm": 0.48493608832359314, + "learning_rate": 3.483844572934036e-05, + "loss": 0.992, + "step": 6828 + }, + { + "epoch": 0.6101816069872897, + "grad_norm": 0.43996307253837585, + "learning_rate": 3.482465697154456e-05, + "loss": 0.9379, + "step": 6829 + }, + { + "epoch": 0.6102709585185516, + "grad_norm": 0.5646868944168091, + "learning_rate": 3.48108694848455e-05, + "loss": 0.9061, + "step": 6830 + }, + { + "epoch": 0.6103603100498135, + "grad_norm": 0.4832056164741516, + "learning_rate": 3.479708327039802e-05, + "loss": 0.9026, + "step": 6831 + }, + { + "epoch": 0.6104496615810754, + "grad_norm": 0.5810287594795227, + "learning_rate": 3.478329832935687e-05, + "loss": 0.825, + "step": 6832 + }, + { + "epoch": 0.6105390131123372, + "grad_norm": 0.5271215438842773, + "learning_rate": 3.47695146628767e-05, + "loss": 0.9671, + "step": 6833 + }, + { + "epoch": 0.610628364643599, + "grad_norm": 0.461401104927063, + "learning_rate": 3.475573227211201e-05, + "loss": 0.9199, + "step": 6834 + }, + { + "epoch": 0.6107177161748609, + "grad_norm": 0.46049854159355164, + "learning_rate": 3.474195115821723e-05, + "loss": 0.9179, + "step": 6835 + }, + { + "epoch": 0.6108070677061228, + "grad_norm": 0.521156370639801, + "learning_rate": 3.4728171322346694e-05, + "loss": 0.8643, + "step": 6836 + }, + { + "epoch": 0.6108964192373847, + "grad_norm": 0.4350111782550812, + "learning_rate": 3.471439276565459e-05, + "loss": 0.9443, + "step": 6837 + }, + { + "epoch": 0.6109857707686466, + "grad_norm": 0.4543260931968689, + "learning_rate": 3.470061548929502e-05, + "loss": 0.9858, + "step": 6838 + }, + { + "epoch": 0.6110751222999085, + "grad_norm": 0.43811121582984924, + "learning_rate": 3.4686839494421976e-05, + "loss": 0.9243, + "step": 6839 + }, + { + "epoch": 0.6111644738311702, + "grad_norm": 0.45619046688079834, + "learning_rate": 3.4673064782189356e-05, + "loss": 0.9676, + "step": 6840 + }, + { + "epoch": 0.6112538253624321, + "grad_norm": 0.39327651262283325, + "learning_rate": 3.4659291353750934e-05, + "loss": 0.9336, + "step": 6841 + }, + { + "epoch": 0.611343176893694, + "grad_norm": 0.5329780578613281, + "learning_rate": 3.464551921026038e-05, + "loss": 0.9395, + "step": 6842 + }, + { + "epoch": 0.6114325284249559, + "grad_norm": 0.5322310924530029, + "learning_rate": 3.463174835287125e-05, + "loss": 0.9203, + "step": 6843 + }, + { + "epoch": 0.6115218799562178, + "grad_norm": 0.5095454454421997, + "learning_rate": 3.461797878273703e-05, + "loss": 0.9522, + "step": 6844 + }, + { + "epoch": 0.6116112314874796, + "grad_norm": 0.5680137276649475, + "learning_rate": 3.460421050101103e-05, + "loss": 0.9741, + "step": 6845 + }, + { + "epoch": 0.6117005830187415, + "grad_norm": 0.4388297498226166, + "learning_rate": 3.4590443508846536e-05, + "loss": 0.9311, + "step": 6846 + }, + { + "epoch": 0.6117899345500033, + "grad_norm": 0.3977706730365753, + "learning_rate": 3.457667780739663e-05, + "loss": 0.9636, + "step": 6847 + }, + { + "epoch": 0.6118792860812652, + "grad_norm": 0.42693883180618286, + "learning_rate": 3.456291339781435e-05, + "loss": 0.8826, + "step": 6848 + }, + { + "epoch": 0.6119686376125271, + "grad_norm": 0.4118557870388031, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.9483, + "step": 6849 + }, + { + "epoch": 0.612057989143789, + "grad_norm": 0.5658688545227051, + "learning_rate": 3.453538845886427e-05, + "loss": 0.909, + "step": 6850 + }, + { + "epoch": 0.6121473406750508, + "grad_norm": 0.48689889907836914, + "learning_rate": 3.452162793180198e-05, + "loss": 1.0, + "step": 6851 + }, + { + "epoch": 0.6122366922063127, + "grad_norm": 0.5138941407203674, + "learning_rate": 3.4507868701218314e-05, + "loss": 1.0012, + "step": 6852 + }, + { + "epoch": 0.6123260437375746, + "grad_norm": 0.5919985771179199, + "learning_rate": 3.4494110768265795e-05, + "loss": 0.9056, + "step": 6853 + }, + { + "epoch": 0.6124153952688364, + "grad_norm": 0.5183218121528625, + "learning_rate": 3.448035413409677e-05, + "loss": 0.9247, + "step": 6854 + }, + { + "epoch": 0.6125047468000983, + "grad_norm": 0.455565869808197, + "learning_rate": 3.446659879986351e-05, + "loss": 1.024, + "step": 6855 + }, + { + "epoch": 0.6125940983313601, + "grad_norm": 0.5125479698181152, + "learning_rate": 3.445284476671818e-05, + "loss": 1.0608, + "step": 6856 + }, + { + "epoch": 0.612683449862622, + "grad_norm": 0.39517349004745483, + "learning_rate": 3.443909203581285e-05, + "loss": 0.9636, + "step": 6857 + }, + { + "epoch": 0.6127728013938839, + "grad_norm": 0.5921311974525452, + "learning_rate": 3.44253406082994e-05, + "loss": 0.9022, + "step": 6858 + }, + { + "epoch": 0.6128621529251458, + "grad_norm": 0.5381450057029724, + "learning_rate": 3.441159048532969e-05, + "loss": 0.9532, + "step": 6859 + }, + { + "epoch": 0.6129515044564077, + "grad_norm": 0.4705429971218109, + "learning_rate": 3.439784166805544e-05, + "loss": 0.9147, + "step": 6860 + }, + { + "epoch": 0.6130408559876694, + "grad_norm": 0.4696353077888489, + "learning_rate": 3.438409415762825e-05, + "loss": 0.9437, + "step": 6861 + }, + { + "epoch": 0.6131302075189313, + "grad_norm": 0.5168521404266357, + "learning_rate": 3.437034795519963e-05, + "loss": 0.9752, + "step": 6862 + }, + { + "epoch": 0.6132195590501932, + "grad_norm": 0.5304763317108154, + "learning_rate": 3.435660306192098e-05, + "loss": 0.9091, + "step": 6863 + }, + { + "epoch": 0.6133089105814551, + "grad_norm": 0.460254043340683, + "learning_rate": 3.434285947894356e-05, + "loss": 0.9877, + "step": 6864 + }, + { + "epoch": 0.613398262112717, + "grad_norm": 0.5005228519439697, + "learning_rate": 3.432911720741855e-05, + "loss": 0.9853, + "step": 6865 + }, + { + "epoch": 0.6134876136439789, + "grad_norm": 0.4847668409347534, + "learning_rate": 3.4315376248497025e-05, + "loss": 0.9965, + "step": 6866 + }, + { + "epoch": 0.6135769651752407, + "grad_norm": 0.4120919704437256, + "learning_rate": 3.430163660332993e-05, + "loss": 0.9123, + "step": 6867 + }, + { + "epoch": 0.6136663167065025, + "grad_norm": 0.5527864694595337, + "learning_rate": 3.428789827306809e-05, + "loss": 0.9233, + "step": 6868 + }, + { + "epoch": 0.6137556682377644, + "grad_norm": 0.5144837498664856, + "learning_rate": 3.4274161258862245e-05, + "loss": 0.9465, + "step": 6869 + }, + { + "epoch": 0.6138450197690263, + "grad_norm": 0.4069208800792694, + "learning_rate": 3.426042556186303e-05, + "loss": 0.9784, + "step": 6870 + }, + { + "epoch": 0.6139343713002882, + "grad_norm": 0.47290289402008057, + "learning_rate": 3.424669118322094e-05, + "loss": 0.9245, + "step": 6871 + }, + { + "epoch": 0.61402372283155, + "grad_norm": 0.5234418511390686, + "learning_rate": 3.423295812408638e-05, + "loss": 0.9324, + "step": 6872 + }, + { + "epoch": 0.6141130743628119, + "grad_norm": 0.42255857586860657, + "learning_rate": 3.421922638560964e-05, + "loss": 0.9565, + "step": 6873 + }, + { + "epoch": 0.6142024258940738, + "grad_norm": 0.4056835472583771, + "learning_rate": 3.4205495968940907e-05, + "loss": 0.983, + "step": 6874 + }, + { + "epoch": 0.6142917774253356, + "grad_norm": 0.42643529176712036, + "learning_rate": 3.4191766875230234e-05, + "loss": 0.906, + "step": 6875 + }, + { + "epoch": 0.6143811289565975, + "grad_norm": 0.42716148495674133, + "learning_rate": 3.4178039105627594e-05, + "loss": 1.0189, + "step": 6876 + }, + { + "epoch": 0.6144704804878593, + "grad_norm": 0.4740234315395355, + "learning_rate": 3.416431266128286e-05, + "loss": 0.9249, + "step": 6877 + }, + { + "epoch": 0.6145598320191212, + "grad_norm": 0.41565650701522827, + "learning_rate": 3.4150587543345705e-05, + "loss": 0.9317, + "step": 6878 + }, + { + "epoch": 0.6146491835503831, + "grad_norm": 0.46336111426353455, + "learning_rate": 3.413686375296578e-05, + "loss": 0.9369, + "step": 6879 + }, + { + "epoch": 0.614738535081645, + "grad_norm": 0.4771583676338196, + "learning_rate": 3.4123141291292615e-05, + "loss": 0.9388, + "step": 6880 + }, + { + "epoch": 0.6148278866129069, + "grad_norm": 0.4211154878139496, + "learning_rate": 3.41094201594756e-05, + "loss": 0.8495, + "step": 6881 + }, + { + "epoch": 0.6149172381441687, + "grad_norm": 0.4368746280670166, + "learning_rate": 3.4095700358664026e-05, + "loss": 0.9002, + "step": 6882 + }, + { + "epoch": 0.6150065896754305, + "grad_norm": 0.4566292464733124, + "learning_rate": 3.4081981890007074e-05, + "loss": 0.9938, + "step": 6883 + }, + { + "epoch": 0.6150959412066924, + "grad_norm": 0.43355435132980347, + "learning_rate": 3.406826475465382e-05, + "loss": 0.9271, + "step": 6884 + }, + { + "epoch": 0.6151852927379543, + "grad_norm": 0.502873420715332, + "learning_rate": 3.4054548953753205e-05, + "loss": 0.9655, + "step": 6885 + }, + { + "epoch": 0.6152746442692162, + "grad_norm": 0.5166613459587097, + "learning_rate": 3.4040834488454086e-05, + "loss": 0.8589, + "step": 6886 + }, + { + "epoch": 0.6153639958004781, + "grad_norm": 0.47696536779403687, + "learning_rate": 3.40271213599052e-05, + "loss": 0.8985, + "step": 6887 + }, + { + "epoch": 0.61545334733174, + "grad_norm": 0.47705844044685364, + "learning_rate": 3.401340956925515e-05, + "loss": 0.9217, + "step": 6888 + }, + { + "epoch": 0.6155426988630017, + "grad_norm": 0.5489514470100403, + "learning_rate": 3.399969911765246e-05, + "loss": 0.9599, + "step": 6889 + }, + { + "epoch": 0.6156320503942636, + "grad_norm": 0.47389769554138184, + "learning_rate": 3.398599000624551e-05, + "loss": 0.9344, + "step": 6890 + }, + { + "epoch": 0.6157214019255255, + "grad_norm": 0.41707125306129456, + "learning_rate": 3.39722822361826e-05, + "loss": 1.0065, + "step": 6891 + }, + { + "epoch": 0.6158107534567874, + "grad_norm": 0.4215580224990845, + "learning_rate": 3.395857580861188e-05, + "loss": 0.9582, + "step": 6892 + }, + { + "epoch": 0.6159001049880493, + "grad_norm": 0.5935801863670349, + "learning_rate": 3.394487072468144e-05, + "loss": 0.8802, + "step": 6893 + }, + { + "epoch": 0.6159894565193111, + "grad_norm": 0.40050363540649414, + "learning_rate": 3.393116698553921e-05, + "loss": 0.9529, + "step": 6894 + }, + { + "epoch": 0.6160788080505729, + "grad_norm": 0.41118553280830383, + "learning_rate": 3.391746459233302e-05, + "loss": 0.9639, + "step": 6895 + }, + { + "epoch": 0.6161681595818348, + "grad_norm": 0.47675037384033203, + "learning_rate": 3.39037635462106e-05, + "loss": 0.9067, + "step": 6896 + }, + { + "epoch": 0.6162575111130967, + "grad_norm": 0.5176188945770264, + "learning_rate": 3.3890063848319585e-05, + "loss": 0.8955, + "step": 6897 + }, + { + "epoch": 0.6163468626443586, + "grad_norm": 0.4446241557598114, + "learning_rate": 3.3876365499807414e-05, + "loss": 0.9455, + "step": 6898 + }, + { + "epoch": 0.6164362141756204, + "grad_norm": 0.42455655336380005, + "learning_rate": 3.38626685018215e-05, + "loss": 0.8833, + "step": 6899 + }, + { + "epoch": 0.6165255657068823, + "grad_norm": 0.4535345733165741, + "learning_rate": 3.384897285550911e-05, + "loss": 0.9354, + "step": 6900 + }, + { + "epoch": 0.6166149172381442, + "grad_norm": 0.3765280544757843, + "learning_rate": 3.38352785620174e-05, + "loss": 0.9248, + "step": 6901 + }, + { + "epoch": 0.616704268769406, + "grad_norm": 0.42387163639068604, + "learning_rate": 3.382158562249342e-05, + "loss": 0.945, + "step": 6902 + }, + { + "epoch": 0.6167936203006679, + "grad_norm": 0.4898373484611511, + "learning_rate": 3.38078940380841e-05, + "loss": 0.9428, + "step": 6903 + }, + { + "epoch": 0.6168829718319297, + "grad_norm": 0.5467028021812439, + "learning_rate": 3.3794203809936235e-05, + "loss": 0.891, + "step": 6904 + }, + { + "epoch": 0.6169723233631916, + "grad_norm": 0.4893369972705841, + "learning_rate": 3.3780514939196554e-05, + "loss": 0.9611, + "step": 6905 + }, + { + "epoch": 0.6170616748944535, + "grad_norm": 0.6234750747680664, + "learning_rate": 3.376682742701161e-05, + "loss": 0.946, + "step": 6906 + }, + { + "epoch": 0.6171510264257154, + "grad_norm": 0.49757829308509827, + "learning_rate": 3.375314127452795e-05, + "loss": 0.8848, + "step": 6907 + }, + { + "epoch": 0.6172403779569773, + "grad_norm": 0.49189379811286926, + "learning_rate": 3.3739456482891854e-05, + "loss": 0.901, + "step": 6908 + }, + { + "epoch": 0.617329729488239, + "grad_norm": 0.49801400303840637, + "learning_rate": 3.37257730532496e-05, + "loss": 1.0374, + "step": 6909 + }, + { + "epoch": 0.6174190810195009, + "grad_norm": 0.5089427828788757, + "learning_rate": 3.371209098674734e-05, + "loss": 0.9399, + "step": 6910 + }, + { + "epoch": 0.6175084325507628, + "grad_norm": 0.43401315808296204, + "learning_rate": 3.3698410284531055e-05, + "loss": 0.9137, + "step": 6911 + }, + { + "epoch": 0.6175977840820247, + "grad_norm": 0.43453726172447205, + "learning_rate": 3.368473094774668e-05, + "loss": 0.9569, + "step": 6912 + }, + { + "epoch": 0.6176871356132866, + "grad_norm": 0.5029697418212891, + "learning_rate": 3.3671052977539995e-05, + "loss": 0.9513, + "step": 6913 + }, + { + "epoch": 0.6177764871445485, + "grad_norm": 0.3645218312740326, + "learning_rate": 3.3657376375056684e-05, + "loss": 1.0016, + "step": 6914 + }, + { + "epoch": 0.6178658386758104, + "grad_norm": 0.5805116891860962, + "learning_rate": 3.36437011414423e-05, + "loss": 1.0221, + "step": 6915 + }, + { + "epoch": 0.6179551902070721, + "grad_norm": 0.4427826702594757, + "learning_rate": 3.363002727784228e-05, + "loss": 0.9114, + "step": 6916 + }, + { + "epoch": 0.618044541738334, + "grad_norm": 0.41726526618003845, + "learning_rate": 3.3616354785401996e-05, + "loss": 1.0393, + "step": 6917 + }, + { + "epoch": 0.6181338932695959, + "grad_norm": 0.6137450337409973, + "learning_rate": 3.360268366526662e-05, + "loss": 0.9014, + "step": 6918 + }, + { + "epoch": 0.6182232448008578, + "grad_norm": 0.4480533301830292, + "learning_rate": 3.358901391858126e-05, + "loss": 0.9186, + "step": 6919 + }, + { + "epoch": 0.6183125963321197, + "grad_norm": 0.4977112114429474, + "learning_rate": 3.357534554649092e-05, + "loss": 0.8681, + "step": 6920 + }, + { + "epoch": 0.6184019478633815, + "grad_norm": 0.44784173369407654, + "learning_rate": 3.3561678550140466e-05, + "loss": 0.9717, + "step": 6921 + }, + { + "epoch": 0.6184912993946434, + "grad_norm": 0.5142949223518372, + "learning_rate": 3.3548012930674656e-05, + "loss": 0.924, + "step": 6922 + }, + { + "epoch": 0.6185806509259052, + "grad_norm": 0.4716286361217499, + "learning_rate": 3.3534348689238115e-05, + "loss": 1.0101, + "step": 6923 + }, + { + "epoch": 0.6186700024571671, + "grad_norm": 0.4497227668762207, + "learning_rate": 3.352068582697539e-05, + "loss": 1.0712, + "step": 6924 + }, + { + "epoch": 0.618759353988429, + "grad_norm": 0.4125528633594513, + "learning_rate": 3.3507024345030884e-05, + "loss": 0.9306, + "step": 6925 + }, + { + "epoch": 0.6188487055196908, + "grad_norm": 0.47352614998817444, + "learning_rate": 3.349336424454889e-05, + "loss": 0.9388, + "step": 6926 + }, + { + "epoch": 0.6189380570509527, + "grad_norm": 0.4631035029888153, + "learning_rate": 3.347970552667361e-05, + "loss": 0.9669, + "step": 6927 + }, + { + "epoch": 0.6190274085822146, + "grad_norm": 0.42280247807502747, + "learning_rate": 3.346604819254907e-05, + "loss": 0.9841, + "step": 6928 + }, + { + "epoch": 0.6191167601134765, + "grad_norm": 0.4958139955997467, + "learning_rate": 3.3452392243319216e-05, + "loss": 1.0159, + "step": 6929 + }, + { + "epoch": 0.6192061116447383, + "grad_norm": 0.4598698019981384, + "learning_rate": 3.343873768012792e-05, + "loss": 0.9439, + "step": 6930 + }, + { + "epoch": 0.6192954631760001, + "grad_norm": 0.4202096462249756, + "learning_rate": 3.342508450411886e-05, + "loss": 1.0049, + "step": 6931 + }, + { + "epoch": 0.619384814707262, + "grad_norm": 0.43892961740493774, + "learning_rate": 3.341143271643565e-05, + "loss": 0.8916, + "step": 6932 + }, + { + "epoch": 0.6194741662385239, + "grad_norm": 0.4676835834980011, + "learning_rate": 3.339778231822177e-05, + "loss": 0.9904, + "step": 6933 + }, + { + "epoch": 0.6195635177697858, + "grad_norm": 0.4813263416290283, + "learning_rate": 3.338413331062059e-05, + "loss": 0.91, + "step": 6934 + }, + { + "epoch": 0.6196528693010477, + "grad_norm": 0.442055344581604, + "learning_rate": 3.3370485694775354e-05, + "loss": 0.9984, + "step": 6935 + }, + { + "epoch": 0.6197422208323096, + "grad_norm": 0.5169895887374878, + "learning_rate": 3.335683947182921e-05, + "loss": 0.9339, + "step": 6936 + }, + { + "epoch": 0.6198315723635713, + "grad_norm": 0.4888537526130676, + "learning_rate": 3.3343194642925166e-05, + "loss": 0.9264, + "step": 6937 + }, + { + "epoch": 0.6199209238948332, + "grad_norm": 0.5011920928955078, + "learning_rate": 3.332955120920612e-05, + "loss": 0.9406, + "step": 6938 + }, + { + "epoch": 0.6200102754260951, + "grad_norm": 0.4616535007953644, + "learning_rate": 3.331590917181484e-05, + "loss": 0.945, + "step": 6939 + }, + { + "epoch": 0.620099626957357, + "grad_norm": 0.5329020619392395, + "learning_rate": 3.330226853189402e-05, + "loss": 0.8971, + "step": 6940 + }, + { + "epoch": 0.6201889784886189, + "grad_norm": 0.4462106227874756, + "learning_rate": 3.328862929058619e-05, + "loss": 0.9299, + "step": 6941 + }, + { + "epoch": 0.6202783300198808, + "grad_norm": 0.6361902952194214, + "learning_rate": 3.327499144903378e-05, + "loss": 0.9665, + "step": 6942 + }, + { + "epoch": 0.6203676815511426, + "grad_norm": 0.4172038733959198, + "learning_rate": 3.326135500837911e-05, + "loss": 0.9784, + "step": 6943 + }, + { + "epoch": 0.6204570330824044, + "grad_norm": 0.4322963058948517, + "learning_rate": 3.324771996976439e-05, + "loss": 0.9376, + "step": 6944 + }, + { + "epoch": 0.6205463846136663, + "grad_norm": 0.4417525827884674, + "learning_rate": 3.323408633433168e-05, + "loss": 0.923, + "step": 6945 + }, + { + "epoch": 0.6206357361449282, + "grad_norm": 0.5419497489929199, + "learning_rate": 3.322045410322296e-05, + "loss": 0.7987, + "step": 6946 + }, + { + "epoch": 0.6207250876761901, + "grad_norm": 0.5265266299247742, + "learning_rate": 3.3206823277580054e-05, + "loss": 0.9383, + "step": 6947 + }, + { + "epoch": 0.6208144392074519, + "grad_norm": 0.3826887905597687, + "learning_rate": 3.3193193858544735e-05, + "loss": 1.0365, + "step": 6948 + }, + { + "epoch": 0.6209037907387138, + "grad_norm": 0.44361376762390137, + "learning_rate": 3.317956584725855e-05, + "loss": 0.9833, + "step": 6949 + }, + { + "epoch": 0.6209931422699757, + "grad_norm": 0.4837126135826111, + "learning_rate": 3.316593924486302e-05, + "loss": 0.8984, + "step": 6950 + }, + { + "epoch": 0.6210824938012375, + "grad_norm": 0.4288838505744934, + "learning_rate": 3.315231405249951e-05, + "loss": 0.9326, + "step": 6951 + }, + { + "epoch": 0.6211718453324994, + "grad_norm": 0.5369590520858765, + "learning_rate": 3.313869027130929e-05, + "loss": 0.9524, + "step": 6952 + }, + { + "epoch": 0.6212611968637612, + "grad_norm": 0.5428600907325745, + "learning_rate": 3.3125067902433485e-05, + "loss": 0.9537, + "step": 6953 + }, + { + "epoch": 0.6213505483950231, + "grad_norm": 0.43962809443473816, + "learning_rate": 3.311144694701313e-05, + "loss": 0.9584, + "step": 6954 + }, + { + "epoch": 0.621439899926285, + "grad_norm": 0.40536513924598694, + "learning_rate": 3.3097827406189094e-05, + "loss": 0.96, + "step": 6955 + }, + { + "epoch": 0.6215292514575469, + "grad_norm": 0.4061243236064911, + "learning_rate": 3.3084209281102184e-05, + "loss": 0.9844, + "step": 6956 + }, + { + "epoch": 0.6216186029888087, + "grad_norm": 0.44544127583503723, + "learning_rate": 3.307059257289306e-05, + "loss": 0.9206, + "step": 6957 + }, + { + "epoch": 0.6217079545200705, + "grad_norm": 0.4827932119369507, + "learning_rate": 3.305697728270226e-05, + "loss": 0.9254, + "step": 6958 + }, + { + "epoch": 0.6217973060513324, + "grad_norm": 0.5052177906036377, + "learning_rate": 3.3043363411670225e-05, + "loss": 0.9132, + "step": 6959 + }, + { + "epoch": 0.6218866575825943, + "grad_norm": 0.5290608406066895, + "learning_rate": 3.302975096093723e-05, + "loss": 0.9137, + "step": 6960 + }, + { + "epoch": 0.6219760091138562, + "grad_norm": 0.6665788292884827, + "learning_rate": 3.3016139931643486e-05, + "loss": 0.8932, + "step": 6961 + }, + { + "epoch": 0.6220653606451181, + "grad_norm": 0.44664615392684937, + "learning_rate": 3.300253032492906e-05, + "loss": 0.9808, + "step": 6962 + }, + { + "epoch": 0.62215471217638, + "grad_norm": 0.44915464520454407, + "learning_rate": 3.29889221419339e-05, + "loss": 0.9504, + "step": 6963 + }, + { + "epoch": 0.6222440637076417, + "grad_norm": 0.5096574425697327, + "learning_rate": 3.297531538379782e-05, + "loss": 0.9388, + "step": 6964 + }, + { + "epoch": 0.6223334152389036, + "grad_norm": 0.5032405853271484, + "learning_rate": 3.296171005166057e-05, + "loss": 0.8991, + "step": 6965 + }, + { + "epoch": 0.6224227667701655, + "grad_norm": 0.43075719475746155, + "learning_rate": 3.29481061466617e-05, + "loss": 0.9858, + "step": 6966 + }, + { + "epoch": 0.6225121183014274, + "grad_norm": 0.5037357211112976, + "learning_rate": 3.293450366994071e-05, + "loss": 0.8802, + "step": 6967 + }, + { + "epoch": 0.6226014698326893, + "grad_norm": 0.3918449878692627, + "learning_rate": 3.292090262263696e-05, + "loss": 1.063, + "step": 6968 + }, + { + "epoch": 0.6226908213639512, + "grad_norm": 0.5222644209861755, + "learning_rate": 3.290730300588965e-05, + "loss": 0.8485, + "step": 6969 + }, + { + "epoch": 0.622780172895213, + "grad_norm": 0.46805500984191895, + "learning_rate": 3.28937048208379e-05, + "loss": 0.9382, + "step": 6970 + }, + { + "epoch": 0.6228695244264748, + "grad_norm": 0.5328028798103333, + "learning_rate": 3.288010806862071e-05, + "loss": 0.8772, + "step": 6971 + }, + { + "epoch": 0.6229588759577367, + "grad_norm": 0.45489615201950073, + "learning_rate": 3.286651275037697e-05, + "loss": 0.9407, + "step": 6972 + }, + { + "epoch": 0.6230482274889986, + "grad_norm": 0.4729410409927368, + "learning_rate": 3.285291886724541e-05, + "loss": 0.9488, + "step": 6973 + }, + { + "epoch": 0.6231375790202605, + "grad_norm": 0.5226532816886902, + "learning_rate": 3.2839326420364664e-05, + "loss": 0.9016, + "step": 6974 + }, + { + "epoch": 0.6232269305515223, + "grad_norm": 0.5036323070526123, + "learning_rate": 3.282573541087325e-05, + "loss": 0.89, + "step": 6975 + }, + { + "epoch": 0.6233162820827842, + "grad_norm": 0.42593827843666077, + "learning_rate": 3.281214583990956e-05, + "loss": 1.067, + "step": 6976 + }, + { + "epoch": 0.6234056336140461, + "grad_norm": 0.46937912702560425, + "learning_rate": 3.2798557708611864e-05, + "loss": 0.9958, + "step": 6977 + }, + { + "epoch": 0.6234949851453079, + "grad_norm": 0.4938536286354065, + "learning_rate": 3.2784971018118346e-05, + "loss": 0.9676, + "step": 6978 + }, + { + "epoch": 0.6235843366765698, + "grad_norm": 0.5184669494628906, + "learning_rate": 3.2771385769566975e-05, + "loss": 0.9144, + "step": 6979 + }, + { + "epoch": 0.6236736882078316, + "grad_norm": 0.46924924850463867, + "learning_rate": 3.275780196409569e-05, + "loss": 0.9918, + "step": 6980 + }, + { + "epoch": 0.6237630397390935, + "grad_norm": 0.4476311504840851, + "learning_rate": 3.2744219602842276e-05, + "loss": 0.9499, + "step": 6981 + }, + { + "epoch": 0.6238523912703554, + "grad_norm": 0.4355534315109253, + "learning_rate": 3.27306386869444e-05, + "loss": 0.912, + "step": 6982 + }, + { + "epoch": 0.6239417428016173, + "grad_norm": 0.532875120639801, + "learning_rate": 3.271705921753962e-05, + "loss": 0.9276, + "step": 6983 + }, + { + "epoch": 0.6240310943328792, + "grad_norm": 0.4715759754180908, + "learning_rate": 3.270348119576536e-05, + "loss": 0.9415, + "step": 6984 + }, + { + "epoch": 0.624120445864141, + "grad_norm": 0.5215761065483093, + "learning_rate": 3.26899046227589e-05, + "loss": 0.9365, + "step": 6985 + }, + { + "epoch": 0.6242097973954028, + "grad_norm": 0.5269849896430969, + "learning_rate": 3.2676329499657455e-05, + "loss": 0.8021, + "step": 6986 + }, + { + "epoch": 0.6242991489266647, + "grad_norm": 0.43522122502326965, + "learning_rate": 3.266275582759808e-05, + "loss": 0.9598, + "step": 6987 + }, + { + "epoch": 0.6243885004579266, + "grad_norm": 0.45015889406204224, + "learning_rate": 3.2649183607717706e-05, + "loss": 0.9508, + "step": 6988 + }, + { + "epoch": 0.6244778519891885, + "grad_norm": 0.4154300093650818, + "learning_rate": 3.263561284115313e-05, + "loss": 0.9569, + "step": 6989 + }, + { + "epoch": 0.6245672035204504, + "grad_norm": 0.39608290791511536, + "learning_rate": 3.262204352904108e-05, + "loss": 0.9678, + "step": 6990 + }, + { + "epoch": 0.6246565550517122, + "grad_norm": 0.4975006878376007, + "learning_rate": 3.2608475672518115e-05, + "loss": 0.9861, + "step": 6991 + }, + { + "epoch": 0.624745906582974, + "grad_norm": 0.507334291934967, + "learning_rate": 3.259490927272071e-05, + "loss": 0.8913, + "step": 6992 + }, + { + "epoch": 0.6248352581142359, + "grad_norm": 0.46734029054641724, + "learning_rate": 3.2581344330785156e-05, + "loss": 0.9106, + "step": 6993 + }, + { + "epoch": 0.6249246096454978, + "grad_norm": 0.45459601283073425, + "learning_rate": 3.2567780847847693e-05, + "loss": 0.9296, + "step": 6994 + }, + { + "epoch": 0.6250139611767597, + "grad_norm": 0.45823296904563904, + "learning_rate": 3.25542188250444e-05, + "loss": 0.9551, + "step": 6995 + }, + { + "epoch": 0.6251033127080216, + "grad_norm": 0.4765163064002991, + "learning_rate": 3.2540658263511235e-05, + "loss": 0.9167, + "step": 6996 + }, + { + "epoch": 0.6251926642392834, + "grad_norm": 0.4322914183139801, + "learning_rate": 3.252709916438404e-05, + "loss": 0.9762, + "step": 6997 + }, + { + "epoch": 0.6252820157705453, + "grad_norm": 0.4407638907432556, + "learning_rate": 3.251354152879856e-05, + "loss": 0.9413, + "step": 6998 + }, + { + "epoch": 0.6253713673018071, + "grad_norm": 0.44821932911872864, + "learning_rate": 3.2499985357890356e-05, + "loss": 0.9412, + "step": 6999 + }, + { + "epoch": 0.625460718833069, + "grad_norm": 0.5897961854934692, + "learning_rate": 3.2486430652794906e-05, + "loss": 1.0047, + "step": 7000 + }, + { + "epoch": 0.6255500703643309, + "grad_norm": 0.37673619389533997, + "learning_rate": 3.247287741464758e-05, + "loss": 1.0576, + "step": 7001 + }, + { + "epoch": 0.6256394218955927, + "grad_norm": 0.43584850430488586, + "learning_rate": 3.245932564458359e-05, + "loss": 0.9994, + "step": 7002 + }, + { + "epoch": 0.6257287734268546, + "grad_norm": 0.4906632602214813, + "learning_rate": 3.244577534373805e-05, + "loss": 0.9025, + "step": 7003 + }, + { + "epoch": 0.6258181249581165, + "grad_norm": 0.4197581112384796, + "learning_rate": 3.2432226513245935e-05, + "loss": 0.9456, + "step": 7004 + }, + { + "epoch": 0.6259074764893784, + "grad_norm": 0.4052625000476837, + "learning_rate": 3.241867915424211e-05, + "loss": 0.9911, + "step": 7005 + }, + { + "epoch": 0.6259968280206402, + "grad_norm": 0.46664437651634216, + "learning_rate": 3.240513326786132e-05, + "loss": 0.9425, + "step": 7006 + }, + { + "epoch": 0.626086179551902, + "grad_norm": 0.5681549310684204, + "learning_rate": 3.239158885523815e-05, + "loss": 0.9522, + "step": 7007 + }, + { + "epoch": 0.6261755310831639, + "grad_norm": 0.4695439040660858, + "learning_rate": 3.237804591750713e-05, + "loss": 1.0061, + "step": 7008 + }, + { + "epoch": 0.6262648826144258, + "grad_norm": 0.4879624545574188, + "learning_rate": 3.236450445580258e-05, + "loss": 1.0038, + "step": 7009 + }, + { + "epoch": 0.6263542341456877, + "grad_norm": 0.4483213424682617, + "learning_rate": 3.2350964471258785e-05, + "loss": 0.9624, + "step": 7010 + }, + { + "epoch": 0.6264435856769496, + "grad_norm": 0.5487602949142456, + "learning_rate": 3.233742596500982e-05, + "loss": 0.8609, + "step": 7011 + }, + { + "epoch": 0.6265329372082115, + "grad_norm": 0.6317974925041199, + "learning_rate": 3.2323888938189696e-05, + "loss": 0.8496, + "step": 7012 + }, + { + "epoch": 0.6266222887394732, + "grad_norm": 0.5183166861534119, + "learning_rate": 3.231035339193229e-05, + "loss": 0.9314, + "step": 7013 + }, + { + "epoch": 0.6267116402707351, + "grad_norm": 0.4140419363975525, + "learning_rate": 3.2296819327371354e-05, + "loss": 1.0071, + "step": 7014 + }, + { + "epoch": 0.626800991801997, + "grad_norm": 0.4753478467464447, + "learning_rate": 3.228328674564049e-05, + "loss": 0.8971, + "step": 7015 + }, + { + "epoch": 0.6268903433332589, + "grad_norm": 0.4986949861049652, + "learning_rate": 3.226975564787322e-05, + "loss": 0.9599, + "step": 7016 + }, + { + "epoch": 0.6269796948645208, + "grad_norm": 0.4630643427371979, + "learning_rate": 3.2256226035202895e-05, + "loss": 0.9859, + "step": 7017 + }, + { + "epoch": 0.6270690463957826, + "grad_norm": 0.45809265971183777, + "learning_rate": 3.22426979087628e-05, + "loss": 0.9974, + "step": 7018 + }, + { + "epoch": 0.6271583979270444, + "grad_norm": 0.5661429762840271, + "learning_rate": 3.222917126968601e-05, + "loss": 0.8188, + "step": 7019 + }, + { + "epoch": 0.6272477494583063, + "grad_norm": 0.4159087538719177, + "learning_rate": 3.221564611910556e-05, + "loss": 0.941, + "step": 7020 + }, + { + "epoch": 0.6273371009895682, + "grad_norm": 0.4369848966598511, + "learning_rate": 3.22021224581543e-05, + "loss": 0.9944, + "step": 7021 + }, + { + "epoch": 0.6274264525208301, + "grad_norm": 0.43189677596092224, + "learning_rate": 3.218860028796501e-05, + "loss": 1.0337, + "step": 7022 + }, + { + "epoch": 0.627515804052092, + "grad_norm": 0.4517824947834015, + "learning_rate": 3.2175079609670286e-05, + "loss": 0.9424, + "step": 7023 + }, + { + "epoch": 0.6276051555833538, + "grad_norm": 0.42732521891593933, + "learning_rate": 3.216156042440267e-05, + "loss": 0.9269, + "step": 7024 + }, + { + "epoch": 0.6276945071146157, + "grad_norm": 0.5549626350402832, + "learning_rate": 3.2148042733294494e-05, + "loss": 0.898, + "step": 7025 + }, + { + "epoch": 0.6277838586458775, + "grad_norm": 0.4612114429473877, + "learning_rate": 3.2134526537478034e-05, + "loss": 0.9135, + "step": 7026 + }, + { + "epoch": 0.6278732101771394, + "grad_norm": 0.42315712571144104, + "learning_rate": 3.21210118380854e-05, + "loss": 0.9782, + "step": 7027 + }, + { + "epoch": 0.6279625617084013, + "grad_norm": 0.5238030552864075, + "learning_rate": 3.210749863624861e-05, + "loss": 0.923, + "step": 7028 + }, + { + "epoch": 0.6280519132396631, + "grad_norm": 0.39960044622421265, + "learning_rate": 3.209398693309954e-05, + "loss": 0.944, + "step": 7029 + }, + { + "epoch": 0.628141264770925, + "grad_norm": 0.5192845463752747, + "learning_rate": 3.2080476729769916e-05, + "loss": 0.9699, + "step": 7030 + }, + { + "epoch": 0.6282306163021869, + "grad_norm": 0.5337046980857849, + "learning_rate": 3.2066968027391374e-05, + "loss": 0.8788, + "step": 7031 + }, + { + "epoch": 0.6283199678334488, + "grad_norm": 0.4468163549900055, + "learning_rate": 3.20534608270954e-05, + "loss": 0.9746, + "step": 7032 + }, + { + "epoch": 0.6284093193647106, + "grad_norm": 0.4410332441329956, + "learning_rate": 3.20399551300134e-05, + "loss": 0.9883, + "step": 7033 + }, + { + "epoch": 0.6284986708959724, + "grad_norm": 0.4546487629413605, + "learning_rate": 3.202645093727659e-05, + "loss": 0.9566, + "step": 7034 + }, + { + "epoch": 0.6285880224272343, + "grad_norm": 0.5063059329986572, + "learning_rate": 3.2012948250016084e-05, + "loss": 0.865, + "step": 7035 + }, + { + "epoch": 0.6286773739584962, + "grad_norm": 0.432156503200531, + "learning_rate": 3.1999447069362904e-05, + "loss": 0.9478, + "step": 7036 + }, + { + "epoch": 0.6287667254897581, + "grad_norm": 0.5517258644104004, + "learning_rate": 3.19859473964479e-05, + "loss": 0.9291, + "step": 7037 + }, + { + "epoch": 0.62885607702102, + "grad_norm": 0.44080060720443726, + "learning_rate": 3.197244923240182e-05, + "loss": 1.0247, + "step": 7038 + }, + { + "epoch": 0.6289454285522819, + "grad_norm": 0.7278925180435181, + "learning_rate": 3.1958952578355295e-05, + "loss": 0.8903, + "step": 7039 + }, + { + "epoch": 0.6290347800835436, + "grad_norm": 0.44589099287986755, + "learning_rate": 3.194545743543878e-05, + "loss": 0.9401, + "step": 7040 + }, + { + "epoch": 0.6291241316148055, + "grad_norm": 0.5441766381263733, + "learning_rate": 3.193196380478264e-05, + "loss": 0.9553, + "step": 7041 + }, + { + "epoch": 0.6292134831460674, + "grad_norm": 0.4079403579235077, + "learning_rate": 3.191847168751714e-05, + "loss": 0.9159, + "step": 7042 + }, + { + "epoch": 0.6293028346773293, + "grad_norm": 0.5653771758079529, + "learning_rate": 3.190498108477237e-05, + "loss": 0.9289, + "step": 7043 + }, + { + "epoch": 0.6293921862085912, + "grad_norm": 0.48930737376213074, + "learning_rate": 3.18914919976783e-05, + "loss": 0.9681, + "step": 7044 + }, + { + "epoch": 0.629481537739853, + "grad_norm": 0.5332466959953308, + "learning_rate": 3.187800442736481e-05, + "loss": 0.9396, + "step": 7045 + }, + { + "epoch": 0.6295708892711149, + "grad_norm": 0.4455600678920746, + "learning_rate": 3.1864518374961606e-05, + "loss": 0.9359, + "step": 7046 + }, + { + "epoch": 0.6296602408023767, + "grad_norm": 0.47383755445480347, + "learning_rate": 3.1851033841598297e-05, + "loss": 0.9179, + "step": 7047 + }, + { + "epoch": 0.6297495923336386, + "grad_norm": 0.4770072102546692, + "learning_rate": 3.183755082840436e-05, + "loss": 0.9212, + "step": 7048 + }, + { + "epoch": 0.6298389438649005, + "grad_norm": 0.48469939827919006, + "learning_rate": 3.182406933650917e-05, + "loss": 0.9359, + "step": 7049 + }, + { + "epoch": 0.6299282953961624, + "grad_norm": 0.6045317053794861, + "learning_rate": 3.181058936704187e-05, + "loss": 0.9362, + "step": 7050 + }, + { + "epoch": 0.6300176469274242, + "grad_norm": 0.41952088475227356, + "learning_rate": 3.179711092113162e-05, + "loss": 0.9626, + "step": 7051 + }, + { + "epoch": 0.6301069984586861, + "grad_norm": 0.5453062653541565, + "learning_rate": 3.178363399990735e-05, + "loss": 0.8763, + "step": 7052 + }, + { + "epoch": 0.630196349989948, + "grad_norm": 0.47446975111961365, + "learning_rate": 3.1770158604497905e-05, + "loss": 0.9664, + "step": 7053 + }, + { + "epoch": 0.6302857015212098, + "grad_norm": 0.521207869052887, + "learning_rate": 3.175668473603199e-05, + "loss": 0.9685, + "step": 7054 + }, + { + "epoch": 0.6303750530524717, + "grad_norm": 0.47734490036964417, + "learning_rate": 3.17432123956382e-05, + "loss": 0.9631, + "step": 7055 + }, + { + "epoch": 0.6304644045837335, + "grad_norm": 0.5704666376113892, + "learning_rate": 3.172974158444496e-05, + "loss": 0.9473, + "step": 7056 + }, + { + "epoch": 0.6305537561149954, + "grad_norm": 0.5392587184906006, + "learning_rate": 3.171627230358063e-05, + "loss": 0.8843, + "step": 7057 + }, + { + "epoch": 0.6306431076462573, + "grad_norm": 0.5207484364509583, + "learning_rate": 3.1702804554173374e-05, + "loss": 0.9672, + "step": 7058 + }, + { + "epoch": 0.6307324591775192, + "grad_norm": 0.3899401128292084, + "learning_rate": 3.1689338337351273e-05, + "loss": 0.9844, + "step": 7059 + }, + { + "epoch": 0.6308218107087811, + "grad_norm": 0.4320841431617737, + "learning_rate": 3.1675873654242264e-05, + "loss": 0.9221, + "step": 7060 + }, + { + "epoch": 0.6309111622400428, + "grad_norm": 0.4138674736022949, + "learning_rate": 3.1662410505974146e-05, + "loss": 1.0483, + "step": 7061 + }, + { + "epoch": 0.6310005137713047, + "grad_norm": 0.5958953499794006, + "learning_rate": 3.164894889367463e-05, + "loss": 0.9235, + "step": 7062 + }, + { + "epoch": 0.6310898653025666, + "grad_norm": 0.4245629608631134, + "learning_rate": 3.1635488818471246e-05, + "loss": 0.9503, + "step": 7063 + }, + { + "epoch": 0.6311792168338285, + "grad_norm": 0.47923198342323303, + "learning_rate": 3.162203028149142e-05, + "loss": 0.9579, + "step": 7064 + }, + { + "epoch": 0.6312685683650904, + "grad_norm": 0.4923160672187805, + "learning_rate": 3.160857328386245e-05, + "loss": 0.9471, + "step": 7065 + }, + { + "epoch": 0.6313579198963523, + "grad_norm": 0.3958245515823364, + "learning_rate": 3.1595117826711514e-05, + "loss": 0.9887, + "step": 7066 + }, + { + "epoch": 0.6314472714276141, + "grad_norm": 0.4653833508491516, + "learning_rate": 3.1581663911165635e-05, + "loss": 0.9628, + "step": 7067 + }, + { + "epoch": 0.6315366229588759, + "grad_norm": 0.43516650795936584, + "learning_rate": 3.1568211538351736e-05, + "loss": 0.876, + "step": 7068 + }, + { + "epoch": 0.6316259744901378, + "grad_norm": 0.5455069541931152, + "learning_rate": 3.15547607093966e-05, + "loss": 0.8408, + "step": 7069 + }, + { + "epoch": 0.6317153260213997, + "grad_norm": 0.6274839639663696, + "learning_rate": 3.154131142542686e-05, + "loss": 0.9539, + "step": 7070 + }, + { + "epoch": 0.6318046775526616, + "grad_norm": 0.5620449781417847, + "learning_rate": 3.1527863687569026e-05, + "loss": 0.9296, + "step": 7071 + }, + { + "epoch": 0.6318940290839234, + "grad_norm": 0.5150429010391235, + "learning_rate": 3.1514417496949525e-05, + "loss": 0.8938, + "step": 7072 + }, + { + "epoch": 0.6319833806151853, + "grad_norm": 0.3932952880859375, + "learning_rate": 3.150097285469459e-05, + "loss": 0.9809, + "step": 7073 + }, + { + "epoch": 0.6320727321464472, + "grad_norm": 0.4048050045967102, + "learning_rate": 3.148752976193036e-05, + "loss": 0.9128, + "step": 7074 + }, + { + "epoch": 0.632162083677709, + "grad_norm": 0.45377394556999207, + "learning_rate": 3.147408821978285e-05, + "loss": 0.9428, + "step": 7075 + }, + { + "epoch": 0.6322514352089709, + "grad_norm": 0.47857969999313354, + "learning_rate": 3.146064822937793e-05, + "loss": 0.9388, + "step": 7076 + }, + { + "epoch": 0.6323407867402328, + "grad_norm": 0.42985495924949646, + "learning_rate": 3.144720979184133e-05, + "loss": 0.9978, + "step": 7077 + }, + { + "epoch": 0.6324301382714946, + "grad_norm": 0.4862077236175537, + "learning_rate": 3.1433772908298665e-05, + "loss": 0.9369, + "step": 7078 + }, + { + "epoch": 0.6325194898027565, + "grad_norm": 0.5292137861251831, + "learning_rate": 3.1420337579875424e-05, + "loss": 0.8707, + "step": 7079 + }, + { + "epoch": 0.6326088413340184, + "grad_norm": 0.5022360682487488, + "learning_rate": 3.140690380769696e-05, + "loss": 0.968, + "step": 7080 + }, + { + "epoch": 0.6326981928652802, + "grad_norm": 0.45004457235336304, + "learning_rate": 3.139347159288849e-05, + "loss": 0.9509, + "step": 7081 + }, + { + "epoch": 0.6327875443965421, + "grad_norm": 0.3994084894657135, + "learning_rate": 3.1380040936575094e-05, + "loss": 1.0183, + "step": 7082 + }, + { + "epoch": 0.6328768959278039, + "grad_norm": 0.4302981495857239, + "learning_rate": 3.136661183988175e-05, + "loss": 1.0077, + "step": 7083 + }, + { + "epoch": 0.6329662474590658, + "grad_norm": 0.5247629284858704, + "learning_rate": 3.135318430393328e-05, + "loss": 0.9228, + "step": 7084 + }, + { + "epoch": 0.6330555989903277, + "grad_norm": 0.44027116894721985, + "learning_rate": 3.133975832985438e-05, + "loss": 0.9233, + "step": 7085 + }, + { + "epoch": 0.6331449505215896, + "grad_norm": 0.4487362205982208, + "learning_rate": 3.1326333918769633e-05, + "loss": 0.9578, + "step": 7086 + }, + { + "epoch": 0.6332343020528515, + "grad_norm": 0.4788406789302826, + "learning_rate": 3.1312911071803464e-05, + "loss": 0.9458, + "step": 7087 + }, + { + "epoch": 0.6333236535841132, + "grad_norm": 0.5990272164344788, + "learning_rate": 3.1299489790080184e-05, + "loss": 0.8684, + "step": 7088 + }, + { + "epoch": 0.6334130051153751, + "grad_norm": 0.4570746123790741, + "learning_rate": 3.128607007472398e-05, + "loss": 1.0058, + "step": 7089 + }, + { + "epoch": 0.633502356646637, + "grad_norm": 0.45381224155426025, + "learning_rate": 3.127265192685887e-05, + "loss": 0.9837, + "step": 7090 + }, + { + "epoch": 0.6335917081778989, + "grad_norm": 0.46698102355003357, + "learning_rate": 3.1259235347608786e-05, + "loss": 0.9462, + "step": 7091 + }, + { + "epoch": 0.6336810597091608, + "grad_norm": 0.39173898100852966, + "learning_rate": 3.12458203380975e-05, + "loss": 0.9532, + "step": 7092 + }, + { + "epoch": 0.6337704112404227, + "grad_norm": 0.5905160307884216, + "learning_rate": 3.123240689944866e-05, + "loss": 0.8619, + "step": 7093 + }, + { + "epoch": 0.6338597627716845, + "grad_norm": 0.4507081210613251, + "learning_rate": 3.121899503278579e-05, + "loss": 0.9821, + "step": 7094 + }, + { + "epoch": 0.6339491143029463, + "grad_norm": 0.4796884059906006, + "learning_rate": 3.120558473923229e-05, + "loss": 0.9545, + "step": 7095 + }, + { + "epoch": 0.6340384658342082, + "grad_norm": 0.4841066896915436, + "learning_rate": 3.119217601991139e-05, + "loss": 0.8731, + "step": 7096 + }, + { + "epoch": 0.6341278173654701, + "grad_norm": 0.4773595631122589, + "learning_rate": 3.117876887594623e-05, + "loss": 0.9447, + "step": 7097 + }, + { + "epoch": 0.634217168896732, + "grad_norm": 0.5614939332008362, + "learning_rate": 3.116536330845979e-05, + "loss": 0.9185, + "step": 7098 + }, + { + "epoch": 0.6343065204279938, + "grad_norm": 0.440338671207428, + "learning_rate": 3.1151959318574964e-05, + "loss": 0.9498, + "step": 7099 + }, + { + "epoch": 0.6343958719592557, + "grad_norm": 0.43341484665870667, + "learning_rate": 3.113855690741443e-05, + "loss": 1.0054, + "step": 7100 + }, + { + "epoch": 0.6344852234905176, + "grad_norm": 0.5022625923156738, + "learning_rate": 3.1125156076100805e-05, + "loss": 0.8804, + "step": 7101 + }, + { + "epoch": 0.6345745750217794, + "grad_norm": 0.4659278392791748, + "learning_rate": 3.1111756825756546e-05, + "loss": 0.93, + "step": 7102 + }, + { + "epoch": 0.6346639265530413, + "grad_norm": 0.39844343066215515, + "learning_rate": 3.109835915750398e-05, + "loss": 1.0028, + "step": 7103 + }, + { + "epoch": 0.6347532780843032, + "grad_norm": 0.4734951853752136, + "learning_rate": 3.108496307246532e-05, + "loss": 0.9552, + "step": 7104 + }, + { + "epoch": 0.634842629615565, + "grad_norm": 0.4761861264705658, + "learning_rate": 3.107156857176262e-05, + "loss": 0.976, + "step": 7105 + }, + { + "epoch": 0.6349319811468269, + "grad_norm": 0.5420219302177429, + "learning_rate": 3.105817565651782e-05, + "loss": 0.8865, + "step": 7106 + }, + { + "epoch": 0.6350213326780888, + "grad_norm": 0.4390241205692291, + "learning_rate": 3.10447843278527e-05, + "loss": 0.9958, + "step": 7107 + }, + { + "epoch": 0.6351106842093507, + "grad_norm": 0.5415240526199341, + "learning_rate": 3.103139458688895e-05, + "loss": 0.8849, + "step": 7108 + }, + { + "epoch": 0.6352000357406125, + "grad_norm": 0.6151022911071777, + "learning_rate": 3.1018006434748113e-05, + "loss": 0.9167, + "step": 7109 + }, + { + "epoch": 0.6352893872718743, + "grad_norm": 0.4660266935825348, + "learning_rate": 3.100461987255155e-05, + "loss": 0.9687, + "step": 7110 + }, + { + "epoch": 0.6353787388031362, + "grad_norm": 0.4258476495742798, + "learning_rate": 3.0991234901420555e-05, + "loss": 0.9189, + "step": 7111 + }, + { + "epoch": 0.6354680903343981, + "grad_norm": 0.47770798206329346, + "learning_rate": 3.0977851522476254e-05, + "loss": 0.9047, + "step": 7112 + }, + { + "epoch": 0.63555744186566, + "grad_norm": 0.473283976316452, + "learning_rate": 3.096446973683966e-05, + "loss": 0.9983, + "step": 7113 + }, + { + "epoch": 0.6356467933969219, + "grad_norm": 0.4351825714111328, + "learning_rate": 3.0951089545631614e-05, + "loss": 0.9757, + "step": 7114 + }, + { + "epoch": 0.6357361449281838, + "grad_norm": 0.49671927094459534, + "learning_rate": 3.093771094997286e-05, + "loss": 0.8834, + "step": 7115 + }, + { + "epoch": 0.6358254964594455, + "grad_norm": 0.45592954754829407, + "learning_rate": 3.092433395098402e-05, + "loss": 0.9736, + "step": 7116 + }, + { + "epoch": 0.6359148479907074, + "grad_norm": 0.46876510977745056, + "learning_rate": 3.091095854978553e-05, + "loss": 0.9785, + "step": 7117 + }, + { + "epoch": 0.6360041995219693, + "grad_norm": 0.5314716696739197, + "learning_rate": 3.089758474749774e-05, + "loss": 0.9481, + "step": 7118 + }, + { + "epoch": 0.6360935510532312, + "grad_norm": 0.4798244833946228, + "learning_rate": 3.088421254524085e-05, + "loss": 0.8803, + "step": 7119 + }, + { + "epoch": 0.6361829025844931, + "grad_norm": 0.5197677612304688, + "learning_rate": 3.087084194413493e-05, + "loss": 0.8943, + "step": 7120 + }, + { + "epoch": 0.6362722541157549, + "grad_norm": 0.4947774112224579, + "learning_rate": 3.085747294529989e-05, + "loss": 0.9543, + "step": 7121 + }, + { + "epoch": 0.6363616056470168, + "grad_norm": 0.4913742244243622, + "learning_rate": 3.084410554985553e-05, + "loss": 0.9508, + "step": 7122 + }, + { + "epoch": 0.6364509571782786, + "grad_norm": 0.5018487572669983, + "learning_rate": 3.083073975892151e-05, + "loss": 0.8714, + "step": 7123 + }, + { + "epoch": 0.6365403087095405, + "grad_norm": 0.4363705813884735, + "learning_rate": 3.081737557361737e-05, + "loss": 0.946, + "step": 7124 + }, + { + "epoch": 0.6366296602408024, + "grad_norm": 0.3925338387489319, + "learning_rate": 3.0804012995062503e-05, + "loss": 0.988, + "step": 7125 + }, + { + "epoch": 0.6367190117720642, + "grad_norm": 0.4123278260231018, + "learning_rate": 3.0790652024376157e-05, + "loss": 0.9768, + "step": 7126 + }, + { + "epoch": 0.6368083633033261, + "grad_norm": 0.5473465323448181, + "learning_rate": 3.077729266267748e-05, + "loss": 0.9486, + "step": 7127 + }, + { + "epoch": 0.636897714834588, + "grad_norm": 0.5201464295387268, + "learning_rate": 3.076393491108542e-05, + "loss": 0.9381, + "step": 7128 + }, + { + "epoch": 0.6369870663658499, + "grad_norm": 0.4789254069328308, + "learning_rate": 3.075057877071886e-05, + "loss": 0.9006, + "step": 7129 + }, + { + "epoch": 0.6370764178971117, + "grad_norm": 0.4798673391342163, + "learning_rate": 3.0737224242696515e-05, + "loss": 0.9128, + "step": 7130 + }, + { + "epoch": 0.6371657694283736, + "grad_norm": 0.42166826128959656, + "learning_rate": 3.072387132813696e-05, + "loss": 1.0026, + "step": 7131 + }, + { + "epoch": 0.6372551209596354, + "grad_norm": 0.45716649293899536, + "learning_rate": 3.071052002815866e-05, + "loss": 0.9762, + "step": 7132 + }, + { + "epoch": 0.6373444724908973, + "grad_norm": 0.4412686824798584, + "learning_rate": 3.069717034387991e-05, + "loss": 0.9536, + "step": 7133 + }, + { + "epoch": 0.6374338240221592, + "grad_norm": 0.5291409492492676, + "learning_rate": 3.0683822276418895e-05, + "loss": 0.9105, + "step": 7134 + }, + { + "epoch": 0.6375231755534211, + "grad_norm": 0.41035082936286926, + "learning_rate": 3.0670475826893664e-05, + "loss": 0.9441, + "step": 7135 + }, + { + "epoch": 0.637612527084683, + "grad_norm": 0.44016411900520325, + "learning_rate": 3.065713099642211e-05, + "loss": 0.9229, + "step": 7136 + }, + { + "epoch": 0.6377018786159447, + "grad_norm": 0.5165585875511169, + "learning_rate": 3.0643787786122026e-05, + "loss": 0.9092, + "step": 7137 + }, + { + "epoch": 0.6377912301472066, + "grad_norm": 0.46023446321487427, + "learning_rate": 3.063044619711104e-05, + "loss": 0.9572, + "step": 7138 + }, + { + "epoch": 0.6378805816784685, + "grad_norm": 0.4530174136161804, + "learning_rate": 3.0617106230506645e-05, + "loss": 0.9166, + "step": 7139 + }, + { + "epoch": 0.6379699332097304, + "grad_norm": 0.45306262373924255, + "learning_rate": 3.0603767887426224e-05, + "loss": 1.0331, + "step": 7140 + }, + { + "epoch": 0.6380592847409923, + "grad_norm": 0.440663605928421, + "learning_rate": 3.059043116898698e-05, + "loss": 1.0083, + "step": 7141 + }, + { + "epoch": 0.6381486362722542, + "grad_norm": 0.48396119475364685, + "learning_rate": 3.057709607630601e-05, + "loss": 0.9481, + "step": 7142 + }, + { + "epoch": 0.638237987803516, + "grad_norm": 0.4982928931713104, + "learning_rate": 3.056376261050028e-05, + "loss": 1.0025, + "step": 7143 + }, + { + "epoch": 0.6383273393347778, + "grad_norm": 0.4071272313594818, + "learning_rate": 3.05504307726866e-05, + "loss": 1.0177, + "step": 7144 + }, + { + "epoch": 0.6384166908660397, + "grad_norm": 0.48170456290245056, + "learning_rate": 3.053710056398167e-05, + "loss": 0.8774, + "step": 7145 + }, + { + "epoch": 0.6385060423973016, + "grad_norm": 0.5535345077514648, + "learning_rate": 3.052377198550204e-05, + "loss": 0.8821, + "step": 7146 + }, + { + "epoch": 0.6385953939285635, + "grad_norm": 0.5003737807273865, + "learning_rate": 3.051044503836409e-05, + "loss": 0.9331, + "step": 7147 + }, + { + "epoch": 0.6386847454598253, + "grad_norm": 0.4324195086956024, + "learning_rate": 3.0497119723684108e-05, + "loss": 1.0236, + "step": 7148 + }, + { + "epoch": 0.6387740969910872, + "grad_norm": 0.5235957503318787, + "learning_rate": 3.0483796042578246e-05, + "loss": 0.9891, + "step": 7149 + }, + { + "epoch": 0.638863448522349, + "grad_norm": 0.5359724760055542, + "learning_rate": 3.047047399616251e-05, + "loss": 0.896, + "step": 7150 + }, + { + "epoch": 0.6389528000536109, + "grad_norm": 0.44273999333381653, + "learning_rate": 3.0457153585552723e-05, + "loss": 0.9874, + "step": 7151 + }, + { + "epoch": 0.6390421515848728, + "grad_norm": 0.41611048579216003, + "learning_rate": 3.0443834811864635e-05, + "loss": 0.9371, + "step": 7152 + }, + { + "epoch": 0.6391315031161346, + "grad_norm": 0.5038164854049683, + "learning_rate": 3.043051767621383e-05, + "loss": 0.9263, + "step": 7153 + }, + { + "epoch": 0.6392208546473965, + "grad_norm": 0.49609890580177307, + "learning_rate": 3.0417202179715776e-05, + "loss": 0.9053, + "step": 7154 + }, + { + "epoch": 0.6393102061786584, + "grad_norm": 0.4573724567890167, + "learning_rate": 3.0403888323485775e-05, + "loss": 0.9369, + "step": 7155 + }, + { + "epoch": 0.6393995577099203, + "grad_norm": 0.4710655212402344, + "learning_rate": 3.0390576108639e-05, + "loss": 0.9565, + "step": 7156 + }, + { + "epoch": 0.6394889092411821, + "grad_norm": 0.5273383259773254, + "learning_rate": 3.03772655362905e-05, + "loss": 1.0381, + "step": 7157 + }, + { + "epoch": 0.639578260772444, + "grad_norm": 0.5512682795524597, + "learning_rate": 3.0363956607555177e-05, + "loss": 0.9397, + "step": 7158 + }, + { + "epoch": 0.6396676123037058, + "grad_norm": 0.45379549264907837, + "learning_rate": 3.0350649323547796e-05, + "loss": 0.9626, + "step": 7159 + }, + { + "epoch": 0.6397569638349677, + "grad_norm": 0.4535743296146393, + "learning_rate": 3.0337343685383e-05, + "loss": 0.948, + "step": 7160 + }, + { + "epoch": 0.6398463153662296, + "grad_norm": 0.45462849736213684, + "learning_rate": 3.0324039694175233e-05, + "loss": 1.0285, + "step": 7161 + }, + { + "epoch": 0.6399356668974915, + "grad_norm": 0.5407901406288147, + "learning_rate": 3.0310737351038875e-05, + "loss": 0.9403, + "step": 7162 + }, + { + "epoch": 0.6400250184287534, + "grad_norm": 0.410166472196579, + "learning_rate": 3.029743665708814e-05, + "loss": 1.0002, + "step": 7163 + }, + { + "epoch": 0.6401143699600151, + "grad_norm": 0.501658022403717, + "learning_rate": 3.0284137613437098e-05, + "loss": 0.9004, + "step": 7164 + }, + { + "epoch": 0.640203721491277, + "grad_norm": 0.4671003818511963, + "learning_rate": 3.027084022119969e-05, + "loss": 0.9121, + "step": 7165 + }, + { + "epoch": 0.6402930730225389, + "grad_norm": 0.4434639811515808, + "learning_rate": 3.0257544481489712e-05, + "loss": 0.9303, + "step": 7166 + }, + { + "epoch": 0.6403824245538008, + "grad_norm": 0.46848881244659424, + "learning_rate": 3.024425039542082e-05, + "loss": 0.8991, + "step": 7167 + }, + { + "epoch": 0.6404717760850627, + "grad_norm": 0.4049372971057892, + "learning_rate": 3.0230957964106532e-05, + "loss": 0.9814, + "step": 7168 + }, + { + "epoch": 0.6405611276163246, + "grad_norm": 0.5770232677459717, + "learning_rate": 3.0217667188660248e-05, + "loss": 0.9142, + "step": 7169 + }, + { + "epoch": 0.6406504791475864, + "grad_norm": 0.5074619650840759, + "learning_rate": 3.0204378070195218e-05, + "loss": 0.9127, + "step": 7170 + }, + { + "epoch": 0.6407398306788482, + "grad_norm": 0.539682924747467, + "learning_rate": 3.01910906098245e-05, + "loss": 0.9482, + "step": 7171 + }, + { + "epoch": 0.6408291822101101, + "grad_norm": 0.5305405259132385, + "learning_rate": 3.0177804808661103e-05, + "loss": 0.9165, + "step": 7172 + }, + { + "epoch": 0.640918533741372, + "grad_norm": 0.39787405729293823, + "learning_rate": 3.0164520667817842e-05, + "loss": 0.9578, + "step": 7173 + }, + { + "epoch": 0.6410078852726339, + "grad_norm": 0.40553003549575806, + "learning_rate": 3.01512381884074e-05, + "loss": 1.0191, + "step": 7174 + }, + { + "epoch": 0.6410972368038957, + "grad_norm": 0.5072142481803894, + "learning_rate": 3.0137957371542336e-05, + "loss": 0.9339, + "step": 7175 + }, + { + "epoch": 0.6411865883351576, + "grad_norm": 0.402472585439682, + "learning_rate": 3.0124678218335058e-05, + "loss": 0.919, + "step": 7176 + }, + { + "epoch": 0.6412759398664195, + "grad_norm": 0.5144462585449219, + "learning_rate": 3.0111400729897833e-05, + "loss": 0.9673, + "step": 7177 + }, + { + "epoch": 0.6413652913976813, + "grad_norm": 0.5433812141418457, + "learning_rate": 3.009812490734279e-05, + "loss": 0.9127, + "step": 7178 + }, + { + "epoch": 0.6414546429289432, + "grad_norm": 0.522948145866394, + "learning_rate": 3.008485075178194e-05, + "loss": 0.8578, + "step": 7179 + }, + { + "epoch": 0.641543994460205, + "grad_norm": 0.4444359242916107, + "learning_rate": 3.0071578264327116e-05, + "loss": 0.9434, + "step": 7180 + }, + { + "epoch": 0.6416333459914669, + "grad_norm": 0.5288926959037781, + "learning_rate": 3.005830744609003e-05, + "loss": 0.8699, + "step": 7181 + }, + { + "epoch": 0.6417226975227288, + "grad_norm": 0.5076068639755249, + "learning_rate": 3.004503829818225e-05, + "loss": 0.905, + "step": 7182 + }, + { + "epoch": 0.6418120490539907, + "grad_norm": 0.49402233958244324, + "learning_rate": 3.003177082171523e-05, + "loss": 0.9201, + "step": 7183 + }, + { + "epoch": 0.6419014005852526, + "grad_norm": 0.5896358489990234, + "learning_rate": 3.0018505017800246e-05, + "loss": 0.8503, + "step": 7184 + }, + { + "epoch": 0.6419907521165144, + "grad_norm": 0.5615974068641663, + "learning_rate": 3.0005240887548445e-05, + "loss": 1.0653, + "step": 7185 + }, + { + "epoch": 0.6420801036477762, + "grad_norm": 0.42065030336380005, + "learning_rate": 2.9991978432070856e-05, + "loss": 0.9577, + "step": 7186 + }, + { + "epoch": 0.6421694551790381, + "grad_norm": 0.44661208987236023, + "learning_rate": 2.9978717652478344e-05, + "loss": 0.999, + "step": 7187 + }, + { + "epoch": 0.6422588067103, + "grad_norm": 0.489004909992218, + "learning_rate": 2.9965458549881638e-05, + "loss": 0.872, + "step": 7188 + }, + { + "epoch": 0.6423481582415619, + "grad_norm": 0.43301481008529663, + "learning_rate": 2.9952201125391332e-05, + "loss": 0.9497, + "step": 7189 + }, + { + "epoch": 0.6424375097728238, + "grad_norm": 0.46835726499557495, + "learning_rate": 2.993894538011789e-05, + "loss": 1.0229, + "step": 7190 + }, + { + "epoch": 0.6425268613040857, + "grad_norm": 0.4566435217857361, + "learning_rate": 2.9925691315171594e-05, + "loss": 0.9432, + "step": 7191 + }, + { + "epoch": 0.6426162128353474, + "grad_norm": 0.490376353263855, + "learning_rate": 2.9912438931662624e-05, + "loss": 0.9158, + "step": 7192 + }, + { + "epoch": 0.6427055643666093, + "grad_norm": 0.45299986004829407, + "learning_rate": 2.9899188230701014e-05, + "loss": 0.942, + "step": 7193 + }, + { + "epoch": 0.6427949158978712, + "grad_norm": 0.40514639019966125, + "learning_rate": 2.9885939213396647e-05, + "loss": 0.987, + "step": 7194 + }, + { + "epoch": 0.6428842674291331, + "grad_norm": 0.46090513467788696, + "learning_rate": 2.987269188085927e-05, + "loss": 1.0108, + "step": 7195 + }, + { + "epoch": 0.642973618960395, + "grad_norm": 0.5533647537231445, + "learning_rate": 2.9859446234198494e-05, + "loss": 0.8314, + "step": 7196 + }, + { + "epoch": 0.6430629704916568, + "grad_norm": 0.5012393593788147, + "learning_rate": 2.9846202274523776e-05, + "loss": 0.8779, + "step": 7197 + }, + { + "epoch": 0.6431523220229187, + "grad_norm": 0.4513954818248749, + "learning_rate": 2.9832960002944454e-05, + "loss": 0.855, + "step": 7198 + }, + { + "epoch": 0.6432416735541805, + "grad_norm": 0.49379584193229675, + "learning_rate": 2.981971942056968e-05, + "loss": 0.9278, + "step": 7199 + }, + { + "epoch": 0.6433310250854424, + "grad_norm": 0.43649551272392273, + "learning_rate": 2.9806480528508517e-05, + "loss": 0.9354, + "step": 7200 + }, + { + "epoch": 0.6434203766167043, + "grad_norm": 0.35923606157302856, + "learning_rate": 2.9793243327869868e-05, + "loss": 0.9558, + "step": 7201 + }, + { + "epoch": 0.6435097281479661, + "grad_norm": 0.4612623453140259, + "learning_rate": 2.978000781976248e-05, + "loss": 0.9682, + "step": 7202 + }, + { + "epoch": 0.643599079679228, + "grad_norm": 0.4402258098125458, + "learning_rate": 2.9766774005294952e-05, + "loss": 1.0167, + "step": 7203 + }, + { + "epoch": 0.6436884312104899, + "grad_norm": 0.6181815266609192, + "learning_rate": 2.9753541885575777e-05, + "loss": 0.877, + "step": 7204 + }, + { + "epoch": 0.6437777827417518, + "grad_norm": 0.5466234683990479, + "learning_rate": 2.9740311461713273e-05, + "loss": 0.936, + "step": 7205 + }, + { + "epoch": 0.6438671342730136, + "grad_norm": 0.4415399432182312, + "learning_rate": 2.9727082734815637e-05, + "loss": 0.9895, + "step": 7206 + }, + { + "epoch": 0.6439564858042754, + "grad_norm": 0.47382205724716187, + "learning_rate": 2.9713855705990923e-05, + "loss": 0.9684, + "step": 7207 + }, + { + "epoch": 0.6440458373355373, + "grad_norm": 0.5399396419525146, + "learning_rate": 2.970063037634703e-05, + "loss": 0.8178, + "step": 7208 + }, + { + "epoch": 0.6441351888667992, + "grad_norm": 0.5536864995956421, + "learning_rate": 2.9687406746991708e-05, + "loss": 0.9333, + "step": 7209 + }, + { + "epoch": 0.6442245403980611, + "grad_norm": 0.4546963572502136, + "learning_rate": 2.967418481903259e-05, + "loss": 0.993, + "step": 7210 + }, + { + "epoch": 0.644313891929323, + "grad_norm": 0.42409881949424744, + "learning_rate": 2.966096459357718e-05, + "loss": 0.9383, + "step": 7211 + }, + { + "epoch": 0.6444032434605848, + "grad_norm": 0.4200609624385834, + "learning_rate": 2.9647746071732757e-05, + "loss": 0.8847, + "step": 7212 + }, + { + "epoch": 0.6444925949918466, + "grad_norm": 0.5027011632919312, + "learning_rate": 2.9634529254606542e-05, + "loss": 0.8904, + "step": 7213 + }, + { + "epoch": 0.6445819465231085, + "grad_norm": 0.4371449053287506, + "learning_rate": 2.962131414330558e-05, + "loss": 0.9849, + "step": 7214 + }, + { + "epoch": 0.6446712980543704, + "grad_norm": 0.42081284523010254, + "learning_rate": 2.9608100738936783e-05, + "loss": 0.9994, + "step": 7215 + }, + { + "epoch": 0.6447606495856323, + "grad_norm": 0.454153448343277, + "learning_rate": 2.9594889042606923e-05, + "loss": 0.9647, + "step": 7216 + }, + { + "epoch": 0.6448500011168942, + "grad_norm": 0.4745718836784363, + "learning_rate": 2.958167905542259e-05, + "loss": 0.959, + "step": 7217 + }, + { + "epoch": 0.644939352648156, + "grad_norm": 0.5167459845542908, + "learning_rate": 2.9568470778490287e-05, + "loss": 0.9293, + "step": 7218 + }, + { + "epoch": 0.6450287041794178, + "grad_norm": 0.5096081495285034, + "learning_rate": 2.9555264212916334e-05, + "loss": 0.9004, + "step": 7219 + }, + { + "epoch": 0.6451180557106797, + "grad_norm": 0.5786259770393372, + "learning_rate": 2.9542059359806935e-05, + "loss": 1.0463, + "step": 7220 + }, + { + "epoch": 0.6452074072419416, + "grad_norm": 0.3960155248641968, + "learning_rate": 2.9528856220268147e-05, + "loss": 0.9449, + "step": 7221 + }, + { + "epoch": 0.6452967587732035, + "grad_norm": 0.48387083411216736, + "learning_rate": 2.951565479540584e-05, + "loss": 0.9286, + "step": 7222 + }, + { + "epoch": 0.6453861103044654, + "grad_norm": 0.46584317088127136, + "learning_rate": 2.9502455086325787e-05, + "loss": 0.922, + "step": 7223 + }, + { + "epoch": 0.6454754618357272, + "grad_norm": 0.42123234272003174, + "learning_rate": 2.9489257094133616e-05, + "loss": 0.9701, + "step": 7224 + }, + { + "epoch": 0.6455648133669891, + "grad_norm": 0.5903416275978088, + "learning_rate": 2.9476060819934786e-05, + "loss": 0.896, + "step": 7225 + }, + { + "epoch": 0.6456541648982509, + "grad_norm": 0.4230227768421173, + "learning_rate": 2.946286626483463e-05, + "loss": 0.9556, + "step": 7226 + }, + { + "epoch": 0.6457435164295128, + "grad_norm": 0.5032885670661926, + "learning_rate": 2.9449673429938342e-05, + "loss": 0.934, + "step": 7227 + }, + { + "epoch": 0.6458328679607747, + "grad_norm": 0.5191749334335327, + "learning_rate": 2.943648231635095e-05, + "loss": 0.8587, + "step": 7228 + }, + { + "epoch": 0.6459222194920365, + "grad_norm": 0.49752548336982727, + "learning_rate": 2.942329292517736e-05, + "loss": 0.9805, + "step": 7229 + }, + { + "epoch": 0.6460115710232984, + "grad_norm": 0.5034040212631226, + "learning_rate": 2.9410105257522314e-05, + "loss": 0.9645, + "step": 7230 + }, + { + "epoch": 0.6461009225545603, + "grad_norm": 0.5222768783569336, + "learning_rate": 2.9396919314490447e-05, + "loss": 0.9295, + "step": 7231 + }, + { + "epoch": 0.6461902740858222, + "grad_norm": 0.4623466730117798, + "learning_rate": 2.9383735097186175e-05, + "loss": 0.9802, + "step": 7232 + }, + { + "epoch": 0.646279625617084, + "grad_norm": 0.49886736273765564, + "learning_rate": 2.9370552606713852e-05, + "loss": 0.9273, + "step": 7233 + }, + { + "epoch": 0.6463689771483458, + "grad_norm": 0.547577440738678, + "learning_rate": 2.935737184417764e-05, + "loss": 0.9833, + "step": 7234 + }, + { + "epoch": 0.6464583286796077, + "grad_norm": 0.4106947183609009, + "learning_rate": 2.9344192810681577e-05, + "loss": 1.0004, + "step": 7235 + }, + { + "epoch": 0.6465476802108696, + "grad_norm": 0.41953131556510925, + "learning_rate": 2.933101550732953e-05, + "loss": 0.9349, + "step": 7236 + }, + { + "epoch": 0.6466370317421315, + "grad_norm": 0.4484841823577881, + "learning_rate": 2.9317839935225254e-05, + "loss": 0.9114, + "step": 7237 + }, + { + "epoch": 0.6467263832733934, + "grad_norm": 0.4939296245574951, + "learning_rate": 2.9304666095472334e-05, + "loss": 0.9474, + "step": 7238 + }, + { + "epoch": 0.6468157348046553, + "grad_norm": 0.409227579832077, + "learning_rate": 2.9291493989174234e-05, + "loss": 1.0057, + "step": 7239 + }, + { + "epoch": 0.646905086335917, + "grad_norm": 0.44199416041374207, + "learning_rate": 2.9278323617434245e-05, + "loss": 0.899, + "step": 7240 + }, + { + "epoch": 0.6469944378671789, + "grad_norm": 0.4873824417591095, + "learning_rate": 2.9265154981355547e-05, + "loss": 0.9753, + "step": 7241 + }, + { + "epoch": 0.6470837893984408, + "grad_norm": 0.4727482497692108, + "learning_rate": 2.9251988082041115e-05, + "loss": 0.9691, + "step": 7242 + }, + { + "epoch": 0.6471731409297027, + "grad_norm": 0.4748595952987671, + "learning_rate": 2.9238822920593844e-05, + "loss": 0.993, + "step": 7243 + }, + { + "epoch": 0.6472624924609646, + "grad_norm": 0.490640252828598, + "learning_rate": 2.9225659498116452e-05, + "loss": 0.9925, + "step": 7244 + }, + { + "epoch": 0.6473518439922265, + "grad_norm": 0.5214620232582092, + "learning_rate": 2.9212497815711516e-05, + "loss": 0.9996, + "step": 7245 + }, + { + "epoch": 0.6474411955234883, + "grad_norm": 0.5297476649284363, + "learning_rate": 2.9199337874481465e-05, + "loss": 0.9351, + "step": 7246 + }, + { + "epoch": 0.6475305470547501, + "grad_norm": 0.4094926416873932, + "learning_rate": 2.9186179675528597e-05, + "loss": 0.9445, + "step": 7247 + }, + { + "epoch": 0.647619898586012, + "grad_norm": 0.5132883787155151, + "learning_rate": 2.9173023219955032e-05, + "loss": 0.8968, + "step": 7248 + }, + { + "epoch": 0.6477092501172739, + "grad_norm": 0.5564563870429993, + "learning_rate": 2.9159868508862766e-05, + "loss": 0.8778, + "step": 7249 + }, + { + "epoch": 0.6477986016485358, + "grad_norm": 0.5214046835899353, + "learning_rate": 2.9146715543353652e-05, + "loss": 0.9337, + "step": 7250 + }, + { + "epoch": 0.6478879531797976, + "grad_norm": 0.38234221935272217, + "learning_rate": 2.9133564324529415e-05, + "loss": 0.9849, + "step": 7251 + }, + { + "epoch": 0.6479773047110595, + "grad_norm": 0.4253350794315338, + "learning_rate": 2.912041485349157e-05, + "loss": 0.9478, + "step": 7252 + }, + { + "epoch": 0.6480666562423214, + "grad_norm": 0.436994731426239, + "learning_rate": 2.9107267131341537e-05, + "loss": 0.9162, + "step": 7253 + }, + { + "epoch": 0.6481560077735832, + "grad_norm": 0.4581502676010132, + "learning_rate": 2.9094121159180588e-05, + "loss": 0.9918, + "step": 7254 + }, + { + "epoch": 0.6482453593048451, + "grad_norm": 0.45387864112854004, + "learning_rate": 2.908097693810983e-05, + "loss": 0.8974, + "step": 7255 + }, + { + "epoch": 0.6483347108361069, + "grad_norm": 0.4383721947669983, + "learning_rate": 2.9067834469230225e-05, + "loss": 0.9876, + "step": 7256 + }, + { + "epoch": 0.6484240623673688, + "grad_norm": 0.4127906858921051, + "learning_rate": 2.9054693753642614e-05, + "loss": 0.9452, + "step": 7257 + }, + { + "epoch": 0.6485134138986307, + "grad_norm": 0.530967116355896, + "learning_rate": 2.9041554792447655e-05, + "loss": 0.918, + "step": 7258 + }, + { + "epoch": 0.6486027654298926, + "grad_norm": 0.48077839612960815, + "learning_rate": 2.9028417586745887e-05, + "loss": 0.9524, + "step": 7259 + }, + { + "epoch": 0.6486921169611545, + "grad_norm": 0.47966933250427246, + "learning_rate": 2.9015282137637688e-05, + "loss": 0.9938, + "step": 7260 + }, + { + "epoch": 0.6487814684924162, + "grad_norm": 0.5792236328125, + "learning_rate": 2.900214844622331e-05, + "loss": 0.9617, + "step": 7261 + }, + { + "epoch": 0.6488708200236781, + "grad_norm": 0.5350083708763123, + "learning_rate": 2.8989016513602802e-05, + "loss": 0.9807, + "step": 7262 + }, + { + "epoch": 0.64896017155494, + "grad_norm": 0.4303712844848633, + "learning_rate": 2.8975886340876117e-05, + "loss": 0.9293, + "step": 7263 + }, + { + "epoch": 0.6490495230862019, + "grad_norm": 0.451147198677063, + "learning_rate": 2.896275792914306e-05, + "loss": 1.0082, + "step": 7264 + }, + { + "epoch": 0.6491388746174638, + "grad_norm": 0.37288275361061096, + "learning_rate": 2.8949631279503264e-05, + "loss": 1.0025, + "step": 7265 + }, + { + "epoch": 0.6492282261487257, + "grad_norm": 0.480672687292099, + "learning_rate": 2.8936506393056223e-05, + "loss": 0.9353, + "step": 7266 + }, + { + "epoch": 0.6493175776799875, + "grad_norm": 0.547631561756134, + "learning_rate": 2.89233832709013e-05, + "loss": 0.925, + "step": 7267 + }, + { + "epoch": 0.6494069292112493, + "grad_norm": 0.43532001972198486, + "learning_rate": 2.8910261914137682e-05, + "loss": 0.9405, + "step": 7268 + }, + { + "epoch": 0.6494962807425112, + "grad_norm": 0.45452019572257996, + "learning_rate": 2.8897142323864433e-05, + "loss": 0.9886, + "step": 7269 + }, + { + "epoch": 0.6495856322737731, + "grad_norm": 0.4508405923843384, + "learning_rate": 2.8884024501180456e-05, + "loss": 0.9328, + "step": 7270 + }, + { + "epoch": 0.649674983805035, + "grad_norm": 0.47930291295051575, + "learning_rate": 2.887090844718453e-05, + "loss": 0.9157, + "step": 7271 + }, + { + "epoch": 0.6497643353362968, + "grad_norm": 0.47216659784317017, + "learning_rate": 2.8857794162975214e-05, + "loss": 0.9874, + "step": 7272 + }, + { + "epoch": 0.6498536868675587, + "grad_norm": 0.4564785063266754, + "learning_rate": 2.8844681649651e-05, + "loss": 0.9879, + "step": 7273 + }, + { + "epoch": 0.6499430383988205, + "grad_norm": 0.5167818069458008, + "learning_rate": 2.8831570908310202e-05, + "loss": 0.9293, + "step": 7274 + }, + { + "epoch": 0.6500323899300824, + "grad_norm": 0.4396851658821106, + "learning_rate": 2.8818461940050967e-05, + "loss": 0.8964, + "step": 7275 + }, + { + "epoch": 0.6501217414613443, + "grad_norm": 0.5146901607513428, + "learning_rate": 2.8805354745971337e-05, + "loss": 0.9672, + "step": 7276 + }, + { + "epoch": 0.6502110929926062, + "grad_norm": 0.5108723044395447, + "learning_rate": 2.879224932716918e-05, + "loss": 0.9948, + "step": 7277 + }, + { + "epoch": 0.650300444523868, + "grad_norm": 0.469088613986969, + "learning_rate": 2.877914568474218e-05, + "loss": 0.9109, + "step": 7278 + }, + { + "epoch": 0.6503897960551299, + "grad_norm": 0.4965616464614868, + "learning_rate": 2.8766043819787925e-05, + "loss": 0.8499, + "step": 7279 + }, + { + "epoch": 0.6504791475863918, + "grad_norm": 0.47035840153694153, + "learning_rate": 2.875294373340384e-05, + "loss": 0.9244, + "step": 7280 + }, + { + "epoch": 0.6505684991176536, + "grad_norm": 0.5100567936897278, + "learning_rate": 2.8739845426687218e-05, + "loss": 0.9717, + "step": 7281 + }, + { + "epoch": 0.6506578506489155, + "grad_norm": 0.40733376145362854, + "learning_rate": 2.8726748900735133e-05, + "loss": 0.9693, + "step": 7282 + }, + { + "epoch": 0.6507472021801773, + "grad_norm": 0.4507538676261902, + "learning_rate": 2.8713654156644588e-05, + "loss": 0.9329, + "step": 7283 + }, + { + "epoch": 0.6508365537114392, + "grad_norm": 0.44749560952186584, + "learning_rate": 2.87005611955124e-05, + "loss": 0.9209, + "step": 7284 + }, + { + "epoch": 0.6509259052427011, + "grad_norm": 0.3972071409225464, + "learning_rate": 2.8687470018435246e-05, + "loss": 1.0053, + "step": 7285 + }, + { + "epoch": 0.651015256773963, + "grad_norm": 0.4977121353149414, + "learning_rate": 2.867438062650966e-05, + "loss": 0.9951, + "step": 7286 + }, + { + "epoch": 0.6511046083052249, + "grad_norm": 0.42729929089546204, + "learning_rate": 2.866129302083201e-05, + "loss": 0.9505, + "step": 7287 + }, + { + "epoch": 0.6511939598364866, + "grad_norm": 0.3862477242946625, + "learning_rate": 2.8648207202498524e-05, + "loss": 0.9529, + "step": 7288 + }, + { + "epoch": 0.6512833113677485, + "grad_norm": 0.5908159017562866, + "learning_rate": 2.8635123172605273e-05, + "loss": 0.9252, + "step": 7289 + }, + { + "epoch": 0.6513726628990104, + "grad_norm": 0.5458805561065674, + "learning_rate": 2.8622040932248196e-05, + "loss": 0.9316, + "step": 7290 + }, + { + "epoch": 0.6514620144302723, + "grad_norm": 0.5677804350852966, + "learning_rate": 2.8608960482523056e-05, + "loss": 0.9166, + "step": 7291 + }, + { + "epoch": 0.6515513659615342, + "grad_norm": 0.57185298204422, + "learning_rate": 2.859588182452551e-05, + "loss": 0.9909, + "step": 7292 + }, + { + "epoch": 0.6516407174927961, + "grad_norm": 0.4821870028972626, + "learning_rate": 2.8582804959350994e-05, + "loss": 0.9059, + "step": 7293 + }, + { + "epoch": 0.651730069024058, + "grad_norm": 0.39487797021865845, + "learning_rate": 2.8569729888094853e-05, + "loss": 0.9615, + "step": 7294 + }, + { + "epoch": 0.6518194205553197, + "grad_norm": 0.5088570713996887, + "learning_rate": 2.8556656611852274e-05, + "loss": 0.9741, + "step": 7295 + }, + { + "epoch": 0.6519087720865816, + "grad_norm": 0.4211069941520691, + "learning_rate": 2.8543585131718263e-05, + "loss": 1.0125, + "step": 7296 + }, + { + "epoch": 0.6519981236178435, + "grad_norm": 0.46207818388938904, + "learning_rate": 2.853051544878771e-05, + "loss": 0.9296, + "step": 7297 + }, + { + "epoch": 0.6520874751491054, + "grad_norm": 0.454182893037796, + "learning_rate": 2.851744756415533e-05, + "loss": 0.8923, + "step": 7298 + }, + { + "epoch": 0.6521768266803672, + "grad_norm": 0.4857379198074341, + "learning_rate": 2.850438147891571e-05, + "loss": 0.9274, + "step": 7299 + }, + { + "epoch": 0.6522661782116291, + "grad_norm": 0.42418479919433594, + "learning_rate": 2.8491317194163265e-05, + "loss": 1.0274, + "step": 7300 + }, + { + "epoch": 0.652355529742891, + "grad_norm": 0.49535322189331055, + "learning_rate": 2.847825471099227e-05, + "loss": 0.9725, + "step": 7301 + }, + { + "epoch": 0.6524448812741528, + "grad_norm": 0.46349287033081055, + "learning_rate": 2.8465194030496872e-05, + "loss": 0.9706, + "step": 7302 + }, + { + "epoch": 0.6525342328054147, + "grad_norm": 0.5424239635467529, + "learning_rate": 2.8452135153771e-05, + "loss": 0.9002, + "step": 7303 + }, + { + "epoch": 0.6526235843366766, + "grad_norm": 0.5158531069755554, + "learning_rate": 2.843907808190849e-05, + "loss": 0.8912, + "step": 7304 + }, + { + "epoch": 0.6527129358679384, + "grad_norm": 0.44023188948631287, + "learning_rate": 2.8426022816003012e-05, + "loss": 1.0164, + "step": 7305 + }, + { + "epoch": 0.6528022873992003, + "grad_norm": 0.6754381060600281, + "learning_rate": 2.841296935714809e-05, + "loss": 0.9589, + "step": 7306 + }, + { + "epoch": 0.6528916389304622, + "grad_norm": 0.65952068567276, + "learning_rate": 2.8399917706437074e-05, + "loss": 0.8979, + "step": 7307 + }, + { + "epoch": 0.6529809904617241, + "grad_norm": 0.431598037481308, + "learning_rate": 2.8386867864963202e-05, + "loss": 0.8975, + "step": 7308 + }, + { + "epoch": 0.6530703419929859, + "grad_norm": 0.45453155040740967, + "learning_rate": 2.8373819833819526e-05, + "loss": 0.9543, + "step": 7309 + }, + { + "epoch": 0.6531596935242477, + "grad_norm": 0.4382008910179138, + "learning_rate": 2.836077361409897e-05, + "loss": 1.0249, + "step": 7310 + }, + { + "epoch": 0.6532490450555096, + "grad_norm": 0.4890137314796448, + "learning_rate": 2.8347729206894268e-05, + "loss": 0.9056, + "step": 7311 + }, + { + "epoch": 0.6533383965867715, + "grad_norm": 0.4039430022239685, + "learning_rate": 2.8334686613298034e-05, + "loss": 0.995, + "step": 7312 + }, + { + "epoch": 0.6534277481180334, + "grad_norm": 0.5166674256324768, + "learning_rate": 2.8321645834402737e-05, + "loss": 0.9533, + "step": 7313 + }, + { + "epoch": 0.6535170996492953, + "grad_norm": 0.5235624313354492, + "learning_rate": 2.8308606871300697e-05, + "loss": 0.9465, + "step": 7314 + }, + { + "epoch": 0.6536064511805572, + "grad_norm": 0.46239298582077026, + "learning_rate": 2.8295569725084027e-05, + "loss": 0.9373, + "step": 7315 + }, + { + "epoch": 0.6536958027118189, + "grad_norm": 0.46263396739959717, + "learning_rate": 2.828253439684474e-05, + "loss": 0.9634, + "step": 7316 + }, + { + "epoch": 0.6537851542430808, + "grad_norm": 0.6211736798286438, + "learning_rate": 2.8269500887674687e-05, + "loss": 0.9254, + "step": 7317 + }, + { + "epoch": 0.6538745057743427, + "grad_norm": 0.5086511969566345, + "learning_rate": 2.825646919866557e-05, + "loss": 0.8853, + "step": 7318 + }, + { + "epoch": 0.6539638573056046, + "grad_norm": 0.4071234464645386, + "learning_rate": 2.8243439330908926e-05, + "loss": 0.9643, + "step": 7319 + }, + { + "epoch": 0.6540532088368665, + "grad_norm": 0.4825928807258606, + "learning_rate": 2.8230411285496145e-05, + "loss": 0.9184, + "step": 7320 + }, + { + "epoch": 0.6541425603681283, + "grad_norm": 0.5398180484771729, + "learning_rate": 2.8217385063518463e-05, + "loss": 0.921, + "step": 7321 + }, + { + "epoch": 0.6542319118993902, + "grad_norm": 0.5476343631744385, + "learning_rate": 2.8204360666067e-05, + "loss": 0.9528, + "step": 7322 + }, + { + "epoch": 0.654321263430652, + "grad_norm": 0.4591327905654907, + "learning_rate": 2.819133809423262e-05, + "loss": 0.9868, + "step": 7323 + }, + { + "epoch": 0.6544106149619139, + "grad_norm": 0.5206472277641296, + "learning_rate": 2.8178317349106155e-05, + "loss": 0.9039, + "step": 7324 + }, + { + "epoch": 0.6544999664931758, + "grad_norm": 0.4743161201477051, + "learning_rate": 2.8165298431778197e-05, + "loss": 0.9459, + "step": 7325 + }, + { + "epoch": 0.6545893180244376, + "grad_norm": 0.48798489570617676, + "learning_rate": 2.8152281343339248e-05, + "loss": 0.9241, + "step": 7326 + }, + { + "epoch": 0.6546786695556995, + "grad_norm": 0.4876413643360138, + "learning_rate": 2.8139266084879614e-05, + "loss": 0.9567, + "step": 7327 + }, + { + "epoch": 0.6547680210869614, + "grad_norm": 0.5455504059791565, + "learning_rate": 2.812625265748946e-05, + "loss": 1.0707, + "step": 7328 + }, + { + "epoch": 0.6548573726182233, + "grad_norm": 0.4286678731441498, + "learning_rate": 2.811324106225881e-05, + "loss": 0.9682, + "step": 7329 + }, + { + "epoch": 0.6549467241494851, + "grad_norm": 0.4500292241573334, + "learning_rate": 2.8100231300277514e-05, + "loss": 0.8929, + "step": 7330 + }, + { + "epoch": 0.655036075680747, + "grad_norm": 0.4849509596824646, + "learning_rate": 2.8087223372635286e-05, + "loss": 1.0102, + "step": 7331 + }, + { + "epoch": 0.6551254272120088, + "grad_norm": 0.4313583970069885, + "learning_rate": 2.8074217280421688e-05, + "loss": 0.9825, + "step": 7332 + }, + { + "epoch": 0.6552147787432707, + "grad_norm": 0.485607773065567, + "learning_rate": 2.8061213024726085e-05, + "loss": 0.8919, + "step": 7333 + }, + { + "epoch": 0.6553041302745326, + "grad_norm": 0.5385865569114685, + "learning_rate": 2.8048210606637744e-05, + "loss": 0.9059, + "step": 7334 + }, + { + "epoch": 0.6553934818057945, + "grad_norm": 0.4551725387573242, + "learning_rate": 2.803521002724575e-05, + "loss": 0.9603, + "step": 7335 + }, + { + "epoch": 0.6554828333370563, + "grad_norm": 0.5102852582931519, + "learning_rate": 2.8022211287639044e-05, + "loss": 0.8227, + "step": 7336 + }, + { + "epoch": 0.6555721848683181, + "grad_norm": 0.5619865655899048, + "learning_rate": 2.8009214388906414e-05, + "loss": 0.9732, + "step": 7337 + }, + { + "epoch": 0.65566153639958, + "grad_norm": 0.4476434886455536, + "learning_rate": 2.7996219332136486e-05, + "loss": 1.0222, + "step": 7338 + }, + { + "epoch": 0.6557508879308419, + "grad_norm": 0.4480132758617401, + "learning_rate": 2.7983226118417728e-05, + "loss": 0.8978, + "step": 7339 + }, + { + "epoch": 0.6558402394621038, + "grad_norm": 0.5554249286651611, + "learning_rate": 2.7970234748838466e-05, + "loss": 0.8385, + "step": 7340 + }, + { + "epoch": 0.6559295909933657, + "grad_norm": 0.4294489622116089, + "learning_rate": 2.7957245224486862e-05, + "loss": 0.9374, + "step": 7341 + }, + { + "epoch": 0.6560189425246276, + "grad_norm": 0.5135436058044434, + "learning_rate": 2.7944257546450948e-05, + "loss": 0.9616, + "step": 7342 + }, + { + "epoch": 0.6561082940558893, + "grad_norm": 0.47493478655815125, + "learning_rate": 2.793127171581854e-05, + "loss": 0.867, + "step": 7343 + }, + { + "epoch": 0.6561976455871512, + "grad_norm": 0.4486094117164612, + "learning_rate": 2.7918287733677372e-05, + "loss": 0.9386, + "step": 7344 + }, + { + "epoch": 0.6562869971184131, + "grad_norm": 0.42966026067733765, + "learning_rate": 2.7905305601114972e-05, + "loss": 1.0198, + "step": 7345 + }, + { + "epoch": 0.656376348649675, + "grad_norm": 0.4275778830051422, + "learning_rate": 2.7892325319218744e-05, + "loss": 1.0089, + "step": 7346 + }, + { + "epoch": 0.6564657001809369, + "grad_norm": 0.4956458508968353, + "learning_rate": 2.787934688907594e-05, + "loss": 0.9616, + "step": 7347 + }, + { + "epoch": 0.6565550517121987, + "grad_norm": 0.5669902563095093, + "learning_rate": 2.7866370311773603e-05, + "loss": 0.8564, + "step": 7348 + }, + { + "epoch": 0.6566444032434606, + "grad_norm": 0.5185860991477966, + "learning_rate": 2.7853395588398677e-05, + "loss": 0.9535, + "step": 7349 + }, + { + "epoch": 0.6567337547747224, + "grad_norm": 0.4602813124656677, + "learning_rate": 2.784042272003794e-05, + "loss": 0.9282, + "step": 7350 + }, + { + "epoch": 0.6568231063059843, + "grad_norm": 0.42367443442344666, + "learning_rate": 2.7827451707778007e-05, + "loss": 0.9314, + "step": 7351 + }, + { + "epoch": 0.6569124578372462, + "grad_norm": 0.40423354506492615, + "learning_rate": 2.7814482552705346e-05, + "loss": 1.036, + "step": 7352 + }, + { + "epoch": 0.657001809368508, + "grad_norm": 0.4704086184501648, + "learning_rate": 2.780151525590624e-05, + "loss": 0.9161, + "step": 7353 + }, + { + "epoch": 0.6570911608997699, + "grad_norm": 0.4368595778942108, + "learning_rate": 2.7788549818466847e-05, + "loss": 1.052, + "step": 7354 + }, + { + "epoch": 0.6571805124310318, + "grad_norm": 0.5148953199386597, + "learning_rate": 2.7775586241473173e-05, + "loss": 0.9886, + "step": 7355 + }, + { + "epoch": 0.6572698639622937, + "grad_norm": 0.43235522508621216, + "learning_rate": 2.7762624526011038e-05, + "loss": 0.8993, + "step": 7356 + }, + { + "epoch": 0.6573592154935555, + "grad_norm": 0.5775102376937866, + "learning_rate": 2.774966467316613e-05, + "loss": 0.9632, + "step": 7357 + }, + { + "epoch": 0.6574485670248174, + "grad_norm": 0.4877246022224426, + "learning_rate": 2.7736706684023982e-05, + "loss": 0.8856, + "step": 7358 + }, + { + "epoch": 0.6575379185560792, + "grad_norm": 0.4451915919780731, + "learning_rate": 2.772375055966996e-05, + "loss": 0.9679, + "step": 7359 + }, + { + "epoch": 0.6576272700873411, + "grad_norm": 0.4021294414997101, + "learning_rate": 2.7710796301189268e-05, + "loss": 0.9819, + "step": 7360 + }, + { + "epoch": 0.657716621618603, + "grad_norm": 0.4669017791748047, + "learning_rate": 2.7697843909666977e-05, + "loss": 0.9477, + "step": 7361 + }, + { + "epoch": 0.6578059731498649, + "grad_norm": 0.48270532488822937, + "learning_rate": 2.7684893386188003e-05, + "loss": 0.993, + "step": 7362 + }, + { + "epoch": 0.6578953246811268, + "grad_norm": 0.4589235186576843, + "learning_rate": 2.767194473183705e-05, + "loss": 0.9158, + "step": 7363 + }, + { + "epoch": 0.6579846762123885, + "grad_norm": 0.4602046012878418, + "learning_rate": 2.765899794769873e-05, + "loss": 0.9123, + "step": 7364 + }, + { + "epoch": 0.6580740277436504, + "grad_norm": 0.4203275144100189, + "learning_rate": 2.7646053034857457e-05, + "loss": 0.9179, + "step": 7365 + }, + { + "epoch": 0.6581633792749123, + "grad_norm": 0.464804083108902, + "learning_rate": 2.7633109994397533e-05, + "loss": 0.9164, + "step": 7366 + }, + { + "epoch": 0.6582527308061742, + "grad_norm": 0.4711710810661316, + "learning_rate": 2.762016882740305e-05, + "loss": 0.9645, + "step": 7367 + }, + { + "epoch": 0.6583420823374361, + "grad_norm": 0.5707644820213318, + "learning_rate": 2.7607229534957984e-05, + "loss": 0.9153, + "step": 7368 + }, + { + "epoch": 0.658431433868698, + "grad_norm": 0.39793646335601807, + "learning_rate": 2.7594292118146137e-05, + "loss": 0.9331, + "step": 7369 + }, + { + "epoch": 0.6585207853999598, + "grad_norm": 0.4051116108894348, + "learning_rate": 2.7581356578051143e-05, + "loss": 0.9709, + "step": 7370 + }, + { + "epoch": 0.6586101369312216, + "grad_norm": 0.5276316404342651, + "learning_rate": 2.756842291575651e-05, + "loss": 0.9835, + "step": 7371 + }, + { + "epoch": 0.6586994884624835, + "grad_norm": 0.4137079417705536, + "learning_rate": 2.7555491132345557e-05, + "loss": 0.9525, + "step": 7372 + }, + { + "epoch": 0.6587888399937454, + "grad_norm": 0.5179648995399475, + "learning_rate": 2.7542561228901485e-05, + "loss": 1.0175, + "step": 7373 + }, + { + "epoch": 0.6588781915250073, + "grad_norm": 0.5228958129882812, + "learning_rate": 2.752963320650727e-05, + "loss": 0.8682, + "step": 7374 + }, + { + "epoch": 0.6589675430562691, + "grad_norm": 0.49394968152046204, + "learning_rate": 2.7516707066245796e-05, + "loss": 0.9222, + "step": 7375 + }, + { + "epoch": 0.659056894587531, + "grad_norm": 0.5419045686721802, + "learning_rate": 2.7503782809199753e-05, + "loss": 0.8827, + "step": 7376 + }, + { + "epoch": 0.6591462461187929, + "grad_norm": 0.4782697260379791, + "learning_rate": 2.7490860436451692e-05, + "loss": 0.9742, + "step": 7377 + }, + { + "epoch": 0.6592355976500547, + "grad_norm": 0.521144688129425, + "learning_rate": 2.7477939949084e-05, + "loss": 0.8683, + "step": 7378 + }, + { + "epoch": 0.6593249491813166, + "grad_norm": 0.4898735284805298, + "learning_rate": 2.7465021348178903e-05, + "loss": 0.9738, + "step": 7379 + }, + { + "epoch": 0.6594143007125784, + "grad_norm": 0.5147672891616821, + "learning_rate": 2.7452104634818497e-05, + "loss": 0.8741, + "step": 7380 + }, + { + "epoch": 0.6595036522438403, + "grad_norm": 0.5325078368186951, + "learning_rate": 2.7439189810084655e-05, + "loss": 0.9312, + "step": 7381 + }, + { + "epoch": 0.6595930037751022, + "grad_norm": 0.45045092701911926, + "learning_rate": 2.7426276875059143e-05, + "loss": 0.9429, + "step": 7382 + }, + { + "epoch": 0.6596823553063641, + "grad_norm": 0.45888474583625793, + "learning_rate": 2.7413365830823557e-05, + "loss": 0.9197, + "step": 7383 + }, + { + "epoch": 0.659771706837626, + "grad_norm": 0.4505671262741089, + "learning_rate": 2.7400456678459363e-05, + "loss": 0.8995, + "step": 7384 + }, + { + "epoch": 0.6598610583688878, + "grad_norm": 0.6128329634666443, + "learning_rate": 2.7387549419047798e-05, + "loss": 0.895, + "step": 7385 + }, + { + "epoch": 0.6599504099001496, + "grad_norm": 0.44439369440078735, + "learning_rate": 2.7374644053669997e-05, + "loss": 0.9927, + "step": 7386 + }, + { + "epoch": 0.6600397614314115, + "grad_norm": 0.520553469657898, + "learning_rate": 2.7361740583406924e-05, + "loss": 0.93, + "step": 7387 + }, + { + "epoch": 0.6601291129626734, + "grad_norm": 0.5277306437492371, + "learning_rate": 2.734883900933939e-05, + "loss": 0.9059, + "step": 7388 + }, + { + "epoch": 0.6602184644939353, + "grad_norm": 0.5034501552581787, + "learning_rate": 2.7335939332548032e-05, + "loss": 0.9507, + "step": 7389 + }, + { + "epoch": 0.6603078160251972, + "grad_norm": 0.5344181656837463, + "learning_rate": 2.7323041554113333e-05, + "loss": 0.834, + "step": 7390 + }, + { + "epoch": 0.660397167556459, + "grad_norm": 0.5650686621665955, + "learning_rate": 2.731014567511562e-05, + "loss": 0.9379, + "step": 7391 + }, + { + "epoch": 0.6604865190877208, + "grad_norm": 0.5536824464797974, + "learning_rate": 2.7297251696635074e-05, + "loss": 0.9655, + "step": 7392 + }, + { + "epoch": 0.6605758706189827, + "grad_norm": 0.6149418950080872, + "learning_rate": 2.7284359619751704e-05, + "loss": 0.9708, + "step": 7393 + }, + { + "epoch": 0.6606652221502446, + "grad_norm": 0.5840936303138733, + "learning_rate": 2.7271469445545327e-05, + "loss": 0.9482, + "step": 7394 + }, + { + "epoch": 0.6607545736815065, + "grad_norm": 0.42912179231643677, + "learning_rate": 2.7258581175095654e-05, + "loss": 0.9896, + "step": 7395 + }, + { + "epoch": 0.6608439252127684, + "grad_norm": 0.44300276041030884, + "learning_rate": 2.7245694809482214e-05, + "loss": 0.9567, + "step": 7396 + }, + { + "epoch": 0.6609332767440302, + "grad_norm": 0.482937753200531, + "learning_rate": 2.7232810349784375e-05, + "loss": 0.945, + "step": 7397 + }, + { + "epoch": 0.661022628275292, + "grad_norm": 0.40860891342163086, + "learning_rate": 2.721992779708136e-05, + "loss": 0.914, + "step": 7398 + }, + { + "epoch": 0.6611119798065539, + "grad_norm": 0.4908876121044159, + "learning_rate": 2.7207047152452196e-05, + "loss": 0.9459, + "step": 7399 + }, + { + "epoch": 0.6612013313378158, + "grad_norm": 0.475569486618042, + "learning_rate": 2.7194168416975797e-05, + "loss": 0.9524, + "step": 7400 + }, + { + "epoch": 0.6612906828690777, + "grad_norm": 0.44693323969841003, + "learning_rate": 2.7181291591730883e-05, + "loss": 0.886, + "step": 7401 + }, + { + "epoch": 0.6613800344003395, + "grad_norm": 0.613734781742096, + "learning_rate": 2.7168416677796028e-05, + "loss": 0.9082, + "step": 7402 + }, + { + "epoch": 0.6614693859316014, + "grad_norm": 0.43988481163978577, + "learning_rate": 2.715554367624966e-05, + "loss": 0.9724, + "step": 7403 + }, + { + "epoch": 0.6615587374628633, + "grad_norm": 0.37471234798431396, + "learning_rate": 2.7142672588170002e-05, + "loss": 0.9339, + "step": 7404 + }, + { + "epoch": 0.6616480889941251, + "grad_norm": 0.45363959670066833, + "learning_rate": 2.712980341463515e-05, + "loss": 0.9341, + "step": 7405 + }, + { + "epoch": 0.661737440525387, + "grad_norm": 0.5071055293083191, + "learning_rate": 2.711693615672305e-05, + "loss": 0.9066, + "step": 7406 + }, + { + "epoch": 0.6618267920566488, + "grad_norm": 0.4539679288864136, + "learning_rate": 2.710407081551145e-05, + "loss": 0.975, + "step": 7407 + }, + { + "epoch": 0.6619161435879107, + "grad_norm": 0.5037643909454346, + "learning_rate": 2.7091207392077977e-05, + "loss": 0.918, + "step": 7408 + }, + { + "epoch": 0.6620054951191726, + "grad_norm": 0.47487950325012207, + "learning_rate": 2.707834588750008e-05, + "loss": 0.8837, + "step": 7409 + }, + { + "epoch": 0.6620948466504345, + "grad_norm": 0.40642839670181274, + "learning_rate": 2.7065486302855037e-05, + "loss": 0.9693, + "step": 7410 + }, + { + "epoch": 0.6621841981816964, + "grad_norm": 0.4429019093513489, + "learning_rate": 2.705262863921998e-05, + "loss": 1.0066, + "step": 7411 + }, + { + "epoch": 0.6622735497129582, + "grad_norm": 0.5285657644271851, + "learning_rate": 2.703977289767188e-05, + "loss": 0.925, + "step": 7412 + }, + { + "epoch": 0.66236290124422, + "grad_norm": 0.533004105091095, + "learning_rate": 2.7026919079287555e-05, + "loss": 0.951, + "step": 7413 + }, + { + "epoch": 0.6624522527754819, + "grad_norm": 0.5454312562942505, + "learning_rate": 2.701406718514361e-05, + "loss": 0.96, + "step": 7414 + }, + { + "epoch": 0.6625416043067438, + "grad_norm": 0.4193102717399597, + "learning_rate": 2.7001217216316553e-05, + "loss": 0.9904, + "step": 7415 + }, + { + "epoch": 0.6626309558380057, + "grad_norm": 0.43523266911506653, + "learning_rate": 2.698836917388271e-05, + "loss": 0.9572, + "step": 7416 + }, + { + "epoch": 0.6627203073692676, + "grad_norm": 0.4869917035102844, + "learning_rate": 2.6975523058918252e-05, + "loss": 1.0183, + "step": 7417 + }, + { + "epoch": 0.6628096589005295, + "grad_norm": 0.4953290820121765, + "learning_rate": 2.6962678872499137e-05, + "loss": 0.8993, + "step": 7418 + }, + { + "epoch": 0.6628990104317912, + "grad_norm": 0.5153881907463074, + "learning_rate": 2.6949836615701225e-05, + "loss": 0.9066, + "step": 7419 + }, + { + "epoch": 0.6629883619630531, + "grad_norm": 0.42058584094047546, + "learning_rate": 2.6936996289600198e-05, + "loss": 0.989, + "step": 7420 + }, + { + "epoch": 0.663077713494315, + "grad_norm": 0.4563521146774292, + "learning_rate": 2.6924157895271563e-05, + "loss": 0.9595, + "step": 7421 + }, + { + "epoch": 0.6631670650255769, + "grad_norm": 0.5149219036102295, + "learning_rate": 2.6911321433790677e-05, + "loss": 0.9232, + "step": 7422 + }, + { + "epoch": 0.6632564165568388, + "grad_norm": 0.478681743144989, + "learning_rate": 2.6898486906232746e-05, + "loss": 0.9574, + "step": 7423 + }, + { + "epoch": 0.6633457680881006, + "grad_norm": 0.4329462945461273, + "learning_rate": 2.6885654313672763e-05, + "loss": 0.9712, + "step": 7424 + }, + { + "epoch": 0.6634351196193625, + "grad_norm": 0.5026130676269531, + "learning_rate": 2.6872823657185614e-05, + "loss": 0.9287, + "step": 7425 + }, + { + "epoch": 0.6635244711506243, + "grad_norm": 0.49470841884613037, + "learning_rate": 2.6859994937846e-05, + "loss": 0.876, + "step": 7426 + }, + { + "epoch": 0.6636138226818862, + "grad_norm": 0.49729084968566895, + "learning_rate": 2.6847168156728463e-05, + "loss": 0.8896, + "step": 7427 + }, + { + "epoch": 0.6637031742131481, + "grad_norm": 0.4407115876674652, + "learning_rate": 2.6834343314907384e-05, + "loss": 0.9779, + "step": 7428 + }, + { + "epoch": 0.66379252574441, + "grad_norm": 0.3972383737564087, + "learning_rate": 2.682152041345699e-05, + "loss": 0.9496, + "step": 7429 + }, + { + "epoch": 0.6638818772756718, + "grad_norm": 0.42927953600883484, + "learning_rate": 2.6808699453451313e-05, + "loss": 0.9761, + "step": 7430 + }, + { + "epoch": 0.6639712288069337, + "grad_norm": 0.37218815088272095, + "learning_rate": 2.679588043596427e-05, + "loss": 0.9326, + "step": 7431 + }, + { + "epoch": 0.6640605803381956, + "grad_norm": 0.513047456741333, + "learning_rate": 2.6783063362069573e-05, + "loss": 0.9675, + "step": 7432 + }, + { + "epoch": 0.6641499318694574, + "grad_norm": 0.4514136016368866, + "learning_rate": 2.677024823284081e-05, + "loss": 0.9793, + "step": 7433 + }, + { + "epoch": 0.6642392834007192, + "grad_norm": 0.5317848324775696, + "learning_rate": 2.6757435049351353e-05, + "loss": 0.8597, + "step": 7434 + }, + { + "epoch": 0.6643286349319811, + "grad_norm": 0.6172546148300171, + "learning_rate": 2.6744623812674463e-05, + "loss": 0.8547, + "step": 7435 + }, + { + "epoch": 0.664417986463243, + "grad_norm": 0.45025524497032166, + "learning_rate": 2.6731814523883202e-05, + "loss": 0.9209, + "step": 7436 + }, + { + "epoch": 0.6645073379945049, + "grad_norm": 0.4900471866130829, + "learning_rate": 2.6719007184050504e-05, + "loss": 0.8974, + "step": 7437 + }, + { + "epoch": 0.6645966895257668, + "grad_norm": 0.4560643434524536, + "learning_rate": 2.6706201794249108e-05, + "loss": 0.9878, + "step": 7438 + }, + { + "epoch": 0.6646860410570287, + "grad_norm": 0.41704943776130676, + "learning_rate": 2.6693398355551613e-05, + "loss": 1.011, + "step": 7439 + }, + { + "epoch": 0.6647753925882904, + "grad_norm": 0.6909659504890442, + "learning_rate": 2.668059686903043e-05, + "loss": 0.905, + "step": 7440 + }, + { + "epoch": 0.6648647441195523, + "grad_norm": 0.4692802429199219, + "learning_rate": 2.6667797335757827e-05, + "loss": 0.957, + "step": 7441 + }, + { + "epoch": 0.6649540956508142, + "grad_norm": 0.47277992963790894, + "learning_rate": 2.66549997568059e-05, + "loss": 0.9593, + "step": 7442 + }, + { + "epoch": 0.6650434471820761, + "grad_norm": 0.4334234595298767, + "learning_rate": 2.6642204133246605e-05, + "loss": 0.9594, + "step": 7443 + }, + { + "epoch": 0.665132798713338, + "grad_norm": 0.43913981318473816, + "learning_rate": 2.662941046615167e-05, + "loss": 0.9695, + "step": 7444 + }, + { + "epoch": 0.6652221502445999, + "grad_norm": 0.43409544229507446, + "learning_rate": 2.661661875659272e-05, + "loss": 0.9474, + "step": 7445 + }, + { + "epoch": 0.6653115017758617, + "grad_norm": 0.5566879510879517, + "learning_rate": 2.6603829005641202e-05, + "loss": 0.9446, + "step": 7446 + }, + { + "epoch": 0.6654008533071235, + "grad_norm": 0.503091037273407, + "learning_rate": 2.6591041214368385e-05, + "loss": 0.8959, + "step": 7447 + }, + { + "epoch": 0.6654902048383854, + "grad_norm": 0.4617300033569336, + "learning_rate": 2.6578255383845384e-05, + "loss": 1.0043, + "step": 7448 + }, + { + "epoch": 0.6655795563696473, + "grad_norm": 0.5025232434272766, + "learning_rate": 2.6565471515143157e-05, + "loss": 0.9259, + "step": 7449 + }, + { + "epoch": 0.6656689079009092, + "grad_norm": 0.5556004047393799, + "learning_rate": 2.6552689609332504e-05, + "loss": 0.9494, + "step": 7450 + }, + { + "epoch": 0.665758259432171, + "grad_norm": 0.5287407040596008, + "learning_rate": 2.653990966748401e-05, + "loss": 0.9567, + "step": 7451 + }, + { + "epoch": 0.6658476109634329, + "grad_norm": 0.5063120126724243, + "learning_rate": 2.652713169066815e-05, + "loss": 0.8983, + "step": 7452 + }, + { + "epoch": 0.6659369624946948, + "grad_norm": 0.4847102463245392, + "learning_rate": 2.6514355679955205e-05, + "loss": 0.9043, + "step": 7453 + }, + { + "epoch": 0.6660263140259566, + "grad_norm": 0.4578264355659485, + "learning_rate": 2.650158163641534e-05, + "loss": 0.974, + "step": 7454 + }, + { + "epoch": 0.6661156655572185, + "grad_norm": 0.481656014919281, + "learning_rate": 2.648880956111846e-05, + "loss": 0.8898, + "step": 7455 + }, + { + "epoch": 0.6662050170884803, + "grad_norm": 0.47429630160331726, + "learning_rate": 2.6476039455134393e-05, + "loss": 0.972, + "step": 7456 + }, + { + "epoch": 0.6662943686197422, + "grad_norm": 0.48434558510780334, + "learning_rate": 2.6463271319532766e-05, + "loss": 1.0256, + "step": 7457 + }, + { + "epoch": 0.6663837201510041, + "grad_norm": 0.5073295831680298, + "learning_rate": 2.645050515538306e-05, + "loss": 0.8717, + "step": 7458 + }, + { + "epoch": 0.666473071682266, + "grad_norm": 0.5684570670127869, + "learning_rate": 2.643774096375456e-05, + "loss": 0.8968, + "step": 7459 + }, + { + "epoch": 0.6665624232135279, + "grad_norm": 0.44450321793556213, + "learning_rate": 2.642497874571641e-05, + "loss": 0.9466, + "step": 7460 + }, + { + "epoch": 0.6666517747447896, + "grad_norm": 0.40622055530548096, + "learning_rate": 2.6412218502337582e-05, + "loss": 0.9863, + "step": 7461 + }, + { + "epoch": 0.6667411262760515, + "grad_norm": 0.42449328303337097, + "learning_rate": 2.6399460234686877e-05, + "loss": 0.935, + "step": 7462 + }, + { + "epoch": 0.6668304778073134, + "grad_norm": 0.5839447975158691, + "learning_rate": 2.6386703943832947e-05, + "loss": 0.9843, + "step": 7463 + }, + { + "epoch": 0.6669198293385753, + "grad_norm": 0.5379756093025208, + "learning_rate": 2.6373949630844287e-05, + "loss": 0.88, + "step": 7464 + }, + { + "epoch": 0.6670091808698372, + "grad_norm": 0.4397304058074951, + "learning_rate": 2.6361197296789153e-05, + "loss": 0.934, + "step": 7465 + }, + { + "epoch": 0.6670985324010991, + "grad_norm": 0.5913897156715393, + "learning_rate": 2.6348446942735716e-05, + "loss": 0.9605, + "step": 7466 + }, + { + "epoch": 0.6671878839323608, + "grad_norm": 0.4340857267379761, + "learning_rate": 2.6335698569751956e-05, + "loss": 0.9753, + "step": 7467 + }, + { + "epoch": 0.6672772354636227, + "grad_norm": 0.41628775000572205, + "learning_rate": 2.6322952178905692e-05, + "loss": 0.9943, + "step": 7468 + }, + { + "epoch": 0.6673665869948846, + "grad_norm": 0.5939970016479492, + "learning_rate": 2.631020777126455e-05, + "loss": 0.8947, + "step": 7469 + }, + { + "epoch": 0.6674559385261465, + "grad_norm": 0.413511723279953, + "learning_rate": 2.6297465347896026e-05, + "loss": 0.9991, + "step": 7470 + }, + { + "epoch": 0.6675452900574084, + "grad_norm": 0.48279207944869995, + "learning_rate": 2.6284724909867432e-05, + "loss": 0.9453, + "step": 7471 + }, + { + "epoch": 0.6676346415886703, + "grad_norm": 0.4602436125278473, + "learning_rate": 2.6271986458245912e-05, + "loss": 0.9822, + "step": 7472 + }, + { + "epoch": 0.6677239931199321, + "grad_norm": 0.4432784616947174, + "learning_rate": 2.6259249994098455e-05, + "loss": 0.9506, + "step": 7473 + }, + { + "epoch": 0.6678133446511939, + "grad_norm": 0.5207743644714355, + "learning_rate": 2.624651551849188e-05, + "loss": 0.8715, + "step": 7474 + }, + { + "epoch": 0.6679026961824558, + "grad_norm": 0.49934712052345276, + "learning_rate": 2.623378303249281e-05, + "loss": 0.9471, + "step": 7475 + }, + { + "epoch": 0.6679920477137177, + "grad_norm": 0.5130594968795776, + "learning_rate": 2.622105253716774e-05, + "loss": 0.9037, + "step": 7476 + }, + { + "epoch": 0.6680813992449796, + "grad_norm": 0.538914680480957, + "learning_rate": 2.6208324033582986e-05, + "loss": 0.9393, + "step": 7477 + }, + { + "epoch": 0.6681707507762414, + "grad_norm": 0.5530039668083191, + "learning_rate": 2.6195597522804692e-05, + "loss": 0.8773, + "step": 7478 + }, + { + "epoch": 0.6682601023075033, + "grad_norm": 0.4622054696083069, + "learning_rate": 2.6182873005898845e-05, + "loss": 0.9016, + "step": 7479 + }, + { + "epoch": 0.6683494538387652, + "grad_norm": 0.43117639422416687, + "learning_rate": 2.6170150483931257e-05, + "loss": 0.9483, + "step": 7480 + }, + { + "epoch": 0.668438805370027, + "grad_norm": 0.45902514457702637, + "learning_rate": 2.6157429957967566e-05, + "loss": 0.9158, + "step": 7481 + }, + { + "epoch": 0.6685281569012889, + "grad_norm": 0.6419569849967957, + "learning_rate": 2.6144711429073265e-05, + "loss": 0.913, + "step": 7482 + }, + { + "epoch": 0.6686175084325507, + "grad_norm": 0.38565734028816223, + "learning_rate": 2.6131994898313684e-05, + "loss": 0.9589, + "step": 7483 + }, + { + "epoch": 0.6687068599638126, + "grad_norm": 0.43412715196609497, + "learning_rate": 2.6119280366753917e-05, + "loss": 0.9686, + "step": 7484 + }, + { + "epoch": 0.6687962114950745, + "grad_norm": 0.4611121714115143, + "learning_rate": 2.610656783545898e-05, + "loss": 1.0003, + "step": 7485 + }, + { + "epoch": 0.6688855630263364, + "grad_norm": 0.4799754321575165, + "learning_rate": 2.6093857305493664e-05, + "loss": 0.969, + "step": 7486 + }, + { + "epoch": 0.6689749145575983, + "grad_norm": 0.556797444820404, + "learning_rate": 2.6081148777922643e-05, + "loss": 0.9443, + "step": 7487 + }, + { + "epoch": 0.66906426608886, + "grad_norm": 0.4632977247238159, + "learning_rate": 2.606844225381035e-05, + "loss": 0.9519, + "step": 7488 + }, + { + "epoch": 0.6691536176201219, + "grad_norm": 0.43375352025032043, + "learning_rate": 2.6055737734221108e-05, + "loss": 0.9369, + "step": 7489 + }, + { + "epoch": 0.6692429691513838, + "grad_norm": 0.48179739713668823, + "learning_rate": 2.604303522021906e-05, + "loss": 0.8513, + "step": 7490 + }, + { + "epoch": 0.6693323206826457, + "grad_norm": 0.45359307527542114, + "learning_rate": 2.6030334712868177e-05, + "loss": 1.0067, + "step": 7491 + }, + { + "epoch": 0.6694216722139076, + "grad_norm": 0.4388836622238159, + "learning_rate": 2.6017636213232255e-05, + "loss": 0.9204, + "step": 7492 + }, + { + "epoch": 0.6695110237451695, + "grad_norm": 0.44517624378204346, + "learning_rate": 2.600493972237493e-05, + "loss": 0.955, + "step": 7493 + }, + { + "epoch": 0.6696003752764313, + "grad_norm": 0.5145146250724792, + "learning_rate": 2.5992245241359702e-05, + "loss": 0.8947, + "step": 7494 + }, + { + "epoch": 0.6696897268076931, + "grad_norm": 0.6348103880882263, + "learning_rate": 2.5979552771249814e-05, + "loss": 0.975, + "step": 7495 + }, + { + "epoch": 0.669779078338955, + "grad_norm": 0.4274086654186249, + "learning_rate": 2.596686231310842e-05, + "loss": 0.9465, + "step": 7496 + }, + { + "epoch": 0.6698684298702169, + "grad_norm": 0.6270675659179688, + "learning_rate": 2.595417386799849e-05, + "loss": 0.8768, + "step": 7497 + }, + { + "epoch": 0.6699577814014788, + "grad_norm": 0.5795266032218933, + "learning_rate": 2.5941487436982803e-05, + "loss": 0.8254, + "step": 7498 + }, + { + "epoch": 0.6700471329327407, + "grad_norm": 0.4406552016735077, + "learning_rate": 2.592880302112399e-05, + "loss": 0.9277, + "step": 7499 + }, + { + "epoch": 0.6701364844640025, + "grad_norm": 0.4358079135417938, + "learning_rate": 2.5916120621484498e-05, + "loss": 0.9509, + "step": 7500 + }, + { + "epoch": 0.6702258359952644, + "grad_norm": 0.5085236430168152, + "learning_rate": 2.590344023912663e-05, + "loss": 0.8566, + "step": 7501 + }, + { + "epoch": 0.6703151875265262, + "grad_norm": 0.5449510812759399, + "learning_rate": 2.5890761875112485e-05, + "loss": 0.9698, + "step": 7502 + }, + { + "epoch": 0.6704045390577881, + "grad_norm": 0.49688923358917236, + "learning_rate": 2.587808553050402e-05, + "loss": 0.9613, + "step": 7503 + }, + { + "epoch": 0.67049389058905, + "grad_norm": 0.43229207396507263, + "learning_rate": 2.586541120636303e-05, + "loss": 0.9708, + "step": 7504 + }, + { + "epoch": 0.6705832421203118, + "grad_norm": 0.4369189739227295, + "learning_rate": 2.5852738903751095e-05, + "loss": 0.9745, + "step": 7505 + }, + { + "epoch": 0.6706725936515737, + "grad_norm": 0.42944756150245667, + "learning_rate": 2.5840068623729668e-05, + "loss": 0.9261, + "step": 7506 + }, + { + "epoch": 0.6707619451828356, + "grad_norm": 0.4990726113319397, + "learning_rate": 2.5827400367360015e-05, + "loss": 0.975, + "step": 7507 + }, + { + "epoch": 0.6708512967140975, + "grad_norm": 0.4963349401950836, + "learning_rate": 2.5814734135703245e-05, + "loss": 0.9149, + "step": 7508 + }, + { + "epoch": 0.6709406482453593, + "grad_norm": 0.4562934339046478, + "learning_rate": 2.5802069929820294e-05, + "loss": 1.0564, + "step": 7509 + }, + { + "epoch": 0.6710299997766211, + "grad_norm": 0.4936973750591278, + "learning_rate": 2.578940775077191e-05, + "loss": 0.921, + "step": 7510 + }, + { + "epoch": 0.671119351307883, + "grad_norm": 0.4688092768192291, + "learning_rate": 2.5776747599618688e-05, + "loss": 0.8868, + "step": 7511 + }, + { + "epoch": 0.6712087028391449, + "grad_norm": 0.525898277759552, + "learning_rate": 2.5764089477421067e-05, + "loss": 0.8827, + "step": 7512 + }, + { + "epoch": 0.6712980543704068, + "grad_norm": 0.5553557872772217, + "learning_rate": 2.5751433385239288e-05, + "loss": 0.9034, + "step": 7513 + }, + { + "epoch": 0.6713874059016687, + "grad_norm": 0.5192905068397522, + "learning_rate": 2.5738779324133445e-05, + "loss": 0.9762, + "step": 7514 + }, + { + "epoch": 0.6714767574329306, + "grad_norm": 0.45627352595329285, + "learning_rate": 2.5726127295163428e-05, + "loss": 0.9173, + "step": 7515 + }, + { + "epoch": 0.6715661089641923, + "grad_norm": 0.5443771481513977, + "learning_rate": 2.5713477299388987e-05, + "loss": 0.9714, + "step": 7516 + }, + { + "epoch": 0.6716554604954542, + "grad_norm": 0.4559047520160675, + "learning_rate": 2.57008293378697e-05, + "loss": 0.9483, + "step": 7517 + }, + { + "epoch": 0.6717448120267161, + "grad_norm": 0.47910940647125244, + "learning_rate": 2.568818341166496e-05, + "loss": 0.9451, + "step": 7518 + }, + { + "epoch": 0.671834163557978, + "grad_norm": 0.4128173589706421, + "learning_rate": 2.5675539521834012e-05, + "loss": 0.9631, + "step": 7519 + }, + { + "epoch": 0.6719235150892399, + "grad_norm": 0.424180805683136, + "learning_rate": 2.5662897669435925e-05, + "loss": 0.9432, + "step": 7520 + }, + { + "epoch": 0.6720128666205017, + "grad_norm": 0.4650351405143738, + "learning_rate": 2.5650257855529558e-05, + "loss": 0.9263, + "step": 7521 + }, + { + "epoch": 0.6721022181517636, + "grad_norm": 0.5273159742355347, + "learning_rate": 2.5637620081173642e-05, + "loss": 0.8891, + "step": 7522 + }, + { + "epoch": 0.6721915696830254, + "grad_norm": 0.43151742219924927, + "learning_rate": 2.5624984347426727e-05, + "loss": 0.9896, + "step": 7523 + }, + { + "epoch": 0.6722809212142873, + "grad_norm": 0.4380139112472534, + "learning_rate": 2.5612350655347195e-05, + "loss": 1.0411, + "step": 7524 + }, + { + "epoch": 0.6723702727455492, + "grad_norm": 0.49319833517074585, + "learning_rate": 2.559971900599326e-05, + "loss": 0.8419, + "step": 7525 + }, + { + "epoch": 0.672459624276811, + "grad_norm": 0.44189539551734924, + "learning_rate": 2.5587089400422938e-05, + "loss": 0.9321, + "step": 7526 + }, + { + "epoch": 0.6725489758080729, + "grad_norm": 0.4883861243724823, + "learning_rate": 2.55744618396941e-05, + "loss": 0.9207, + "step": 7527 + }, + { + "epoch": 0.6726383273393348, + "grad_norm": 0.46611472964286804, + "learning_rate": 2.5561836324864442e-05, + "loss": 0.8813, + "step": 7528 + }, + { + "epoch": 0.6727276788705966, + "grad_norm": 0.49315086007118225, + "learning_rate": 2.554921285699148e-05, + "loss": 0.8707, + "step": 7529 + }, + { + "epoch": 0.6728170304018585, + "grad_norm": 0.47379156947135925, + "learning_rate": 2.5536591437132563e-05, + "loss": 0.9026, + "step": 7530 + }, + { + "epoch": 0.6729063819331204, + "grad_norm": 0.4609242081642151, + "learning_rate": 2.552397206634488e-05, + "loss": 0.9346, + "step": 7531 + }, + { + "epoch": 0.6729957334643822, + "grad_norm": 0.4267633855342865, + "learning_rate": 2.5511354745685433e-05, + "loss": 0.9378, + "step": 7532 + }, + { + "epoch": 0.6730850849956441, + "grad_norm": 0.4892594516277313, + "learning_rate": 2.5498739476211054e-05, + "loss": 0.9893, + "step": 7533 + }, + { + "epoch": 0.673174436526906, + "grad_norm": 0.6081541180610657, + "learning_rate": 2.5486126258978427e-05, + "loss": 0.9632, + "step": 7534 + }, + { + "epoch": 0.6732637880581679, + "grad_norm": 0.49728408455848694, + "learning_rate": 2.547351509504401e-05, + "loss": 0.9476, + "step": 7535 + }, + { + "epoch": 0.6733531395894297, + "grad_norm": 0.44689908623695374, + "learning_rate": 2.5460905985464134e-05, + "loss": 1.0134, + "step": 7536 + }, + { + "epoch": 0.6734424911206915, + "grad_norm": 0.5165348052978516, + "learning_rate": 2.544829893129495e-05, + "loss": 0.9183, + "step": 7537 + }, + { + "epoch": 0.6735318426519534, + "grad_norm": 0.45430701971054077, + "learning_rate": 2.5435693933592432e-05, + "loss": 0.9704, + "step": 7538 + }, + { + "epoch": 0.6736211941832153, + "grad_norm": 0.4814325273036957, + "learning_rate": 2.5423090993412383e-05, + "loss": 0.9752, + "step": 7539 + }, + { + "epoch": 0.6737105457144772, + "grad_norm": 0.5146889090538025, + "learning_rate": 2.5410490111810435e-05, + "loss": 0.9822, + "step": 7540 + }, + { + "epoch": 0.6737998972457391, + "grad_norm": 0.47414878010749817, + "learning_rate": 2.5397891289842052e-05, + "loss": 1.0163, + "step": 7541 + }, + { + "epoch": 0.673889248777001, + "grad_norm": 0.4566519558429718, + "learning_rate": 2.5385294528562507e-05, + "loss": 0.9218, + "step": 7542 + }, + { + "epoch": 0.6739786003082627, + "grad_norm": 0.7057138681411743, + "learning_rate": 2.537269982902692e-05, + "loss": 0.8424, + "step": 7543 + }, + { + "epoch": 0.6740679518395246, + "grad_norm": 0.5519537329673767, + "learning_rate": 2.536010719229023e-05, + "loss": 0.8996, + "step": 7544 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.46758851408958435, + "learning_rate": 2.5347516619407223e-05, + "loss": 0.8998, + "step": 7545 + }, + { + "epoch": 0.6742466549020484, + "grad_norm": 0.452288955450058, + "learning_rate": 2.533492811143246e-05, + "loss": 0.8949, + "step": 7546 + }, + { + "epoch": 0.6743360064333103, + "grad_norm": 0.46302369236946106, + "learning_rate": 2.532234166942038e-05, + "loss": 0.9162, + "step": 7547 + }, + { + "epoch": 0.6744253579645721, + "grad_norm": 0.46767961978912354, + "learning_rate": 2.5309757294425222e-05, + "loss": 0.9122, + "step": 7548 + }, + { + "epoch": 0.674514709495834, + "grad_norm": 0.42991432547569275, + "learning_rate": 2.5297174987501077e-05, + "loss": 1.0339, + "step": 7549 + }, + { + "epoch": 0.6746040610270958, + "grad_norm": 0.40119388699531555, + "learning_rate": 2.528459474970184e-05, + "loss": 1.0173, + "step": 7550 + }, + { + "epoch": 0.6746934125583577, + "grad_norm": 0.4925364553928375, + "learning_rate": 2.5272016582081236e-05, + "loss": 0.9063, + "step": 7551 + }, + { + "epoch": 0.6747827640896196, + "grad_norm": 0.4693504273891449, + "learning_rate": 2.525944048569282e-05, + "loss": 0.9513, + "step": 7552 + }, + { + "epoch": 0.6748721156208815, + "grad_norm": 0.4908875524997711, + "learning_rate": 2.524686646159001e-05, + "loss": 0.9932, + "step": 7553 + }, + { + "epoch": 0.6749614671521433, + "grad_norm": 0.5330108404159546, + "learning_rate": 2.5234294510825957e-05, + "loss": 0.925, + "step": 7554 + }, + { + "epoch": 0.6750508186834052, + "grad_norm": 0.41894859075546265, + "learning_rate": 2.5221724634453724e-05, + "loss": 0.9464, + "step": 7555 + }, + { + "epoch": 0.6751401702146671, + "grad_norm": 0.4790763556957245, + "learning_rate": 2.5209156833526172e-05, + "loss": 0.9183, + "step": 7556 + }, + { + "epoch": 0.6752295217459289, + "grad_norm": 0.6181892156600952, + "learning_rate": 2.5196591109096e-05, + "loss": 0.8195, + "step": 7557 + }, + { + "epoch": 0.6753188732771908, + "grad_norm": 0.4377688765525818, + "learning_rate": 2.5184027462215686e-05, + "loss": 0.9403, + "step": 7558 + }, + { + "epoch": 0.6754082248084526, + "grad_norm": 0.4455429017543793, + "learning_rate": 2.5171465893937602e-05, + "loss": 0.9458, + "step": 7559 + }, + { + "epoch": 0.6754975763397145, + "grad_norm": 0.451734334230423, + "learning_rate": 2.51589064053139e-05, + "loss": 0.9318, + "step": 7560 + }, + { + "epoch": 0.6755869278709764, + "grad_norm": 0.39565378427505493, + "learning_rate": 2.5146348997396567e-05, + "loss": 1.0195, + "step": 7561 + }, + { + "epoch": 0.6756762794022383, + "grad_norm": 0.5276391506195068, + "learning_rate": 2.5133793671237433e-05, + "loss": 0.9303, + "step": 7562 + }, + { + "epoch": 0.6757656309335002, + "grad_norm": 0.5773701667785645, + "learning_rate": 2.512124042788813e-05, + "loss": 0.9137, + "step": 7563 + }, + { + "epoch": 0.6758549824647619, + "grad_norm": 0.45947515964508057, + "learning_rate": 2.5108689268400132e-05, + "loss": 0.933, + "step": 7564 + }, + { + "epoch": 0.6759443339960238, + "grad_norm": 0.4164651930332184, + "learning_rate": 2.5096140193824748e-05, + "loss": 0.9255, + "step": 7565 + }, + { + "epoch": 0.6760336855272857, + "grad_norm": 0.5537623763084412, + "learning_rate": 2.5083593205213063e-05, + "loss": 0.8421, + "step": 7566 + }, + { + "epoch": 0.6761230370585476, + "grad_norm": 0.412296861410141, + "learning_rate": 2.5071048303616028e-05, + "loss": 0.9733, + "step": 7567 + }, + { + "epoch": 0.6762123885898095, + "grad_norm": 0.6003457307815552, + "learning_rate": 2.5058505490084428e-05, + "loss": 0.8458, + "step": 7568 + }, + { + "epoch": 0.6763017401210714, + "grad_norm": 0.4704027771949768, + "learning_rate": 2.504596476566885e-05, + "loss": 0.9236, + "step": 7569 + }, + { + "epoch": 0.6763910916523332, + "grad_norm": 0.40603572130203247, + "learning_rate": 2.5033426131419714e-05, + "loss": 0.9337, + "step": 7570 + }, + { + "epoch": 0.676480443183595, + "grad_norm": 0.433475136756897, + "learning_rate": 2.5020889588387266e-05, + "loss": 0.9907, + "step": 7571 + }, + { + "epoch": 0.6765697947148569, + "grad_norm": 0.44331127405166626, + "learning_rate": 2.500835513762157e-05, + "loss": 0.9091, + "step": 7572 + }, + { + "epoch": 0.6766591462461188, + "grad_norm": 0.43762436509132385, + "learning_rate": 2.4995822780172522e-05, + "loss": 0.9437, + "step": 7573 + }, + { + "epoch": 0.6767484977773807, + "grad_norm": 0.4718449115753174, + "learning_rate": 2.4983292517089846e-05, + "loss": 0.9747, + "step": 7574 + }, + { + "epoch": 0.6768378493086425, + "grad_norm": 0.48513391613960266, + "learning_rate": 2.4970764349423093e-05, + "loss": 0.9207, + "step": 7575 + }, + { + "epoch": 0.6769272008399044, + "grad_norm": 0.5163725018501282, + "learning_rate": 2.49582382782216e-05, + "loss": 0.915, + "step": 7576 + }, + { + "epoch": 0.6770165523711663, + "grad_norm": 0.592780351638794, + "learning_rate": 2.4945714304534585e-05, + "loss": 0.7994, + "step": 7577 + }, + { + "epoch": 0.6771059039024281, + "grad_norm": 0.44290387630462646, + "learning_rate": 2.4933192429411052e-05, + "loss": 0.9619, + "step": 7578 + }, + { + "epoch": 0.67719525543369, + "grad_norm": 0.5108716487884521, + "learning_rate": 2.4920672653899847e-05, + "loss": 0.9351, + "step": 7579 + }, + { + "epoch": 0.6772846069649519, + "grad_norm": 0.6224034428596497, + "learning_rate": 2.490815497904963e-05, + "loss": 0.9648, + "step": 7580 + }, + { + "epoch": 0.6773739584962137, + "grad_norm": 0.4487568736076355, + "learning_rate": 2.4895639405908894e-05, + "loss": 0.9194, + "step": 7581 + }, + { + "epoch": 0.6774633100274756, + "grad_norm": 0.5798953771591187, + "learning_rate": 2.4883125935525953e-05, + "loss": 0.8891, + "step": 7582 + }, + { + "epoch": 0.6775526615587375, + "grad_norm": 0.5566365122795105, + "learning_rate": 2.487061456894894e-05, + "loss": 0.9089, + "step": 7583 + }, + { + "epoch": 0.6776420130899994, + "grad_norm": 0.5041350722312927, + "learning_rate": 2.485810530722582e-05, + "loss": 1.0092, + "step": 7584 + }, + { + "epoch": 0.6777313646212612, + "grad_norm": 0.47851383686065674, + "learning_rate": 2.484559815140439e-05, + "loss": 0.8815, + "step": 7585 + }, + { + "epoch": 0.677820716152523, + "grad_norm": 0.5732426047325134, + "learning_rate": 2.4833093102532222e-05, + "loss": 0.9326, + "step": 7586 + }, + { + "epoch": 0.6779100676837849, + "grad_norm": 0.5327209234237671, + "learning_rate": 2.482059016165677e-05, + "loss": 0.8568, + "step": 7587 + }, + { + "epoch": 0.6779994192150468, + "grad_norm": 0.5125129222869873, + "learning_rate": 2.4808089329825286e-05, + "loss": 0.897, + "step": 7588 + }, + { + "epoch": 0.6780887707463087, + "grad_norm": 0.4826566278934479, + "learning_rate": 2.479559060808484e-05, + "loss": 0.9773, + "step": 7589 + }, + { + "epoch": 0.6781781222775706, + "grad_norm": 0.5305161476135254, + "learning_rate": 2.4783093997482364e-05, + "loss": 0.8638, + "step": 7590 + }, + { + "epoch": 0.6782674738088323, + "grad_norm": 0.4348098635673523, + "learning_rate": 2.477059949906454e-05, + "loss": 0.9555, + "step": 7591 + }, + { + "epoch": 0.6783568253400942, + "grad_norm": 0.4758349061012268, + "learning_rate": 2.4758107113877934e-05, + "loss": 1.0283, + "step": 7592 + }, + { + "epoch": 0.6784461768713561, + "grad_norm": 0.5508620142936707, + "learning_rate": 2.474561684296891e-05, + "loss": 0.8579, + "step": 7593 + }, + { + "epoch": 0.678535528402618, + "grad_norm": 0.5436617732048035, + "learning_rate": 2.4733128687383678e-05, + "loss": 0.9722, + "step": 7594 + }, + { + "epoch": 0.6786248799338799, + "grad_norm": 0.42772892117500305, + "learning_rate": 2.4720642648168256e-05, + "loss": 0.9366, + "step": 7595 + }, + { + "epoch": 0.6787142314651418, + "grad_norm": 0.6332234740257263, + "learning_rate": 2.4708158726368452e-05, + "loss": 0.8871, + "step": 7596 + }, + { + "epoch": 0.6788035829964036, + "grad_norm": 0.43669572472572327, + "learning_rate": 2.4695676923029952e-05, + "loss": 0.9658, + "step": 7597 + }, + { + "epoch": 0.6788929345276654, + "grad_norm": 0.492891401052475, + "learning_rate": 2.468319723919823e-05, + "loss": 0.9739, + "step": 7598 + }, + { + "epoch": 0.6789822860589273, + "grad_norm": 0.4240998923778534, + "learning_rate": 2.4670719675918597e-05, + "loss": 0.9061, + "step": 7599 + }, + { + "epoch": 0.6790716375901892, + "grad_norm": 0.4478083848953247, + "learning_rate": 2.465824423423618e-05, + "loss": 1.009, + "step": 7600 + }, + { + "epoch": 0.6791609891214511, + "grad_norm": 0.4915962815284729, + "learning_rate": 2.4645770915195937e-05, + "loss": 0.9018, + "step": 7601 + }, + { + "epoch": 0.679250340652713, + "grad_norm": 0.42449894547462463, + "learning_rate": 2.4633299719842633e-05, + "loss": 0.985, + "step": 7602 + }, + { + "epoch": 0.6793396921839748, + "grad_norm": 0.44549936056137085, + "learning_rate": 2.4620830649220873e-05, + "loss": 0.9358, + "step": 7603 + }, + { + "epoch": 0.6794290437152367, + "grad_norm": 0.47829920053482056, + "learning_rate": 2.460836370437506e-05, + "loss": 0.9446, + "step": 7604 + }, + { + "epoch": 0.6795183952464985, + "grad_norm": 0.5621015429496765, + "learning_rate": 2.4595898886349466e-05, + "loss": 0.9252, + "step": 7605 + }, + { + "epoch": 0.6796077467777604, + "grad_norm": 0.49867716431617737, + "learning_rate": 2.458343619618811e-05, + "loss": 0.9293, + "step": 7606 + }, + { + "epoch": 0.6796970983090223, + "grad_norm": 0.4213707447052002, + "learning_rate": 2.4570975634934888e-05, + "loss": 0.9489, + "step": 7607 + }, + { + "epoch": 0.6797864498402841, + "grad_norm": 0.48737624287605286, + "learning_rate": 2.455851720363352e-05, + "loss": 0.9583, + "step": 7608 + }, + { + "epoch": 0.679875801371546, + "grad_norm": 0.45338955521583557, + "learning_rate": 2.4546060903327512e-05, + "loss": 0.9993, + "step": 7609 + }, + { + "epoch": 0.6799651529028079, + "grad_norm": 0.509468138217926, + "learning_rate": 2.453360673506023e-05, + "loss": 0.8476, + "step": 7610 + }, + { + "epoch": 0.6800545044340698, + "grad_norm": 0.4940629303455353, + "learning_rate": 2.4521154699874833e-05, + "loss": 0.9198, + "step": 7611 + }, + { + "epoch": 0.6801438559653316, + "grad_norm": 0.41363492608070374, + "learning_rate": 2.450870479881432e-05, + "loss": 0.9574, + "step": 7612 + }, + { + "epoch": 0.6802332074965934, + "grad_norm": 0.4342886805534363, + "learning_rate": 2.4496257032921494e-05, + "loss": 0.9672, + "step": 7613 + }, + { + "epoch": 0.6803225590278553, + "grad_norm": 0.5025731921195984, + "learning_rate": 2.4483811403238987e-05, + "loss": 0.9102, + "step": 7614 + }, + { + "epoch": 0.6804119105591172, + "grad_norm": 0.5143190622329712, + "learning_rate": 2.4471367910809284e-05, + "loss": 0.9767, + "step": 7615 + }, + { + "epoch": 0.6805012620903791, + "grad_norm": 0.442281574010849, + "learning_rate": 2.4458926556674615e-05, + "loss": 0.9579, + "step": 7616 + }, + { + "epoch": 0.680590613621641, + "grad_norm": 0.4672459065914154, + "learning_rate": 2.4446487341877095e-05, + "loss": 0.9006, + "step": 7617 + }, + { + "epoch": 0.6806799651529029, + "grad_norm": 0.4161359667778015, + "learning_rate": 2.4434050267458636e-05, + "loss": 0.9494, + "step": 7618 + }, + { + "epoch": 0.6807693166841646, + "grad_norm": 0.4646640717983246, + "learning_rate": 2.4421615334460986e-05, + "loss": 0.9457, + "step": 7619 + }, + { + "epoch": 0.6808586682154265, + "grad_norm": 0.45113250613212585, + "learning_rate": 2.4409182543925698e-05, + "loss": 0.9247, + "step": 7620 + }, + { + "epoch": 0.6809480197466884, + "grad_norm": 0.4302740693092346, + "learning_rate": 2.4396751896894144e-05, + "loss": 0.9624, + "step": 7621 + }, + { + "epoch": 0.6810373712779503, + "grad_norm": 0.42872077226638794, + "learning_rate": 2.438432339440753e-05, + "loss": 0.9573, + "step": 7622 + }, + { + "epoch": 0.6811267228092122, + "grad_norm": 0.3714028596878052, + "learning_rate": 2.43718970375069e-05, + "loss": 0.9537, + "step": 7623 + }, + { + "epoch": 0.681216074340474, + "grad_norm": 0.42748263478279114, + "learning_rate": 2.435947282723305e-05, + "loss": 0.8772, + "step": 7624 + }, + { + "epoch": 0.6813054258717359, + "grad_norm": 0.42798498272895813, + "learning_rate": 2.4347050764626656e-05, + "loss": 0.9627, + "step": 7625 + }, + { + "epoch": 0.6813947774029977, + "grad_norm": 0.5037773251533508, + "learning_rate": 2.43346308507282e-05, + "loss": 0.9368, + "step": 7626 + }, + { + "epoch": 0.6814841289342596, + "grad_norm": 0.49071839451789856, + "learning_rate": 2.432221308657799e-05, + "loss": 0.9451, + "step": 7627 + }, + { + "epoch": 0.6815734804655215, + "grad_norm": 0.519668459892273, + "learning_rate": 2.430979747321615e-05, + "loss": 0.8967, + "step": 7628 + }, + { + "epoch": 0.6816628319967833, + "grad_norm": 0.389871746301651, + "learning_rate": 2.4297384011682595e-05, + "loss": 0.9651, + "step": 7629 + }, + { + "epoch": 0.6817521835280452, + "grad_norm": 0.5641822814941406, + "learning_rate": 2.42849727030171e-05, + "loss": 0.8365, + "step": 7630 + }, + { + "epoch": 0.6818415350593071, + "grad_norm": 0.4539962410926819, + "learning_rate": 2.427256354825924e-05, + "loss": 0.9822, + "step": 7631 + }, + { + "epoch": 0.681930886590569, + "grad_norm": 0.4748673439025879, + "learning_rate": 2.4260156548448427e-05, + "loss": 0.9252, + "step": 7632 + }, + { + "epoch": 0.6820202381218308, + "grad_norm": 0.4344809353351593, + "learning_rate": 2.424775170462386e-05, + "loss": 0.9967, + "step": 7633 + }, + { + "epoch": 0.6821095896530927, + "grad_norm": 0.4680819511413574, + "learning_rate": 2.4235349017824588e-05, + "loss": 0.9189, + "step": 7634 + }, + { + "epoch": 0.6821989411843545, + "grad_norm": 0.47244375944137573, + "learning_rate": 2.422294848908947e-05, + "loss": 0.9564, + "step": 7635 + }, + { + "epoch": 0.6822882927156164, + "grad_norm": 0.47211724519729614, + "learning_rate": 2.4210550119457197e-05, + "loss": 1.0198, + "step": 7636 + }, + { + "epoch": 0.6823776442468783, + "grad_norm": 0.4619308114051819, + "learning_rate": 2.419815390996623e-05, + "loss": 0.9311, + "step": 7637 + }, + { + "epoch": 0.6824669957781402, + "grad_norm": 0.5194936394691467, + "learning_rate": 2.41857598616549e-05, + "loss": 0.9223, + "step": 7638 + }, + { + "epoch": 0.6825563473094021, + "grad_norm": 0.4341084063053131, + "learning_rate": 2.4173367975561345e-05, + "loss": 0.9385, + "step": 7639 + }, + { + "epoch": 0.6826456988406638, + "grad_norm": 0.5083869695663452, + "learning_rate": 2.416097825272351e-05, + "loss": 0.9852, + "step": 7640 + }, + { + "epoch": 0.6827350503719257, + "grad_norm": 0.6154478192329407, + "learning_rate": 2.4148590694179168e-05, + "loss": 0.8429, + "step": 7641 + }, + { + "epoch": 0.6828244019031876, + "grad_norm": 0.49730563163757324, + "learning_rate": 2.413620530096592e-05, + "loss": 0.9575, + "step": 7642 + }, + { + "epoch": 0.6829137534344495, + "grad_norm": 0.5614471435546875, + "learning_rate": 2.412382207412116e-05, + "loss": 0.8966, + "step": 7643 + }, + { + "epoch": 0.6830031049657114, + "grad_norm": 0.4349358081817627, + "learning_rate": 2.4111441014682123e-05, + "loss": 0.9805, + "step": 7644 + }, + { + "epoch": 0.6830924564969733, + "grad_norm": 0.40343138575553894, + "learning_rate": 2.4099062123685852e-05, + "loss": 0.9149, + "step": 7645 + }, + { + "epoch": 0.6831818080282351, + "grad_norm": 0.4427891969680786, + "learning_rate": 2.4086685402169234e-05, + "loss": 0.9948, + "step": 7646 + }, + { + "epoch": 0.6832711595594969, + "grad_norm": 0.5709788203239441, + "learning_rate": 2.407431085116891e-05, + "loss": 0.946, + "step": 7647 + }, + { + "epoch": 0.6833605110907588, + "grad_norm": 0.4347441494464874, + "learning_rate": 2.4061938471721395e-05, + "loss": 1.0256, + "step": 7648 + }, + { + "epoch": 0.6834498626220207, + "grad_norm": 0.509070098400116, + "learning_rate": 2.4049568264863022e-05, + "loss": 0.885, + "step": 7649 + }, + { + "epoch": 0.6835392141532826, + "grad_norm": 0.5047274231910706, + "learning_rate": 2.403720023162991e-05, + "loss": 0.9298, + "step": 7650 + }, + { + "epoch": 0.6836285656845444, + "grad_norm": 0.4579404890537262, + "learning_rate": 2.4024834373058023e-05, + "loss": 0.9002, + "step": 7651 + }, + { + "epoch": 0.6837179172158063, + "grad_norm": 0.4215395748615265, + "learning_rate": 2.4012470690183136e-05, + "loss": 1.0175, + "step": 7652 + }, + { + "epoch": 0.6838072687470681, + "grad_norm": 0.4573095738887787, + "learning_rate": 2.4000109184040837e-05, + "loss": 0.9326, + "step": 7653 + }, + { + "epoch": 0.68389662027833, + "grad_norm": 0.4098723530769348, + "learning_rate": 2.3987749855666532e-05, + "loss": 1.0006, + "step": 7654 + }, + { + "epoch": 0.6839859718095919, + "grad_norm": 0.4912080764770508, + "learning_rate": 2.3975392706095446e-05, + "loss": 0.8999, + "step": 7655 + }, + { + "epoch": 0.6840753233408537, + "grad_norm": 0.4591172933578491, + "learning_rate": 2.3963037736362643e-05, + "loss": 0.9086, + "step": 7656 + }, + { + "epoch": 0.6841646748721156, + "grad_norm": 0.4539848864078522, + "learning_rate": 2.3950684947502944e-05, + "loss": 0.9633, + "step": 7657 + }, + { + "epoch": 0.6842540264033775, + "grad_norm": 0.462184876203537, + "learning_rate": 2.3938334340551044e-05, + "loss": 0.9912, + "step": 7658 + }, + { + "epoch": 0.6843433779346394, + "grad_norm": 0.46925488114356995, + "learning_rate": 2.3925985916541443e-05, + "loss": 0.9211, + "step": 7659 + }, + { + "epoch": 0.6844327294659012, + "grad_norm": 0.5577223300933838, + "learning_rate": 2.3913639676508472e-05, + "loss": 0.8373, + "step": 7660 + }, + { + "epoch": 0.684522080997163, + "grad_norm": 0.42695876955986023, + "learning_rate": 2.390129562148622e-05, + "loss": 0.8683, + "step": 7661 + }, + { + "epoch": 0.6846114325284249, + "grad_norm": 0.5081654191017151, + "learning_rate": 2.3888953752508647e-05, + "loss": 0.8314, + "step": 7662 + }, + { + "epoch": 0.6847007840596868, + "grad_norm": 0.6528906226158142, + "learning_rate": 2.387661407060952e-05, + "loss": 0.9019, + "step": 7663 + }, + { + "epoch": 0.6847901355909487, + "grad_norm": 0.3996724784374237, + "learning_rate": 2.3864276576822426e-05, + "loss": 0.9953, + "step": 7664 + }, + { + "epoch": 0.6848794871222106, + "grad_norm": 0.4197225868701935, + "learning_rate": 2.385194127218075e-05, + "loss": 1.0005, + "step": 7665 + }, + { + "epoch": 0.6849688386534725, + "grad_norm": 0.51161789894104, + "learning_rate": 2.3839608157717734e-05, + "loss": 0.9327, + "step": 7666 + }, + { + "epoch": 0.6850581901847342, + "grad_norm": 0.4613009989261627, + "learning_rate": 2.3827277234466362e-05, + "loss": 1.0265, + "step": 7667 + }, + { + "epoch": 0.6851475417159961, + "grad_norm": 0.4957435429096222, + "learning_rate": 2.3814948503459507e-05, + "loss": 0.9579, + "step": 7668 + }, + { + "epoch": 0.685236893247258, + "grad_norm": 0.5043631792068481, + "learning_rate": 2.380262196572982e-05, + "loss": 0.9527, + "step": 7669 + }, + { + "epoch": 0.6853262447785199, + "grad_norm": 0.5093596577644348, + "learning_rate": 2.3790297622309794e-05, + "loss": 0.9432, + "step": 7670 + }, + { + "epoch": 0.6854155963097818, + "grad_norm": 0.433877557516098, + "learning_rate": 2.3777975474231718e-05, + "loss": 0.8894, + "step": 7671 + }, + { + "epoch": 0.6855049478410437, + "grad_norm": 0.558192253112793, + "learning_rate": 2.3765655522527695e-05, + "loss": 0.9565, + "step": 7672 + }, + { + "epoch": 0.6855942993723055, + "grad_norm": 0.4382516145706177, + "learning_rate": 2.3753337768229667e-05, + "loss": 0.9967, + "step": 7673 + }, + { + "epoch": 0.6856836509035673, + "grad_norm": 0.5013807415962219, + "learning_rate": 2.374102221236937e-05, + "loss": 0.9572, + "step": 7674 + }, + { + "epoch": 0.6857730024348292, + "grad_norm": 0.43265098333358765, + "learning_rate": 2.372870885597836e-05, + "loss": 0.961, + "step": 7675 + }, + { + "epoch": 0.6858623539660911, + "grad_norm": 0.4570571780204773, + "learning_rate": 2.371639770008804e-05, + "loss": 0.9556, + "step": 7676 + }, + { + "epoch": 0.685951705497353, + "grad_norm": 0.4653104841709137, + "learning_rate": 2.370408874572955e-05, + "loss": 0.9336, + "step": 7677 + }, + { + "epoch": 0.6860410570286148, + "grad_norm": 0.5548616051673889, + "learning_rate": 2.3691781993933926e-05, + "loss": 1.0193, + "step": 7678 + }, + { + "epoch": 0.6861304085598767, + "grad_norm": 0.41967645287513733, + "learning_rate": 2.3679477445731986e-05, + "loss": 0.9393, + "step": 7679 + }, + { + "epoch": 0.6862197600911386, + "grad_norm": 0.4959544241428375, + "learning_rate": 2.366717510215436e-05, + "loss": 0.8876, + "step": 7680 + }, + { + "epoch": 0.6863091116224004, + "grad_norm": 0.45173823833465576, + "learning_rate": 2.3654874964231518e-05, + "loss": 0.9034, + "step": 7681 + }, + { + "epoch": 0.6863984631536623, + "grad_norm": 0.4388938248157501, + "learning_rate": 2.3642577032993705e-05, + "loss": 0.8964, + "step": 7682 + }, + { + "epoch": 0.6864878146849241, + "grad_norm": 0.42833590507507324, + "learning_rate": 2.363028130947102e-05, + "loss": 0.9542, + "step": 7683 + }, + { + "epoch": 0.686577166216186, + "grad_norm": 0.556098997592926, + "learning_rate": 2.361798779469336e-05, + "loss": 0.9357, + "step": 7684 + }, + { + "epoch": 0.6866665177474479, + "grad_norm": 0.5818011164665222, + "learning_rate": 2.3605696489690427e-05, + "loss": 0.8631, + "step": 7685 + }, + { + "epoch": 0.6867558692787098, + "grad_norm": 0.40382498502731323, + "learning_rate": 2.3593407395491778e-05, + "loss": 0.9778, + "step": 7686 + }, + { + "epoch": 0.6868452208099717, + "grad_norm": 0.44755667448043823, + "learning_rate": 2.358112051312672e-05, + "loss": 0.9621, + "step": 7687 + }, + { + "epoch": 0.6869345723412335, + "grad_norm": 0.5690574049949646, + "learning_rate": 2.3568835843624422e-05, + "loss": 0.897, + "step": 7688 + }, + { + "epoch": 0.6870239238724953, + "grad_norm": 0.5574625134468079, + "learning_rate": 2.3556553388013852e-05, + "loss": 0.9749, + "step": 7689 + }, + { + "epoch": 0.6871132754037572, + "grad_norm": 0.4903115928173065, + "learning_rate": 2.3544273147323807e-05, + "loss": 0.9872, + "step": 7690 + }, + { + "epoch": 0.6872026269350191, + "grad_norm": 0.4215288460254669, + "learning_rate": 2.3531995122582883e-05, + "loss": 0.9412, + "step": 7691 + }, + { + "epoch": 0.687291978466281, + "grad_norm": 0.5319769978523254, + "learning_rate": 2.3519719314819493e-05, + "loss": 0.9143, + "step": 7692 + }, + { + "epoch": 0.6873813299975429, + "grad_norm": 0.5102341175079346, + "learning_rate": 2.3507445725061895e-05, + "loss": 0.8844, + "step": 7693 + }, + { + "epoch": 0.6874706815288048, + "grad_norm": 0.6138080954551697, + "learning_rate": 2.3495174354338084e-05, + "loss": 0.9267, + "step": 7694 + }, + { + "epoch": 0.6875600330600665, + "grad_norm": 0.4891884922981262, + "learning_rate": 2.348290520367595e-05, + "loss": 0.9271, + "step": 7695 + }, + { + "epoch": 0.6876493845913284, + "grad_norm": 0.44753071665763855, + "learning_rate": 2.3470638274103147e-05, + "loss": 0.9276, + "step": 7696 + }, + { + "epoch": 0.6877387361225903, + "grad_norm": 0.44476214051246643, + "learning_rate": 2.3458373566647174e-05, + "loss": 0.9433, + "step": 7697 + }, + { + "epoch": 0.6878280876538522, + "grad_norm": 0.39918237924575806, + "learning_rate": 2.344611108233535e-05, + "loss": 0.9188, + "step": 7698 + }, + { + "epoch": 0.687917439185114, + "grad_norm": 0.4474645256996155, + "learning_rate": 2.343385082219475e-05, + "loss": 0.95, + "step": 7699 + }, + { + "epoch": 0.6880067907163759, + "grad_norm": 0.455310195684433, + "learning_rate": 2.342159278725231e-05, + "loss": 0.9567, + "step": 7700 + }, + { + "epoch": 0.6880961422476378, + "grad_norm": 0.5603063702583313, + "learning_rate": 2.3409336978534783e-05, + "loss": 0.8424, + "step": 7701 + }, + { + "epoch": 0.6881854937788996, + "grad_norm": 0.4089902937412262, + "learning_rate": 2.3397083397068724e-05, + "loss": 0.9721, + "step": 7702 + }, + { + "epoch": 0.6882748453101615, + "grad_norm": 0.563758373260498, + "learning_rate": 2.3384832043880495e-05, + "loss": 0.9647, + "step": 7703 + }, + { + "epoch": 0.6883641968414234, + "grad_norm": 0.4580053389072418, + "learning_rate": 2.337258291999628e-05, + "loss": 1.037, + "step": 7704 + }, + { + "epoch": 0.6884535483726852, + "grad_norm": 0.5256210565567017, + "learning_rate": 2.336033602644207e-05, + "loss": 0.9317, + "step": 7705 + }, + { + "epoch": 0.6885428999039471, + "grad_norm": 0.5540886521339417, + "learning_rate": 2.3348091364243703e-05, + "loss": 0.9263, + "step": 7706 + }, + { + "epoch": 0.688632251435209, + "grad_norm": 0.4322797954082489, + "learning_rate": 2.3335848934426746e-05, + "loss": 0.9876, + "step": 7707 + }, + { + "epoch": 0.6887216029664709, + "grad_norm": 0.49090680480003357, + "learning_rate": 2.3323608738016663e-05, + "loss": 0.9329, + "step": 7708 + }, + { + "epoch": 0.6888109544977327, + "grad_norm": 0.5516027808189392, + "learning_rate": 2.3311370776038698e-05, + "loss": 1.0222, + "step": 7709 + }, + { + "epoch": 0.6889003060289945, + "grad_norm": 0.4046795666217804, + "learning_rate": 2.3299135049517913e-05, + "loss": 0.9255, + "step": 7710 + }, + { + "epoch": 0.6889896575602564, + "grad_norm": 0.5095672607421875, + "learning_rate": 2.3286901559479175e-05, + "loss": 0.9252, + "step": 7711 + }, + { + "epoch": 0.6890790090915183, + "grad_norm": 0.5826500058174133, + "learning_rate": 2.3274670306947173e-05, + "loss": 0.7873, + "step": 7712 + }, + { + "epoch": 0.6891683606227802, + "grad_norm": 0.5110095143318176, + "learning_rate": 2.3262441292946407e-05, + "loss": 0.9253, + "step": 7713 + }, + { + "epoch": 0.6892577121540421, + "grad_norm": 0.4405474364757538, + "learning_rate": 2.3250214518501184e-05, + "loss": 0.9763, + "step": 7714 + }, + { + "epoch": 0.689347063685304, + "grad_norm": 0.42986834049224854, + "learning_rate": 2.3237989984635628e-05, + "loss": 0.9512, + "step": 7715 + }, + { + "epoch": 0.6894364152165657, + "grad_norm": 0.42974916100502014, + "learning_rate": 2.3225767692373686e-05, + "loss": 0.9491, + "step": 7716 + }, + { + "epoch": 0.6895257667478276, + "grad_norm": 0.41702863574028015, + "learning_rate": 2.3213547642739082e-05, + "loss": 0.9853, + "step": 7717 + }, + { + "epoch": 0.6896151182790895, + "grad_norm": 0.45753413438796997, + "learning_rate": 2.3201329836755382e-05, + "loss": 0.9201, + "step": 7718 + }, + { + "epoch": 0.6897044698103514, + "grad_norm": 0.6328713297843933, + "learning_rate": 2.3189114275445963e-05, + "loss": 0.8626, + "step": 7719 + }, + { + "epoch": 0.6897938213416133, + "grad_norm": 0.4760781228542328, + "learning_rate": 2.3176900959834004e-05, + "loss": 0.9065, + "step": 7720 + }, + { + "epoch": 0.6898831728728752, + "grad_norm": 0.5676954388618469, + "learning_rate": 2.3164689890942504e-05, + "loss": 0.9782, + "step": 7721 + }, + { + "epoch": 0.6899725244041369, + "grad_norm": 0.4092563986778259, + "learning_rate": 2.315248106979427e-05, + "loss": 1.0402, + "step": 7722 + }, + { + "epoch": 0.6900618759353988, + "grad_norm": 0.4689728915691376, + "learning_rate": 2.3140274497411918e-05, + "loss": 0.9736, + "step": 7723 + }, + { + "epoch": 0.6901512274666607, + "grad_norm": 0.5073100924491882, + "learning_rate": 2.3128070174817884e-05, + "loss": 0.9342, + "step": 7724 + }, + { + "epoch": 0.6902405789979226, + "grad_norm": 0.48991286754608154, + "learning_rate": 2.311586810303441e-05, + "loss": 0.9554, + "step": 7725 + }, + { + "epoch": 0.6903299305291845, + "grad_norm": 0.45478180050849915, + "learning_rate": 2.3103668283083564e-05, + "loss": 0.9772, + "step": 7726 + }, + { + "epoch": 0.6904192820604463, + "grad_norm": 0.5075895190238953, + "learning_rate": 2.3091470715987167e-05, + "loss": 0.8529, + "step": 7727 + }, + { + "epoch": 0.6905086335917082, + "grad_norm": 0.5047464966773987, + "learning_rate": 2.307927540276693e-05, + "loss": 0.9655, + "step": 7728 + }, + { + "epoch": 0.69059798512297, + "grad_norm": 0.47214803099632263, + "learning_rate": 2.306708234444433e-05, + "loss": 0.9132, + "step": 7729 + }, + { + "epoch": 0.6906873366542319, + "grad_norm": 0.4735872745513916, + "learning_rate": 2.305489154204067e-05, + "loss": 0.8661, + "step": 7730 + }, + { + "epoch": 0.6907766881854938, + "grad_norm": 0.4646959900856018, + "learning_rate": 2.304270299657707e-05, + "loss": 0.9096, + "step": 7731 + }, + { + "epoch": 0.6908660397167556, + "grad_norm": 0.45085829496383667, + "learning_rate": 2.3030516709074424e-05, + "loss": 0.9554, + "step": 7732 + }, + { + "epoch": 0.6909553912480175, + "grad_norm": 0.47358912229537964, + "learning_rate": 2.3018332680553477e-05, + "loss": 0.9515, + "step": 7733 + }, + { + "epoch": 0.6910447427792794, + "grad_norm": 0.4590679407119751, + "learning_rate": 2.3006150912034774e-05, + "loss": 0.9225, + "step": 7734 + }, + { + "epoch": 0.6911340943105413, + "grad_norm": 0.5170496702194214, + "learning_rate": 2.2993971404538668e-05, + "loss": 0.8962, + "step": 7735 + }, + { + "epoch": 0.6912234458418031, + "grad_norm": 0.4720090627670288, + "learning_rate": 2.298179415908531e-05, + "loss": 0.9482, + "step": 7736 + }, + { + "epoch": 0.691312797373065, + "grad_norm": 0.572025716304779, + "learning_rate": 2.296961917669471e-05, + "loss": 0.8859, + "step": 7737 + }, + { + "epoch": 0.6914021489043268, + "grad_norm": 0.4607776403427124, + "learning_rate": 2.295744645838661e-05, + "loss": 0.9991, + "step": 7738 + }, + { + "epoch": 0.6914915004355887, + "grad_norm": 0.5466684699058533, + "learning_rate": 2.2945276005180623e-05, + "loss": 0.896, + "step": 7739 + }, + { + "epoch": 0.6915808519668506, + "grad_norm": 0.48179617524147034, + "learning_rate": 2.293310781809615e-05, + "loss": 0.9066, + "step": 7740 + }, + { + "epoch": 0.6916702034981125, + "grad_norm": 0.4328363239765167, + "learning_rate": 2.292094189815241e-05, + "loss": 0.9941, + "step": 7741 + }, + { + "epoch": 0.6917595550293744, + "grad_norm": 0.4751478135585785, + "learning_rate": 2.290877824636843e-05, + "loss": 0.9366, + "step": 7742 + }, + { + "epoch": 0.6918489065606361, + "grad_norm": 0.47883662581443787, + "learning_rate": 2.2896616863763038e-05, + "loss": 1.0028, + "step": 7743 + }, + { + "epoch": 0.691938258091898, + "grad_norm": 0.5021872520446777, + "learning_rate": 2.2884457751354887e-05, + "loss": 0.9387, + "step": 7744 + }, + { + "epoch": 0.6920276096231599, + "grad_norm": 0.45752233266830444, + "learning_rate": 2.2872300910162436e-05, + "loss": 0.9381, + "step": 7745 + }, + { + "epoch": 0.6921169611544218, + "grad_norm": 0.465744286775589, + "learning_rate": 2.2860146341203937e-05, + "loss": 0.921, + "step": 7746 + }, + { + "epoch": 0.6922063126856837, + "grad_norm": 0.47951745986938477, + "learning_rate": 2.2847994045497496e-05, + "loss": 0.9759, + "step": 7747 + }, + { + "epoch": 0.6922956642169455, + "grad_norm": 0.4690631628036499, + "learning_rate": 2.2835844024060953e-05, + "loss": 0.9177, + "step": 7748 + }, + { + "epoch": 0.6923850157482074, + "grad_norm": 0.5409393310546875, + "learning_rate": 2.282369627791202e-05, + "loss": 0.905, + "step": 7749 + }, + { + "epoch": 0.6924743672794692, + "grad_norm": 0.5218047499656677, + "learning_rate": 2.2811550808068205e-05, + "loss": 0.9922, + "step": 7750 + }, + { + "epoch": 0.6925637188107311, + "grad_norm": 0.5113648176193237, + "learning_rate": 2.2799407615546815e-05, + "loss": 0.9426, + "step": 7751 + }, + { + "epoch": 0.692653070341993, + "grad_norm": 0.4514075517654419, + "learning_rate": 2.278726670136498e-05, + "loss": 0.9237, + "step": 7752 + }, + { + "epoch": 0.6927424218732549, + "grad_norm": 0.4475104510784149, + "learning_rate": 2.277512806653962e-05, + "loss": 0.9978, + "step": 7753 + }, + { + "epoch": 0.6928317734045167, + "grad_norm": 0.4956379532814026, + "learning_rate": 2.2762991712087484e-05, + "loss": 0.8725, + "step": 7754 + }, + { + "epoch": 0.6929211249357786, + "grad_norm": 0.52278733253479, + "learning_rate": 2.2750857639025113e-05, + "loss": 0.8942, + "step": 7755 + }, + { + "epoch": 0.6930104764670405, + "grad_norm": 0.5214927792549133, + "learning_rate": 2.2738725848368875e-05, + "loss": 0.9485, + "step": 7756 + }, + { + "epoch": 0.6930998279983023, + "grad_norm": 0.44215962290763855, + "learning_rate": 2.272659634113495e-05, + "loss": 0.9248, + "step": 7757 + }, + { + "epoch": 0.6931891795295642, + "grad_norm": 0.4372926652431488, + "learning_rate": 2.271446911833927e-05, + "loss": 0.9661, + "step": 7758 + }, + { + "epoch": 0.693278531060826, + "grad_norm": 0.4903610050678253, + "learning_rate": 2.2702344180997647e-05, + "loss": 0.945, + "step": 7759 + }, + { + "epoch": 0.6933678825920879, + "grad_norm": 0.5382173657417297, + "learning_rate": 2.2690221530125676e-05, + "loss": 0.9443, + "step": 7760 + }, + { + "epoch": 0.6934572341233498, + "grad_norm": 0.5113039016723633, + "learning_rate": 2.2678101166738746e-05, + "loss": 0.9368, + "step": 7761 + }, + { + "epoch": 0.6935465856546117, + "grad_norm": 0.6735799312591553, + "learning_rate": 2.2665983091852083e-05, + "loss": 0.7989, + "step": 7762 + }, + { + "epoch": 0.6936359371858736, + "grad_norm": 0.4851955771446228, + "learning_rate": 2.2653867306480708e-05, + "loss": 0.9514, + "step": 7763 + }, + { + "epoch": 0.6937252887171353, + "grad_norm": 0.4901287853717804, + "learning_rate": 2.2641753811639417e-05, + "loss": 0.9322, + "step": 7764 + }, + { + "epoch": 0.6938146402483972, + "grad_norm": 0.5687100291252136, + "learning_rate": 2.262964260834286e-05, + "loss": 0.8931, + "step": 7765 + }, + { + "epoch": 0.6939039917796591, + "grad_norm": 0.45068779587745667, + "learning_rate": 2.2617533697605485e-05, + "loss": 0.9799, + "step": 7766 + }, + { + "epoch": 0.693993343310921, + "grad_norm": 0.4677877724170685, + "learning_rate": 2.260542708044154e-05, + "loss": 0.9831, + "step": 7767 + }, + { + "epoch": 0.6940826948421829, + "grad_norm": 0.4960726201534271, + "learning_rate": 2.2593322757865097e-05, + "loss": 0.8693, + "step": 7768 + }, + { + "epoch": 0.6941720463734448, + "grad_norm": 0.5460396409034729, + "learning_rate": 2.258122073088999e-05, + "loss": 0.8695, + "step": 7769 + }, + { + "epoch": 0.6942613979047066, + "grad_norm": 0.44287580251693726, + "learning_rate": 2.2569121000529915e-05, + "loss": 0.9818, + "step": 7770 + }, + { + "epoch": 0.6943507494359684, + "grad_norm": 0.5541728734970093, + "learning_rate": 2.2557023567798342e-05, + "loss": 0.974, + "step": 7771 + }, + { + "epoch": 0.6944401009672303, + "grad_norm": 0.5429545044898987, + "learning_rate": 2.254492843370857e-05, + "loss": 0.9813, + "step": 7772 + }, + { + "epoch": 0.6945294524984922, + "grad_norm": 0.42696717381477356, + "learning_rate": 2.2532835599273687e-05, + "loss": 0.8715, + "step": 7773 + }, + { + "epoch": 0.6946188040297541, + "grad_norm": 0.49874842166900635, + "learning_rate": 2.2520745065506603e-05, + "loss": 0.8765, + "step": 7774 + }, + { + "epoch": 0.694708155561016, + "grad_norm": 0.49501675367355347, + "learning_rate": 2.2508656833420026e-05, + "loss": 0.9732, + "step": 7775 + }, + { + "epoch": 0.6947975070922778, + "grad_norm": 0.48265352845191956, + "learning_rate": 2.2496570904026483e-05, + "loss": 0.8776, + "step": 7776 + }, + { + "epoch": 0.6948868586235397, + "grad_norm": 0.46912387013435364, + "learning_rate": 2.2484487278338305e-05, + "loss": 0.9019, + "step": 7777 + }, + { + "epoch": 0.6949762101548015, + "grad_norm": 0.4739210605621338, + "learning_rate": 2.2472405957367593e-05, + "loss": 0.899, + "step": 7778 + }, + { + "epoch": 0.6950655616860634, + "grad_norm": 0.4149891436100006, + "learning_rate": 2.2460326942126307e-05, + "loss": 0.9424, + "step": 7779 + }, + { + "epoch": 0.6951549132173253, + "grad_norm": 0.43324029445648193, + "learning_rate": 2.2448250233626195e-05, + "loss": 0.927, + "step": 7780 + }, + { + "epoch": 0.6952442647485871, + "grad_norm": 0.7626486420631409, + "learning_rate": 2.2436175832878802e-05, + "loss": 0.9243, + "step": 7781 + }, + { + "epoch": 0.695333616279849, + "grad_norm": 0.4425656795501709, + "learning_rate": 2.242410374089549e-05, + "loss": 0.9539, + "step": 7782 + }, + { + "epoch": 0.6954229678111109, + "grad_norm": 0.4871158003807068, + "learning_rate": 2.2412033958687433e-05, + "loss": 0.8842, + "step": 7783 + }, + { + "epoch": 0.6955123193423727, + "grad_norm": 0.5576094388961792, + "learning_rate": 2.2399966487265596e-05, + "loss": 0.9452, + "step": 7784 + }, + { + "epoch": 0.6956016708736346, + "grad_norm": 0.41526561975479126, + "learning_rate": 2.238790132764076e-05, + "loss": 0.988, + "step": 7785 + }, + { + "epoch": 0.6956910224048964, + "grad_norm": 0.4997550845146179, + "learning_rate": 2.237583848082351e-05, + "loss": 0.942, + "step": 7786 + }, + { + "epoch": 0.6957803739361583, + "grad_norm": 0.44697871804237366, + "learning_rate": 2.2363777947824265e-05, + "loss": 0.9652, + "step": 7787 + }, + { + "epoch": 0.6958697254674202, + "grad_norm": 0.4410659372806549, + "learning_rate": 2.2351719729653175e-05, + "loss": 0.9358, + "step": 7788 + }, + { + "epoch": 0.6959590769986821, + "grad_norm": 0.39497098326683044, + "learning_rate": 2.233966382732027e-05, + "loss": 0.9459, + "step": 7789 + }, + { + "epoch": 0.696048428529944, + "grad_norm": 0.45145925879478455, + "learning_rate": 2.232761024183535e-05, + "loss": 0.9192, + "step": 7790 + }, + { + "epoch": 0.6961377800612057, + "grad_norm": 0.46766191720962524, + "learning_rate": 2.2315558974208045e-05, + "loss": 0.9029, + "step": 7791 + }, + { + "epoch": 0.6962271315924676, + "grad_norm": 0.4594910740852356, + "learning_rate": 2.2303510025447765e-05, + "loss": 0.8901, + "step": 7792 + }, + { + "epoch": 0.6963164831237295, + "grad_norm": 0.48282474279403687, + "learning_rate": 2.229146339656375e-05, + "loss": 0.9668, + "step": 7793 + }, + { + "epoch": 0.6964058346549914, + "grad_norm": 0.5255979895591736, + "learning_rate": 2.227941908856503e-05, + "loss": 0.932, + "step": 7794 + }, + { + "epoch": 0.6964951861862533, + "grad_norm": 0.4911576509475708, + "learning_rate": 2.2267377102460436e-05, + "loss": 0.9966, + "step": 7795 + }, + { + "epoch": 0.6965845377175152, + "grad_norm": 0.5194364190101624, + "learning_rate": 2.2255337439258633e-05, + "loss": 0.9707, + "step": 7796 + }, + { + "epoch": 0.696673889248777, + "grad_norm": 0.5468015074729919, + "learning_rate": 2.2243300099968046e-05, + "loss": 0.8358, + "step": 7797 + }, + { + "epoch": 0.6967632407800388, + "grad_norm": 0.47449544072151184, + "learning_rate": 2.2231265085596938e-05, + "loss": 0.9037, + "step": 7798 + }, + { + "epoch": 0.6968525923113007, + "grad_norm": 0.48344165086746216, + "learning_rate": 2.2219232397153366e-05, + "loss": 0.8762, + "step": 7799 + }, + { + "epoch": 0.6969419438425626, + "grad_norm": 0.433371365070343, + "learning_rate": 2.220720203564521e-05, + "loss": 0.9288, + "step": 7800 + }, + { + "epoch": 0.6970312953738245, + "grad_norm": 0.49428948760032654, + "learning_rate": 2.219517400208015e-05, + "loss": 0.9456, + "step": 7801 + }, + { + "epoch": 0.6971206469050863, + "grad_norm": 0.4290115535259247, + "learning_rate": 2.2183148297465627e-05, + "loss": 0.9207, + "step": 7802 + }, + { + "epoch": 0.6972099984363482, + "grad_norm": 0.4194071590900421, + "learning_rate": 2.217112492280894e-05, + "loss": 0.9429, + "step": 7803 + }, + { + "epoch": 0.6972993499676101, + "grad_norm": 0.4213373363018036, + "learning_rate": 2.2159103879117177e-05, + "loss": 0.9443, + "step": 7804 + }, + { + "epoch": 0.6973887014988719, + "grad_norm": 0.4857761263847351, + "learning_rate": 2.2147085167397223e-05, + "loss": 0.9155, + "step": 7805 + }, + { + "epoch": 0.6974780530301338, + "grad_norm": 0.4734998941421509, + "learning_rate": 2.2135068788655782e-05, + "loss": 0.9608, + "step": 7806 + }, + { + "epoch": 0.6975674045613957, + "grad_norm": 0.47898635268211365, + "learning_rate": 2.2123054743899346e-05, + "loss": 0.9165, + "step": 7807 + }, + { + "epoch": 0.6976567560926575, + "grad_norm": 0.556919515132904, + "learning_rate": 2.211104303413424e-05, + "loss": 0.9658, + "step": 7808 + }, + { + "epoch": 0.6977461076239194, + "grad_norm": 0.3948970139026642, + "learning_rate": 2.2099033660366537e-05, + "loss": 1.032, + "step": 7809 + }, + { + "epoch": 0.6978354591551813, + "grad_norm": 0.4545915126800537, + "learning_rate": 2.2087026623602164e-05, + "loss": 0.9332, + "step": 7810 + }, + { + "epoch": 0.6979248106864432, + "grad_norm": 0.4447174668312073, + "learning_rate": 2.207502192484685e-05, + "loss": 0.9648, + "step": 7811 + }, + { + "epoch": 0.698014162217705, + "grad_norm": 0.5227632522583008, + "learning_rate": 2.2063019565106102e-05, + "loss": 0.9064, + "step": 7812 + }, + { + "epoch": 0.6981035137489668, + "grad_norm": 0.41880759596824646, + "learning_rate": 2.2051019545385255e-05, + "loss": 0.9345, + "step": 7813 + }, + { + "epoch": 0.6981928652802287, + "grad_norm": 0.5738639831542969, + "learning_rate": 2.2039021866689435e-05, + "loss": 0.8841, + "step": 7814 + }, + { + "epoch": 0.6982822168114906, + "grad_norm": 0.38003218173980713, + "learning_rate": 2.202702653002358e-05, + "loss": 0.9731, + "step": 7815 + }, + { + "epoch": 0.6983715683427525, + "grad_norm": 0.46742263436317444, + "learning_rate": 2.2015033536392422e-05, + "loss": 0.9019, + "step": 7816 + }, + { + "epoch": 0.6984609198740144, + "grad_norm": 0.57480788230896, + "learning_rate": 2.20030428868005e-05, + "loss": 0.9423, + "step": 7817 + }, + { + "epoch": 0.6985502714052763, + "grad_norm": 0.4974348545074463, + "learning_rate": 2.199105458225218e-05, + "loss": 0.9786, + "step": 7818 + }, + { + "epoch": 0.698639622936538, + "grad_norm": 0.5195631980895996, + "learning_rate": 2.197906862375158e-05, + "loss": 0.9327, + "step": 7819 + }, + { + "epoch": 0.6987289744677999, + "grad_norm": 0.4291267991065979, + "learning_rate": 2.1967085012302663e-05, + "loss": 0.9707, + "step": 7820 + }, + { + "epoch": 0.6988183259990618, + "grad_norm": 0.5776417255401611, + "learning_rate": 2.1955103748909185e-05, + "loss": 0.836, + "step": 7821 + }, + { + "epoch": 0.6989076775303237, + "grad_norm": 0.45474281907081604, + "learning_rate": 2.19431248345747e-05, + "loss": 0.9669, + "step": 7822 + }, + { + "epoch": 0.6989970290615856, + "grad_norm": 0.41181471943855286, + "learning_rate": 2.193114827030258e-05, + "loss": 0.9725, + "step": 7823 + }, + { + "epoch": 0.6990863805928474, + "grad_norm": 0.5399705767631531, + "learning_rate": 2.191917405709598e-05, + "loss": 0.8856, + "step": 7824 + }, + { + "epoch": 0.6991757321241093, + "grad_norm": 0.37962204217910767, + "learning_rate": 2.1907202195957882e-05, + "loss": 0.9573, + "step": 7825 + }, + { + "epoch": 0.6992650836553711, + "grad_norm": 0.5898367762565613, + "learning_rate": 2.1895232687891043e-05, + "loss": 0.9001, + "step": 7826 + }, + { + "epoch": 0.699354435186633, + "grad_norm": 0.4448642432689667, + "learning_rate": 2.1883265533898038e-05, + "loss": 0.9668, + "step": 7827 + }, + { + "epoch": 0.6994437867178949, + "grad_norm": 0.42721062898635864, + "learning_rate": 2.187130073498127e-05, + "loss": 0.96, + "step": 7828 + }, + { + "epoch": 0.6995331382491567, + "grad_norm": 0.6047458648681641, + "learning_rate": 2.1859338292142876e-05, + "loss": 0.9264, + "step": 7829 + }, + { + "epoch": 0.6996224897804186, + "grad_norm": 0.4223198890686035, + "learning_rate": 2.184737820638486e-05, + "loss": 0.9497, + "step": 7830 + }, + { + "epoch": 0.6997118413116805, + "grad_norm": 0.5067864060401917, + "learning_rate": 2.1835420478709e-05, + "loss": 0.8954, + "step": 7831 + }, + { + "epoch": 0.6998011928429424, + "grad_norm": 0.4639846682548523, + "learning_rate": 2.182346511011689e-05, + "loss": 0.9765, + "step": 7832 + }, + { + "epoch": 0.6998905443742042, + "grad_norm": 0.44720324873924255, + "learning_rate": 2.1811512101609922e-05, + "loss": 0.9591, + "step": 7833 + }, + { + "epoch": 0.699979895905466, + "grad_norm": 0.4672408699989319, + "learning_rate": 2.17995614541893e-05, + "loss": 0.9872, + "step": 7834 + }, + { + "epoch": 0.7000692474367279, + "grad_norm": 0.5736651420593262, + "learning_rate": 2.1787613168855974e-05, + "loss": 0.9987, + "step": 7835 + }, + { + "epoch": 0.7001585989679898, + "grad_norm": 0.49598804116249084, + "learning_rate": 2.1775667246610775e-05, + "loss": 0.905, + "step": 7836 + }, + { + "epoch": 0.7002479504992517, + "grad_norm": 0.41947072744369507, + "learning_rate": 2.1763723688454298e-05, + "loss": 0.9417, + "step": 7837 + }, + { + "epoch": 0.7003373020305136, + "grad_norm": 0.4067612588405609, + "learning_rate": 2.1751782495386946e-05, + "loss": 0.9841, + "step": 7838 + }, + { + "epoch": 0.7004266535617755, + "grad_norm": 0.4044066071510315, + "learning_rate": 2.1739843668408904e-05, + "loss": 0.9668, + "step": 7839 + }, + { + "epoch": 0.7005160050930372, + "grad_norm": 0.712028980255127, + "learning_rate": 2.1727907208520187e-05, + "loss": 0.8248, + "step": 7840 + }, + { + "epoch": 0.7006053566242991, + "grad_norm": 0.4395425617694855, + "learning_rate": 2.1715973116720594e-05, + "loss": 0.9757, + "step": 7841 + }, + { + "epoch": 0.700694708155561, + "grad_norm": 0.4635796546936035, + "learning_rate": 2.1704041394009745e-05, + "loss": 0.9057, + "step": 7842 + }, + { + "epoch": 0.7007840596868229, + "grad_norm": 0.46956396102905273, + "learning_rate": 2.169211204138704e-05, + "loss": 0.9642, + "step": 7843 + }, + { + "epoch": 0.7008734112180848, + "grad_norm": 0.4673489034175873, + "learning_rate": 2.1680185059851692e-05, + "loss": 0.9328, + "step": 7844 + }, + { + "epoch": 0.7009627627493467, + "grad_norm": 0.4535006880760193, + "learning_rate": 2.166826045040271e-05, + "loss": 0.9427, + "step": 7845 + }, + { + "epoch": 0.7010521142806084, + "grad_norm": 0.5039295554161072, + "learning_rate": 2.165633821403892e-05, + "loss": 0.894, + "step": 7846 + }, + { + "epoch": 0.7011414658118703, + "grad_norm": 0.6144269704818726, + "learning_rate": 2.1644418351758917e-05, + "loss": 0.9041, + "step": 7847 + }, + { + "epoch": 0.7012308173431322, + "grad_norm": 0.5136603116989136, + "learning_rate": 2.1632500864561146e-05, + "loss": 0.8892, + "step": 7848 + }, + { + "epoch": 0.7013201688743941, + "grad_norm": 0.46805840730667114, + "learning_rate": 2.1620585753443786e-05, + "loss": 1.002, + "step": 7849 + }, + { + "epoch": 0.701409520405656, + "grad_norm": 0.41075465083122253, + "learning_rate": 2.160867301940487e-05, + "loss": 0.9253, + "step": 7850 + }, + { + "epoch": 0.7014988719369178, + "grad_norm": 0.46872520446777344, + "learning_rate": 2.1596762663442218e-05, + "loss": 0.909, + "step": 7851 + }, + { + "epoch": 0.7015882234681797, + "grad_norm": 0.5055715441703796, + "learning_rate": 2.1584854686553453e-05, + "loss": 0.9786, + "step": 7852 + }, + { + "epoch": 0.7016775749994415, + "grad_norm": 0.48132845759391785, + "learning_rate": 2.1572949089735987e-05, + "loss": 0.9081, + "step": 7853 + }, + { + "epoch": 0.7017669265307034, + "grad_norm": 0.5568394660949707, + "learning_rate": 2.1561045873987046e-05, + "loss": 0.8675, + "step": 7854 + }, + { + "epoch": 0.7018562780619653, + "grad_norm": 0.5073636770248413, + "learning_rate": 2.1549145040303654e-05, + "loss": 0.9566, + "step": 7855 + }, + { + "epoch": 0.7019456295932271, + "grad_norm": 0.42798787355422974, + "learning_rate": 2.153724658968263e-05, + "loss": 0.9682, + "step": 7856 + }, + { + "epoch": 0.702034981124489, + "grad_norm": 0.5035516023635864, + "learning_rate": 2.152535052312059e-05, + "loss": 0.9313, + "step": 7857 + }, + { + "epoch": 0.7021243326557509, + "grad_norm": 0.43961191177368164, + "learning_rate": 2.1513456841613982e-05, + "loss": 0.9328, + "step": 7858 + }, + { + "epoch": 0.7022136841870128, + "grad_norm": 0.4986516833305359, + "learning_rate": 2.1501565546158993e-05, + "loss": 0.9385, + "step": 7859 + }, + { + "epoch": 0.7023030357182746, + "grad_norm": 0.5121946930885315, + "learning_rate": 2.148967663775166e-05, + "loss": 0.8721, + "step": 7860 + }, + { + "epoch": 0.7023923872495365, + "grad_norm": 0.6112991571426392, + "learning_rate": 2.1477790117387808e-05, + "loss": 0.9234, + "step": 7861 + }, + { + "epoch": 0.7024817387807983, + "grad_norm": 0.4206686317920685, + "learning_rate": 2.1465905986063056e-05, + "loss": 0.9169, + "step": 7862 + }, + { + "epoch": 0.7025710903120602, + "grad_norm": 0.5358665585517883, + "learning_rate": 2.145402424477283e-05, + "loss": 0.9301, + "step": 7863 + }, + { + "epoch": 0.7026604418433221, + "grad_norm": 0.5316134095191956, + "learning_rate": 2.1442144894512352e-05, + "loss": 0.9495, + "step": 7864 + }, + { + "epoch": 0.702749793374584, + "grad_norm": 0.5568259954452515, + "learning_rate": 2.1430267936276637e-05, + "loss": 0.9321, + "step": 7865 + }, + { + "epoch": 0.7028391449058459, + "grad_norm": 0.39786767959594727, + "learning_rate": 2.1418393371060542e-05, + "loss": 0.9312, + "step": 7866 + }, + { + "epoch": 0.7029284964371076, + "grad_norm": 0.6305972933769226, + "learning_rate": 2.1406521199858637e-05, + "loss": 0.8965, + "step": 7867 + }, + { + "epoch": 0.7030178479683695, + "grad_norm": 0.5135489106178284, + "learning_rate": 2.1394651423665368e-05, + "loss": 1.0525, + "step": 7868 + }, + { + "epoch": 0.7031071994996314, + "grad_norm": 0.5106921792030334, + "learning_rate": 2.1382784043474953e-05, + "loss": 0.9549, + "step": 7869 + }, + { + "epoch": 0.7031965510308933, + "grad_norm": 0.462363064289093, + "learning_rate": 2.1370919060281415e-05, + "loss": 0.9733, + "step": 7870 + }, + { + "epoch": 0.7032859025621552, + "grad_norm": 0.4888690710067749, + "learning_rate": 2.135905647507858e-05, + "loss": 0.8726, + "step": 7871 + }, + { + "epoch": 0.7033752540934171, + "grad_norm": 0.5045296549797058, + "learning_rate": 2.1347196288860045e-05, + "loss": 0.9865, + "step": 7872 + }, + { + "epoch": 0.7034646056246789, + "grad_norm": 0.47808846831321716, + "learning_rate": 2.1335338502619233e-05, + "loss": 0.9196, + "step": 7873 + }, + { + "epoch": 0.7035539571559407, + "grad_norm": 0.42234256863594055, + "learning_rate": 2.1323483117349368e-05, + "loss": 0.9596, + "step": 7874 + }, + { + "epoch": 0.7036433086872026, + "grad_norm": 0.432258278131485, + "learning_rate": 2.131163013404346e-05, + "loss": 1.0006, + "step": 7875 + }, + { + "epoch": 0.7037326602184645, + "grad_norm": 0.45173177123069763, + "learning_rate": 2.1299779553694323e-05, + "loss": 0.9875, + "step": 7876 + }, + { + "epoch": 0.7038220117497264, + "grad_norm": 0.5817151069641113, + "learning_rate": 2.128793137729457e-05, + "loss": 0.8725, + "step": 7877 + }, + { + "epoch": 0.7039113632809882, + "grad_norm": 0.4585109055042267, + "learning_rate": 2.1276085605836636e-05, + "loss": 0.891, + "step": 7878 + }, + { + "epoch": 0.7040007148122501, + "grad_norm": 0.547097384929657, + "learning_rate": 2.1264242240312687e-05, + "loss": 0.9042, + "step": 7879 + }, + { + "epoch": 0.704090066343512, + "grad_norm": 0.6232752799987793, + "learning_rate": 2.1252401281714752e-05, + "loss": 0.8304, + "step": 7880 + }, + { + "epoch": 0.7041794178747738, + "grad_norm": 0.4578670859336853, + "learning_rate": 2.1240562731034635e-05, + "loss": 0.8873, + "step": 7881 + }, + { + "epoch": 0.7042687694060357, + "grad_norm": 0.47056087851524353, + "learning_rate": 2.1228726589263942e-05, + "loss": 0.9029, + "step": 7882 + }, + { + "epoch": 0.7043581209372975, + "grad_norm": 0.48560357093811035, + "learning_rate": 2.1216892857394083e-05, + "loss": 0.8779, + "step": 7883 + }, + { + "epoch": 0.7044474724685594, + "grad_norm": 0.536952555179596, + "learning_rate": 2.120506153641625e-05, + "loss": 0.8473, + "step": 7884 + }, + { + "epoch": 0.7045368239998213, + "grad_norm": 0.41716116666793823, + "learning_rate": 2.119323262732144e-05, + "loss": 0.9998, + "step": 7885 + }, + { + "epoch": 0.7046261755310832, + "grad_norm": 0.5672418475151062, + "learning_rate": 2.1181406131100463e-05, + "loss": 0.9055, + "step": 7886 + }, + { + "epoch": 0.7047155270623451, + "grad_norm": 0.5225016474723816, + "learning_rate": 2.1169582048743902e-05, + "loss": 0.9609, + "step": 7887 + }, + { + "epoch": 0.7048048785936069, + "grad_norm": 0.4605422019958496, + "learning_rate": 2.1157760381242174e-05, + "loss": 0.9329, + "step": 7888 + }, + { + "epoch": 0.7048942301248687, + "grad_norm": 0.4933791756629944, + "learning_rate": 2.1145941129585435e-05, + "loss": 0.9709, + "step": 7889 + }, + { + "epoch": 0.7049835816561306, + "grad_norm": 0.42105743288993835, + "learning_rate": 2.1134124294763678e-05, + "loss": 0.9333, + "step": 7890 + }, + { + "epoch": 0.7050729331873925, + "grad_norm": 0.5470524430274963, + "learning_rate": 2.112230987776671e-05, + "loss": 1.0609, + "step": 7891 + }, + { + "epoch": 0.7051622847186544, + "grad_norm": 0.48633435368537903, + "learning_rate": 2.11104978795841e-05, + "loss": 0.9119, + "step": 7892 + }, + { + "epoch": 0.7052516362499163, + "grad_norm": 0.6098655462265015, + "learning_rate": 2.1098688301205237e-05, + "loss": 0.8961, + "step": 7893 + }, + { + "epoch": 0.7053409877811782, + "grad_norm": 0.5170839428901672, + "learning_rate": 2.1086881143619292e-05, + "loss": 0.923, + "step": 7894 + }, + { + "epoch": 0.7054303393124399, + "grad_norm": 0.4735257923603058, + "learning_rate": 2.1075076407815243e-05, + "loss": 0.9574, + "step": 7895 + }, + { + "epoch": 0.7055196908437018, + "grad_norm": 0.6144537329673767, + "learning_rate": 2.106327409478186e-05, + "loss": 0.8456, + "step": 7896 + }, + { + "epoch": 0.7056090423749637, + "grad_norm": 0.5229153037071228, + "learning_rate": 2.1051474205507715e-05, + "loss": 0.9355, + "step": 7897 + }, + { + "epoch": 0.7056983939062256, + "grad_norm": 0.429524689912796, + "learning_rate": 2.1039676740981172e-05, + "loss": 0.954, + "step": 7898 + }, + { + "epoch": 0.7057877454374875, + "grad_norm": 0.5060709714889526, + "learning_rate": 2.1027881702190422e-05, + "loss": 0.8428, + "step": 7899 + }, + { + "epoch": 0.7058770969687493, + "grad_norm": 0.5516795516014099, + "learning_rate": 2.101608909012338e-05, + "loss": 0.9072, + "step": 7900 + }, + { + "epoch": 0.7059664485000112, + "grad_norm": 0.5318334102630615, + "learning_rate": 2.100429890576782e-05, + "loss": 0.9315, + "step": 7901 + }, + { + "epoch": 0.706055800031273, + "grad_norm": 0.54982990026474, + "learning_rate": 2.09925111501113e-05, + "loss": 0.969, + "step": 7902 + }, + { + "epoch": 0.7061451515625349, + "grad_norm": 0.4394816756248474, + "learning_rate": 2.0980725824141166e-05, + "loss": 1.042, + "step": 7903 + }, + { + "epoch": 0.7062345030937968, + "grad_norm": 0.579893171787262, + "learning_rate": 2.0968942928844593e-05, + "loss": 0.8299, + "step": 7904 + }, + { + "epoch": 0.7063238546250586, + "grad_norm": 0.4310462176799774, + "learning_rate": 2.0957162465208475e-05, + "loss": 0.9358, + "step": 7905 + }, + { + "epoch": 0.7064132061563205, + "grad_norm": 0.46636763215065, + "learning_rate": 2.094538443421958e-05, + "loss": 0.9262, + "step": 7906 + }, + { + "epoch": 0.7065025576875824, + "grad_norm": 0.5085285902023315, + "learning_rate": 2.0933608836864433e-05, + "loss": 0.9616, + "step": 7907 + }, + { + "epoch": 0.7065919092188442, + "grad_norm": 0.43514180183410645, + "learning_rate": 2.092183567412937e-05, + "loss": 1.0016, + "step": 7908 + }, + { + "epoch": 0.7066812607501061, + "grad_norm": 0.45315706729888916, + "learning_rate": 2.091006494700054e-05, + "loss": 0.9255, + "step": 7909 + }, + { + "epoch": 0.706770612281368, + "grad_norm": 0.46915316581726074, + "learning_rate": 2.0898296656463834e-05, + "loss": 0.9525, + "step": 7910 + }, + { + "epoch": 0.7068599638126298, + "grad_norm": 0.572519838809967, + "learning_rate": 2.0886530803504977e-05, + "loss": 0.9686, + "step": 7911 + }, + { + "epoch": 0.7069493153438917, + "grad_norm": 0.5691891312599182, + "learning_rate": 2.08747673891095e-05, + "loss": 0.9626, + "step": 7912 + }, + { + "epoch": 0.7070386668751536, + "grad_norm": 0.4699474275112152, + "learning_rate": 2.0863006414262703e-05, + "loss": 0.9125, + "step": 7913 + }, + { + "epoch": 0.7071280184064155, + "grad_norm": 0.5651034116744995, + "learning_rate": 2.0851247879949698e-05, + "loss": 0.8713, + "step": 7914 + }, + { + "epoch": 0.7072173699376773, + "grad_norm": 0.4276520609855652, + "learning_rate": 2.0839491787155387e-05, + "loss": 0.8954, + "step": 7915 + }, + { + "epoch": 0.7073067214689391, + "grad_norm": 0.5207197070121765, + "learning_rate": 2.0827738136864462e-05, + "loss": 0.9387, + "step": 7916 + }, + { + "epoch": 0.707396073000201, + "grad_norm": 0.43144193291664124, + "learning_rate": 2.0815986930061428e-05, + "loss": 0.8953, + "step": 7917 + }, + { + "epoch": 0.7074854245314629, + "grad_norm": 0.48623722791671753, + "learning_rate": 2.0804238167730566e-05, + "loss": 0.8455, + "step": 7918 + }, + { + "epoch": 0.7075747760627248, + "grad_norm": 0.4490886926651001, + "learning_rate": 2.0792491850855976e-05, + "loss": 0.9528, + "step": 7919 + }, + { + "epoch": 0.7076641275939867, + "grad_norm": 0.44778475165367126, + "learning_rate": 2.078074798042151e-05, + "loss": 0.9674, + "step": 7920 + }, + { + "epoch": 0.7077534791252486, + "grad_norm": 0.4196859300136566, + "learning_rate": 2.0769006557410858e-05, + "loss": 0.9325, + "step": 7921 + }, + { + "epoch": 0.7078428306565103, + "grad_norm": 0.4247509837150574, + "learning_rate": 2.0757267582807482e-05, + "loss": 0.9765, + "step": 7922 + }, + { + "epoch": 0.7079321821877722, + "grad_norm": 0.44996559619903564, + "learning_rate": 2.0745531057594654e-05, + "loss": 0.9291, + "step": 7923 + }, + { + "epoch": 0.7080215337190341, + "grad_norm": 0.4982374906539917, + "learning_rate": 2.0733796982755425e-05, + "loss": 0.8631, + "step": 7924 + }, + { + "epoch": 0.708110885250296, + "grad_norm": 0.515921413898468, + "learning_rate": 2.0722065359272657e-05, + "loss": 0.8803, + "step": 7925 + }, + { + "epoch": 0.7082002367815579, + "grad_norm": 0.4899452030658722, + "learning_rate": 2.0710336188128998e-05, + "loss": 0.8997, + "step": 7926 + }, + { + "epoch": 0.7082895883128197, + "grad_norm": 0.5061736106872559, + "learning_rate": 2.0698609470306885e-05, + "loss": 0.8947, + "step": 7927 + }, + { + "epoch": 0.7083789398440816, + "grad_norm": 0.45645084977149963, + "learning_rate": 2.0686885206788565e-05, + "loss": 0.9493, + "step": 7928 + }, + { + "epoch": 0.7084682913753434, + "grad_norm": 0.4615998864173889, + "learning_rate": 2.0675163398556073e-05, + "loss": 0.9419, + "step": 7929 + }, + { + "epoch": 0.7085576429066053, + "grad_norm": 0.5820156335830688, + "learning_rate": 2.066344404659122e-05, + "loss": 0.8803, + "step": 7930 + }, + { + "epoch": 0.7086469944378672, + "grad_norm": 0.48079875111579895, + "learning_rate": 2.065172715187562e-05, + "loss": 0.9185, + "step": 7931 + }, + { + "epoch": 0.708736345969129, + "grad_norm": 0.48415642976760864, + "learning_rate": 2.064001271539071e-05, + "loss": 0.9558, + "step": 7932 + }, + { + "epoch": 0.7088256975003909, + "grad_norm": 0.5177109241485596, + "learning_rate": 2.062830073811769e-05, + "loss": 0.9591, + "step": 7933 + }, + { + "epoch": 0.7089150490316528, + "grad_norm": 0.4747374653816223, + "learning_rate": 2.061659122103756e-05, + "loss": 1.048, + "step": 7934 + }, + { + "epoch": 0.7090044005629147, + "grad_norm": 0.5308428406715393, + "learning_rate": 2.0604884165131122e-05, + "loss": 0.8741, + "step": 7935 + }, + { + "epoch": 0.7090937520941765, + "grad_norm": 0.45492875576019287, + "learning_rate": 2.0593179571378964e-05, + "loss": 0.9447, + "step": 7936 + }, + { + "epoch": 0.7091831036254383, + "grad_norm": 0.534234881401062, + "learning_rate": 2.0581477440761488e-05, + "loss": 0.8724, + "step": 7937 + }, + { + "epoch": 0.7092724551567002, + "grad_norm": 0.4276201128959656, + "learning_rate": 2.0569777774258842e-05, + "loss": 0.9492, + "step": 7938 + }, + { + "epoch": 0.7093618066879621, + "grad_norm": 0.4283556044101715, + "learning_rate": 2.0558080572851002e-05, + "loss": 1.0475, + "step": 7939 + }, + { + "epoch": 0.709451158219224, + "grad_norm": 0.458793580532074, + "learning_rate": 2.054638583751775e-05, + "loss": 0.9828, + "step": 7940 + }, + { + "epoch": 0.7095405097504859, + "grad_norm": 0.41058874130249023, + "learning_rate": 2.053469356923865e-05, + "loss": 0.9937, + "step": 7941 + }, + { + "epoch": 0.7096298612817478, + "grad_norm": 0.4860825538635254, + "learning_rate": 2.0523003768993025e-05, + "loss": 0.9317, + "step": 7942 + }, + { + "epoch": 0.7097192128130095, + "grad_norm": 0.46478596329689026, + "learning_rate": 2.0511316437760042e-05, + "loss": 0.9276, + "step": 7943 + }, + { + "epoch": 0.7098085643442714, + "grad_norm": 0.504979133605957, + "learning_rate": 2.049963157651863e-05, + "loss": 0.9422, + "step": 7944 + }, + { + "epoch": 0.7098979158755333, + "grad_norm": 0.5935678482055664, + "learning_rate": 2.0487949186247524e-05, + "loss": 0.8924, + "step": 7945 + }, + { + "epoch": 0.7099872674067952, + "grad_norm": 0.4765070080757141, + "learning_rate": 2.0476269267925247e-05, + "loss": 0.9904, + "step": 7946 + }, + { + "epoch": 0.7100766189380571, + "grad_norm": 0.4645882844924927, + "learning_rate": 2.0464591822530123e-05, + "loss": 0.9594, + "step": 7947 + }, + { + "epoch": 0.710165970469319, + "grad_norm": 0.4871085584163666, + "learning_rate": 2.0452916851040256e-05, + "loss": 0.8937, + "step": 7948 + }, + { + "epoch": 0.7102553220005808, + "grad_norm": 0.44038885831832886, + "learning_rate": 2.0441244354433568e-05, + "loss": 0.951, + "step": 7949 + }, + { + "epoch": 0.7103446735318426, + "grad_norm": 0.5087218880653381, + "learning_rate": 2.042957433368773e-05, + "loss": 0.9512, + "step": 7950 + }, + { + "epoch": 0.7104340250631045, + "grad_norm": 0.5866579413414001, + "learning_rate": 2.0417906789780235e-05, + "loss": 0.8355, + "step": 7951 + }, + { + "epoch": 0.7105233765943664, + "grad_norm": 0.4508510231971741, + "learning_rate": 2.0406241723688362e-05, + "loss": 0.9801, + "step": 7952 + }, + { + "epoch": 0.7106127281256283, + "grad_norm": 0.4516523778438568, + "learning_rate": 2.0394579136389203e-05, + "loss": 0.9136, + "step": 7953 + }, + { + "epoch": 0.7107020796568901, + "grad_norm": 0.45436009764671326, + "learning_rate": 2.0382919028859605e-05, + "loss": 0.8881, + "step": 7954 + }, + { + "epoch": 0.710791431188152, + "grad_norm": 0.6993239521980286, + "learning_rate": 2.0371261402076236e-05, + "loss": 0.9434, + "step": 7955 + }, + { + "epoch": 0.7108807827194139, + "grad_norm": 0.47904080152511597, + "learning_rate": 2.0359606257015546e-05, + "loss": 1.0162, + "step": 7956 + }, + { + "epoch": 0.7109701342506757, + "grad_norm": 0.4850071370601654, + "learning_rate": 2.034795359465377e-05, + "loss": 0.9378, + "step": 7957 + }, + { + "epoch": 0.7110594857819376, + "grad_norm": 0.5238434672355652, + "learning_rate": 2.0336303415966952e-05, + "loss": 0.9051, + "step": 7958 + }, + { + "epoch": 0.7111488373131994, + "grad_norm": 0.5065889954566956, + "learning_rate": 2.0324655721930937e-05, + "loss": 0.9456, + "step": 7959 + }, + { + "epoch": 0.7112381888444613, + "grad_norm": 0.5274893045425415, + "learning_rate": 2.0313010513521298e-05, + "loss": 0.9309, + "step": 7960 + }, + { + "epoch": 0.7113275403757232, + "grad_norm": 0.47935160994529724, + "learning_rate": 2.030136779171347e-05, + "loss": 0.9542, + "step": 7961 + }, + { + "epoch": 0.7114168919069851, + "grad_norm": 0.4876161515712738, + "learning_rate": 2.0289727557482656e-05, + "loss": 0.881, + "step": 7962 + }, + { + "epoch": 0.711506243438247, + "grad_norm": 0.49741584062576294, + "learning_rate": 2.0278089811803846e-05, + "loss": 0.9275, + "step": 7963 + }, + { + "epoch": 0.7115955949695087, + "grad_norm": 0.5018129944801331, + "learning_rate": 2.0266454555651825e-05, + "loss": 0.9503, + "step": 7964 + }, + { + "epoch": 0.7116849465007706, + "grad_norm": 0.46502235531806946, + "learning_rate": 2.0254821790001177e-05, + "loss": 0.96, + "step": 7965 + }, + { + "epoch": 0.7117742980320325, + "grad_norm": 0.5179951786994934, + "learning_rate": 2.0243191515826265e-05, + "loss": 1.0107, + "step": 7966 + }, + { + "epoch": 0.7118636495632944, + "grad_norm": 0.49856871366500854, + "learning_rate": 2.0231563734101243e-05, + "loss": 0.9671, + "step": 7967 + }, + { + "epoch": 0.7119530010945563, + "grad_norm": 0.4230417013168335, + "learning_rate": 2.0219938445800074e-05, + "loss": 0.9217, + "step": 7968 + }, + { + "epoch": 0.7120423526258182, + "grad_norm": 0.5896261930465698, + "learning_rate": 2.0208315651896504e-05, + "loss": 0.9198, + "step": 7969 + }, + { + "epoch": 0.7121317041570799, + "grad_norm": 0.46407637000083923, + "learning_rate": 2.0196695353364042e-05, + "loss": 1.0145, + "step": 7970 + }, + { + "epoch": 0.7122210556883418, + "grad_norm": 0.39339298009872437, + "learning_rate": 2.018507755117602e-05, + "loss": 0.9654, + "step": 7971 + }, + { + "epoch": 0.7123104072196037, + "grad_norm": 0.5174713730812073, + "learning_rate": 2.017346224630556e-05, + "loss": 0.9395, + "step": 7972 + }, + { + "epoch": 0.7123997587508656, + "grad_norm": 0.5631799697875977, + "learning_rate": 2.0161849439725565e-05, + "loss": 0.8938, + "step": 7973 + }, + { + "epoch": 0.7124891102821275, + "grad_norm": 0.4893137812614441, + "learning_rate": 2.015023913240875e-05, + "loss": 0.9324, + "step": 7974 + }, + { + "epoch": 0.7125784618133894, + "grad_norm": 0.495174765586853, + "learning_rate": 2.0138631325327563e-05, + "loss": 0.9131, + "step": 7975 + }, + { + "epoch": 0.7126678133446512, + "grad_norm": 0.4327716827392578, + "learning_rate": 2.0127026019454304e-05, + "loss": 0.9635, + "step": 7976 + }, + { + "epoch": 0.712757164875913, + "grad_norm": 0.3611029386520386, + "learning_rate": 2.0115423215761037e-05, + "loss": 0.9878, + "step": 7977 + }, + { + "epoch": 0.7128465164071749, + "grad_norm": 0.4893472492694855, + "learning_rate": 2.0103822915219624e-05, + "loss": 0.9732, + "step": 7978 + }, + { + "epoch": 0.7129358679384368, + "grad_norm": 0.4027042090892792, + "learning_rate": 2.0092225118801706e-05, + "loss": 0.9217, + "step": 7979 + }, + { + "epoch": 0.7130252194696987, + "grad_norm": 0.4319385290145874, + "learning_rate": 2.0080629827478753e-05, + "loss": 0.92, + "step": 7980 + }, + { + "epoch": 0.7131145710009605, + "grad_norm": 0.48171430826187134, + "learning_rate": 2.006903704222195e-05, + "loss": 0.8967, + "step": 7981 + }, + { + "epoch": 0.7132039225322224, + "grad_norm": 0.5436350703239441, + "learning_rate": 2.0057446764002337e-05, + "loss": 0.9484, + "step": 7982 + }, + { + "epoch": 0.7132932740634843, + "grad_norm": 0.42786580324172974, + "learning_rate": 2.004585899379072e-05, + "loss": 1.0898, + "step": 7983 + }, + { + "epoch": 0.7133826255947461, + "grad_norm": 0.4779389798641205, + "learning_rate": 2.00342737325577e-05, + "loss": 0.9715, + "step": 7984 + }, + { + "epoch": 0.713471977126008, + "grad_norm": 0.47203120589256287, + "learning_rate": 2.0022690981273666e-05, + "loss": 0.9221, + "step": 7985 + }, + { + "epoch": 0.7135613286572698, + "grad_norm": 0.43961775302886963, + "learning_rate": 2.0011110740908802e-05, + "loss": 0.9608, + "step": 7986 + }, + { + "epoch": 0.7136506801885317, + "grad_norm": 0.5077892541885376, + "learning_rate": 1.999953301243307e-05, + "loss": 0.9414, + "step": 7987 + }, + { + "epoch": 0.7137400317197936, + "grad_norm": 0.5579219460487366, + "learning_rate": 1.998795779681623e-05, + "loss": 0.9396, + "step": 7988 + }, + { + "epoch": 0.7138293832510555, + "grad_norm": 0.5244463682174683, + "learning_rate": 1.9976385095027826e-05, + "loss": 0.938, + "step": 7989 + }, + { + "epoch": 0.7139187347823174, + "grad_norm": 0.5091515779495239, + "learning_rate": 1.9964814908037223e-05, + "loss": 0.9086, + "step": 7990 + }, + { + "epoch": 0.7140080863135791, + "grad_norm": 0.5169358253479004, + "learning_rate": 1.9953247236813504e-05, + "loss": 0.9151, + "step": 7991 + }, + { + "epoch": 0.714097437844841, + "grad_norm": 0.509032666683197, + "learning_rate": 1.9941682082325602e-05, + "loss": 0.8878, + "step": 7992 + }, + { + "epoch": 0.7141867893761029, + "grad_norm": 0.3874991238117218, + "learning_rate": 1.9930119445542227e-05, + "loss": 0.9988, + "step": 7993 + }, + { + "epoch": 0.7142761409073648, + "grad_norm": 0.48893627524375916, + "learning_rate": 1.991855932743187e-05, + "loss": 0.9737, + "step": 7994 + }, + { + "epoch": 0.7143654924386267, + "grad_norm": 0.5337826013565063, + "learning_rate": 1.990700172896281e-05, + "loss": 0.9399, + "step": 7995 + }, + { + "epoch": 0.7144548439698886, + "grad_norm": 0.5026716589927673, + "learning_rate": 1.989544665110313e-05, + "loss": 0.9656, + "step": 7996 + }, + { + "epoch": 0.7145441955011504, + "grad_norm": 0.5005045533180237, + "learning_rate": 1.988389409482068e-05, + "loss": 1.0192, + "step": 7997 + }, + { + "epoch": 0.7146335470324122, + "grad_norm": 0.45082736015319824, + "learning_rate": 1.9872344061083113e-05, + "loss": 0.9364, + "step": 7998 + }, + { + "epoch": 0.7147228985636741, + "grad_norm": 0.4575754702091217, + "learning_rate": 1.9860796550857872e-05, + "loss": 0.9433, + "step": 7999 + }, + { + "epoch": 0.714812250094936, + "grad_norm": 0.5190432071685791, + "learning_rate": 1.98492515651122e-05, + "loss": 0.9311, + "step": 8000 + }, + { + "epoch": 0.7149016016261979, + "grad_norm": 0.4815937578678131, + "learning_rate": 1.9837709104813073e-05, + "loss": 0.9925, + "step": 8001 + }, + { + "epoch": 0.7149909531574598, + "grad_norm": 0.4584693908691406, + "learning_rate": 1.9826169170927317e-05, + "loss": 0.9623, + "step": 8002 + }, + { + "epoch": 0.7150803046887216, + "grad_norm": 0.4569786489009857, + "learning_rate": 1.9814631764421522e-05, + "loss": 0.9571, + "step": 8003 + }, + { + "epoch": 0.7151696562199835, + "grad_norm": 0.47242796421051025, + "learning_rate": 1.9803096886262068e-05, + "loss": 0.864, + "step": 8004 + }, + { + "epoch": 0.7152590077512453, + "grad_norm": 0.4647274613380432, + "learning_rate": 1.9791564537415124e-05, + "loss": 0.9579, + "step": 8005 + }, + { + "epoch": 0.7153483592825072, + "grad_norm": 0.4390351474285126, + "learning_rate": 1.978003471884665e-05, + "loss": 0.9682, + "step": 8006 + }, + { + "epoch": 0.7154377108137691, + "grad_norm": 0.4374440908432007, + "learning_rate": 1.976850743152241e-05, + "loss": 0.9661, + "step": 8007 + }, + { + "epoch": 0.7155270623450309, + "grad_norm": 0.48356321454048157, + "learning_rate": 1.97569826764079e-05, + "loss": 0.9314, + "step": 8008 + }, + { + "epoch": 0.7156164138762928, + "grad_norm": 0.37716495990753174, + "learning_rate": 1.9745460454468457e-05, + "loss": 0.9704, + "step": 8009 + }, + { + "epoch": 0.7157057654075547, + "grad_norm": 0.3916030526161194, + "learning_rate": 1.973394076666919e-05, + "loss": 0.9635, + "step": 8010 + }, + { + "epoch": 0.7157951169388166, + "grad_norm": 0.5709711909294128, + "learning_rate": 1.9722423613975016e-05, + "loss": 0.9523, + "step": 8011 + }, + { + "epoch": 0.7158844684700784, + "grad_norm": 0.4928154945373535, + "learning_rate": 1.971090899735058e-05, + "loss": 0.8978, + "step": 8012 + }, + { + "epoch": 0.7159738200013402, + "grad_norm": 0.4418443739414215, + "learning_rate": 1.9699396917760377e-05, + "loss": 0.8998, + "step": 8013 + }, + { + "epoch": 0.7160631715326021, + "grad_norm": 0.5247768759727478, + "learning_rate": 1.9687887376168663e-05, + "loss": 1.0516, + "step": 8014 + }, + { + "epoch": 0.716152523063864, + "grad_norm": 0.5156433582305908, + "learning_rate": 1.9676380373539482e-05, + "loss": 0.9109, + "step": 8015 + }, + { + "epoch": 0.7162418745951259, + "grad_norm": 0.4411882758140564, + "learning_rate": 1.9664875910836677e-05, + "loss": 0.9728, + "step": 8016 + }, + { + "epoch": 0.7163312261263878, + "grad_norm": 0.5197750329971313, + "learning_rate": 1.965337398902386e-05, + "loss": 0.9223, + "step": 8017 + }, + { + "epoch": 0.7164205776576497, + "grad_norm": 0.45580387115478516, + "learning_rate": 1.9641874609064443e-05, + "loss": 0.9332, + "step": 8018 + }, + { + "epoch": 0.7165099291889114, + "grad_norm": 0.4132586419582367, + "learning_rate": 1.963037777192162e-05, + "loss": 0.9757, + "step": 8019 + }, + { + "epoch": 0.7165992807201733, + "grad_norm": 0.5496894121170044, + "learning_rate": 1.96188834785584e-05, + "loss": 1.0128, + "step": 8020 + }, + { + "epoch": 0.7166886322514352, + "grad_norm": 0.4949209988117218, + "learning_rate": 1.9607391729937503e-05, + "loss": 0.9594, + "step": 8021 + }, + { + "epoch": 0.7167779837826971, + "grad_norm": 0.4753008782863617, + "learning_rate": 1.9595902527021513e-05, + "loss": 0.9873, + "step": 8022 + }, + { + "epoch": 0.716867335313959, + "grad_norm": 0.4486302435398102, + "learning_rate": 1.958441587077277e-05, + "loss": 0.8871, + "step": 8023 + }, + { + "epoch": 0.7169566868452208, + "grad_norm": 0.42516544461250305, + "learning_rate": 1.9572931762153407e-05, + "loss": 1.0009, + "step": 8024 + }, + { + "epoch": 0.7170460383764827, + "grad_norm": 0.4175739288330078, + "learning_rate": 1.9561450202125337e-05, + "loss": 0.9372, + "step": 8025 + }, + { + "epoch": 0.7171353899077445, + "grad_norm": 0.5125389695167542, + "learning_rate": 1.9549971191650262e-05, + "loss": 0.9469, + "step": 8026 + }, + { + "epoch": 0.7172247414390064, + "grad_norm": 0.5798017382621765, + "learning_rate": 1.953849473168968e-05, + "loss": 0.8865, + "step": 8027 + }, + { + "epoch": 0.7173140929702683, + "grad_norm": 0.4292055666446686, + "learning_rate": 1.9527020823204856e-05, + "loss": 0.991, + "step": 8028 + }, + { + "epoch": 0.7174034445015302, + "grad_norm": 0.41361159086227417, + "learning_rate": 1.951554946715686e-05, + "loss": 0.9773, + "step": 8029 + }, + { + "epoch": 0.717492796032792, + "grad_norm": 0.5009101629257202, + "learning_rate": 1.9504080664506546e-05, + "loss": 0.9358, + "step": 8030 + }, + { + "epoch": 0.7175821475640539, + "grad_norm": 0.6405025124549866, + "learning_rate": 1.9492614416214526e-05, + "loss": 0.9336, + "step": 8031 + }, + { + "epoch": 0.7176714990953158, + "grad_norm": 0.440360426902771, + "learning_rate": 1.9481150723241236e-05, + "loss": 0.923, + "step": 8032 + }, + { + "epoch": 0.7177608506265776, + "grad_norm": 0.4327067732810974, + "learning_rate": 1.946968958654688e-05, + "loss": 0.9564, + "step": 8033 + }, + { + "epoch": 0.7178502021578395, + "grad_norm": 0.6656114459037781, + "learning_rate": 1.9458231007091456e-05, + "loss": 0.9387, + "step": 8034 + }, + { + "epoch": 0.7179395536891013, + "grad_norm": 0.5067597031593323, + "learning_rate": 1.9446774985834726e-05, + "loss": 0.905, + "step": 8035 + }, + { + "epoch": 0.7180289052203632, + "grad_norm": 0.48535487055778503, + "learning_rate": 1.943532152373627e-05, + "loss": 0.8893, + "step": 8036 + }, + { + "epoch": 0.7181182567516251, + "grad_norm": 0.6070266962051392, + "learning_rate": 1.9423870621755434e-05, + "loss": 0.8946, + "step": 8037 + }, + { + "epoch": 0.718207608282887, + "grad_norm": 0.5179934501647949, + "learning_rate": 1.941242228085135e-05, + "loss": 0.8965, + "step": 8038 + }, + { + "epoch": 0.7182969598141488, + "grad_norm": 0.49246200919151306, + "learning_rate": 1.9400976501982943e-05, + "loss": 0.8797, + "step": 8039 + }, + { + "epoch": 0.7183863113454106, + "grad_norm": 0.47545865178108215, + "learning_rate": 1.938953328610893e-05, + "loss": 0.9602, + "step": 8040 + }, + { + "epoch": 0.7184756628766725, + "grad_norm": 0.5661696791648865, + "learning_rate": 1.9378092634187772e-05, + "loss": 0.8881, + "step": 8041 + }, + { + "epoch": 0.7185650144079344, + "grad_norm": 0.5181058645248413, + "learning_rate": 1.9366654547177764e-05, + "loss": 0.9065, + "step": 8042 + }, + { + "epoch": 0.7186543659391963, + "grad_norm": 0.42876681685447693, + "learning_rate": 1.9355219026036965e-05, + "loss": 1.0135, + "step": 8043 + }, + { + "epoch": 0.7187437174704582, + "grad_norm": 0.5706227421760559, + "learning_rate": 1.934378607172324e-05, + "loss": 0.9067, + "step": 8044 + }, + { + "epoch": 0.7188330690017201, + "grad_norm": 0.4501083791255951, + "learning_rate": 1.9332355685194182e-05, + "loss": 0.9817, + "step": 8045 + }, + { + "epoch": 0.7189224205329818, + "grad_norm": 0.5079956650733948, + "learning_rate": 1.9320927867407234e-05, + "loss": 0.9354, + "step": 8046 + }, + { + "epoch": 0.7190117720642437, + "grad_norm": 0.43281540274620056, + "learning_rate": 1.9309502619319592e-05, + "loss": 0.9358, + "step": 8047 + }, + { + "epoch": 0.7191011235955056, + "grad_norm": 0.45937132835388184, + "learning_rate": 1.9298079941888237e-05, + "loss": 0.9387, + "step": 8048 + }, + { + "epoch": 0.7191904751267675, + "grad_norm": 0.572298526763916, + "learning_rate": 1.9286659836069953e-05, + "loss": 0.8596, + "step": 8049 + }, + { + "epoch": 0.7192798266580294, + "grad_norm": 0.4632246494293213, + "learning_rate": 1.9275242302821302e-05, + "loss": 0.9057, + "step": 8050 + }, + { + "epoch": 0.7193691781892912, + "grad_norm": 0.46226373314857483, + "learning_rate": 1.9263827343098594e-05, + "loss": 0.9242, + "step": 8051 + }, + { + "epoch": 0.7194585297205531, + "grad_norm": 0.4486196041107178, + "learning_rate": 1.9252414957857966e-05, + "loss": 0.9282, + "step": 8052 + }, + { + "epoch": 0.7195478812518149, + "grad_norm": 0.5384345054626465, + "learning_rate": 1.9241005148055336e-05, + "loss": 0.904, + "step": 8053 + }, + { + "epoch": 0.7196372327830768, + "grad_norm": 0.5714906454086304, + "learning_rate": 1.922959791464639e-05, + "loss": 0.8597, + "step": 8054 + }, + { + "epoch": 0.7197265843143387, + "grad_norm": 0.5070673227310181, + "learning_rate": 1.921819325858661e-05, + "loss": 0.9542, + "step": 8055 + }, + { + "epoch": 0.7198159358456006, + "grad_norm": 0.5547393560409546, + "learning_rate": 1.920679118083125e-05, + "loss": 0.892, + "step": 8056 + }, + { + "epoch": 0.7199052873768624, + "grad_norm": 0.4647422134876251, + "learning_rate": 1.9195391682335368e-05, + "loss": 0.8927, + "step": 8057 + }, + { + "epoch": 0.7199946389081243, + "grad_norm": 0.4780432879924774, + "learning_rate": 1.918399476405378e-05, + "loss": 0.964, + "step": 8058 + }, + { + "epoch": 0.7200839904393862, + "grad_norm": 0.5166242122650146, + "learning_rate": 1.9172600426941113e-05, + "loss": 0.9151, + "step": 8059 + }, + { + "epoch": 0.720173341970648, + "grad_norm": 0.4829009771347046, + "learning_rate": 1.9161208671951763e-05, + "loss": 0.9347, + "step": 8060 + }, + { + "epoch": 0.7202626935019099, + "grad_norm": 0.5364348292350769, + "learning_rate": 1.9149819500039896e-05, + "loss": 0.9733, + "step": 8061 + }, + { + "epoch": 0.7203520450331717, + "grad_norm": 0.43714621663093567, + "learning_rate": 1.913843291215948e-05, + "loss": 0.9185, + "step": 8062 + }, + { + "epoch": 0.7204413965644336, + "grad_norm": 0.5605136752128601, + "learning_rate": 1.912704890926427e-05, + "loss": 0.8812, + "step": 8063 + }, + { + "epoch": 0.7205307480956955, + "grad_norm": 0.5223738551139832, + "learning_rate": 1.9115667492307797e-05, + "loss": 0.8438, + "step": 8064 + }, + { + "epoch": 0.7206200996269574, + "grad_norm": 0.46035128831863403, + "learning_rate": 1.9104288662243375e-05, + "loss": 0.9511, + "step": 8065 + }, + { + "epoch": 0.7207094511582193, + "grad_norm": 0.3924838602542877, + "learning_rate": 1.90929124200241e-05, + "loss": 0.9798, + "step": 8066 + }, + { + "epoch": 0.720798802689481, + "grad_norm": 0.47366228699684143, + "learning_rate": 1.9081538766602857e-05, + "loss": 1.0173, + "step": 8067 + }, + { + "epoch": 0.7208881542207429, + "grad_norm": 0.4460136294364929, + "learning_rate": 1.9070167702932313e-05, + "loss": 0.9475, + "step": 8068 + }, + { + "epoch": 0.7209775057520048, + "grad_norm": 0.4173930883407593, + "learning_rate": 1.9058799229964907e-05, + "loss": 0.9413, + "step": 8069 + }, + { + "epoch": 0.7210668572832667, + "grad_norm": 0.5706358551979065, + "learning_rate": 1.9047433348652876e-05, + "loss": 0.8534, + "step": 8070 + }, + { + "epoch": 0.7211562088145286, + "grad_norm": 0.47395849227905273, + "learning_rate": 1.9036070059948252e-05, + "loss": 0.9308, + "step": 8071 + }, + { + "epoch": 0.7212455603457905, + "grad_norm": 0.461373507976532, + "learning_rate": 1.9024709364802795e-05, + "loss": 0.8572, + "step": 8072 + }, + { + "epoch": 0.7213349118770523, + "grad_norm": 0.40039876103401184, + "learning_rate": 1.90133512641681e-05, + "loss": 0.9251, + "step": 8073 + }, + { + "epoch": 0.7214242634083141, + "grad_norm": 0.4980182945728302, + "learning_rate": 1.9001995758995533e-05, + "loss": 0.9338, + "step": 8074 + }, + { + "epoch": 0.721513614939576, + "grad_norm": 0.4447602927684784, + "learning_rate": 1.899064285023623e-05, + "loss": 0.9201, + "step": 8075 + }, + { + "epoch": 0.7216029664708379, + "grad_norm": 0.404520183801651, + "learning_rate": 1.8979292538841132e-05, + "loss": 0.9922, + "step": 8076 + }, + { + "epoch": 0.7216923180020998, + "grad_norm": 0.4557284116744995, + "learning_rate": 1.8967944825760954e-05, + "loss": 0.9549, + "step": 8077 + }, + { + "epoch": 0.7217816695333616, + "grad_norm": 0.45047318935394287, + "learning_rate": 1.8956599711946156e-05, + "loss": 0.9628, + "step": 8078 + }, + { + "epoch": 0.7218710210646235, + "grad_norm": 0.4499797224998474, + "learning_rate": 1.8945257198347034e-05, + "loss": 0.9302, + "step": 8079 + }, + { + "epoch": 0.7219603725958854, + "grad_norm": 0.4355414807796478, + "learning_rate": 1.893391728591364e-05, + "loss": 0.942, + "step": 8080 + }, + { + "epoch": 0.7220497241271472, + "grad_norm": 0.5795384049415588, + "learning_rate": 1.8922579975595835e-05, + "loss": 0.8788, + "step": 8081 + }, + { + "epoch": 0.7221390756584091, + "grad_norm": 0.5405740737915039, + "learning_rate": 1.891124526834319e-05, + "loss": 0.9337, + "step": 8082 + }, + { + "epoch": 0.722228427189671, + "grad_norm": 0.4552251696586609, + "learning_rate": 1.889991316510515e-05, + "loss": 1.0248, + "step": 8083 + }, + { + "epoch": 0.7223177787209328, + "grad_norm": 0.3918149471282959, + "learning_rate": 1.8888583666830876e-05, + "loss": 1.0259, + "step": 8084 + }, + { + "epoch": 0.7224071302521947, + "grad_norm": 0.5067173838615417, + "learning_rate": 1.887725677446935e-05, + "loss": 0.9839, + "step": 8085 + }, + { + "epoch": 0.7224964817834566, + "grad_norm": 0.5742607116699219, + "learning_rate": 1.8865932488969307e-05, + "loss": 0.8916, + "step": 8086 + }, + { + "epoch": 0.7225858333147185, + "grad_norm": 0.5914955735206604, + "learning_rate": 1.8854610811279288e-05, + "loss": 0.9132, + "step": 8087 + }, + { + "epoch": 0.7226751848459803, + "grad_norm": 0.479939341545105, + "learning_rate": 1.88432917423476e-05, + "loss": 1.0004, + "step": 8088 + }, + { + "epoch": 0.7227645363772421, + "grad_norm": 0.5861053466796875, + "learning_rate": 1.883197528312233e-05, + "loss": 0.9021, + "step": 8089 + }, + { + "epoch": 0.722853887908504, + "grad_norm": 0.607997715473175, + "learning_rate": 1.8820661434551362e-05, + "loss": 0.8781, + "step": 8090 + }, + { + "epoch": 0.7229432394397659, + "grad_norm": 0.4828149378299713, + "learning_rate": 1.8809350197582364e-05, + "loss": 0.865, + "step": 8091 + }, + { + "epoch": 0.7230325909710278, + "grad_norm": 0.6091709136962891, + "learning_rate": 1.879804157316274e-05, + "loss": 0.9282, + "step": 8092 + }, + { + "epoch": 0.7231219425022897, + "grad_norm": 0.5312778353691101, + "learning_rate": 1.878673556223972e-05, + "loss": 0.9184, + "step": 8093 + }, + { + "epoch": 0.7232112940335516, + "grad_norm": 0.41062840819358826, + "learning_rate": 1.8775432165760303e-05, + "loss": 0.9453, + "step": 8094 + }, + { + "epoch": 0.7233006455648133, + "grad_norm": 0.6178465485572815, + "learning_rate": 1.876413138467128e-05, + "loss": 0.9003, + "step": 8095 + }, + { + "epoch": 0.7233899970960752, + "grad_norm": 0.4309876561164856, + "learning_rate": 1.8752833219919197e-05, + "loss": 0.9821, + "step": 8096 + }, + { + "epoch": 0.7234793486273371, + "grad_norm": 0.5962274670600891, + "learning_rate": 1.8741537672450405e-05, + "loss": 0.9284, + "step": 8097 + }, + { + "epoch": 0.723568700158599, + "grad_norm": 0.4582142233848572, + "learning_rate": 1.8730244743211027e-05, + "loss": 0.938, + "step": 8098 + }, + { + "epoch": 0.7236580516898609, + "grad_norm": 0.4365943670272827, + "learning_rate": 1.8718954433146963e-05, + "loss": 0.9746, + "step": 8099 + }, + { + "epoch": 0.7237474032211227, + "grad_norm": 0.4775666892528534, + "learning_rate": 1.8707666743203893e-05, + "loss": 0.9665, + "step": 8100 + }, + { + "epoch": 0.7238367547523845, + "grad_norm": 0.43144118785858154, + "learning_rate": 1.8696381674327308e-05, + "loss": 0.9532, + "step": 8101 + }, + { + "epoch": 0.7239261062836464, + "grad_norm": 0.5456085205078125, + "learning_rate": 1.8685099227462406e-05, + "loss": 0.9138, + "step": 8102 + }, + { + "epoch": 0.7240154578149083, + "grad_norm": 0.44839584827423096, + "learning_rate": 1.8673819403554244e-05, + "loss": 0.9408, + "step": 8103 + }, + { + "epoch": 0.7241048093461702, + "grad_norm": 0.4968213737010956, + "learning_rate": 1.8662542203547616e-05, + "loss": 0.9101, + "step": 8104 + }, + { + "epoch": 0.724194160877432, + "grad_norm": 0.5285754203796387, + "learning_rate": 1.8651267628387108e-05, + "loss": 0.8168, + "step": 8105 + }, + { + "epoch": 0.7242835124086939, + "grad_norm": 0.4092321991920471, + "learning_rate": 1.86399956790171e-05, + "loss": 0.9397, + "step": 8106 + }, + { + "epoch": 0.7243728639399558, + "grad_norm": 0.6239436864852905, + "learning_rate": 1.862872635638172e-05, + "loss": 0.8964, + "step": 8107 + }, + { + "epoch": 0.7244622154712176, + "grad_norm": 0.5783222913742065, + "learning_rate": 1.86174596614249e-05, + "loss": 0.8807, + "step": 8108 + }, + { + "epoch": 0.7245515670024795, + "grad_norm": 0.43302804231643677, + "learning_rate": 1.8606195595090354e-05, + "loss": 0.9818, + "step": 8109 + }, + { + "epoch": 0.7246409185337414, + "grad_norm": 0.4686475694179535, + "learning_rate": 1.859493415832157e-05, + "loss": 0.9568, + "step": 8110 + }, + { + "epoch": 0.7247302700650032, + "grad_norm": 0.6813352704048157, + "learning_rate": 1.8583675352061792e-05, + "loss": 0.878, + "step": 8111 + }, + { + "epoch": 0.7248196215962651, + "grad_norm": 0.5830407738685608, + "learning_rate": 1.857241917725408e-05, + "loss": 0.8521, + "step": 8112 + }, + { + "epoch": 0.724908973127527, + "grad_norm": 0.5068725347518921, + "learning_rate": 1.856116563484125e-05, + "loss": 0.8299, + "step": 8113 + }, + { + "epoch": 0.7249983246587889, + "grad_norm": 0.4167884886264801, + "learning_rate": 1.8549914725765932e-05, + "loss": 1.0106, + "step": 8114 + }, + { + "epoch": 0.7250876761900507, + "grad_norm": 0.5126107335090637, + "learning_rate": 1.853866645097047e-05, + "loss": 0.893, + "step": 8115 + }, + { + "epoch": 0.7251770277213125, + "grad_norm": 0.46296408772468567, + "learning_rate": 1.852742081139705e-05, + "loss": 0.8905, + "step": 8116 + }, + { + "epoch": 0.7252663792525744, + "grad_norm": 0.5143711566925049, + "learning_rate": 1.8516177807987606e-05, + "loss": 0.9687, + "step": 8117 + }, + { + "epoch": 0.7253557307838363, + "grad_norm": 0.5294638276100159, + "learning_rate": 1.8504937441683868e-05, + "loss": 0.931, + "step": 8118 + }, + { + "epoch": 0.7254450823150982, + "grad_norm": 0.4719218909740448, + "learning_rate": 1.8493699713427333e-05, + "loss": 0.9807, + "step": 8119 + }, + { + "epoch": 0.7255344338463601, + "grad_norm": 0.4698675870895386, + "learning_rate": 1.848246462415928e-05, + "loss": 1.0442, + "step": 8120 + }, + { + "epoch": 0.725623785377622, + "grad_norm": 0.6009805798530579, + "learning_rate": 1.847123217482078e-05, + "loss": 0.9352, + "step": 8121 + }, + { + "epoch": 0.7257131369088837, + "grad_norm": 0.4387056827545166, + "learning_rate": 1.846000236635264e-05, + "loss": 0.9247, + "step": 8122 + }, + { + "epoch": 0.7258024884401456, + "grad_norm": 0.5181190967559814, + "learning_rate": 1.8448775199695502e-05, + "loss": 0.8485, + "step": 8123 + }, + { + "epoch": 0.7258918399714075, + "grad_norm": 0.4301597774028778, + "learning_rate": 1.843755067578975e-05, + "loss": 0.9481, + "step": 8124 + }, + { + "epoch": 0.7259811915026694, + "grad_norm": 0.49605125188827515, + "learning_rate": 1.842632879557556e-05, + "loss": 0.9434, + "step": 8125 + }, + { + "epoch": 0.7260705430339313, + "grad_norm": 0.43525439500808716, + "learning_rate": 1.8415109559992882e-05, + "loss": 0.9441, + "step": 8126 + }, + { + "epoch": 0.7261598945651931, + "grad_norm": 0.5193489789962769, + "learning_rate": 1.840389296998145e-05, + "loss": 0.9361, + "step": 8127 + }, + { + "epoch": 0.726249246096455, + "grad_norm": 0.45760974287986755, + "learning_rate": 1.8392679026480774e-05, + "loss": 0.9345, + "step": 8128 + }, + { + "epoch": 0.7263385976277168, + "grad_norm": 0.6482862234115601, + "learning_rate": 1.8381467730430134e-05, + "loss": 0.8445, + "step": 8129 + }, + { + "epoch": 0.7264279491589787, + "grad_norm": 0.45973724126815796, + "learning_rate": 1.837025908276861e-05, + "loss": 0.896, + "step": 8130 + }, + { + "epoch": 0.7265173006902406, + "grad_norm": 0.4498787522315979, + "learning_rate": 1.8359053084435046e-05, + "loss": 0.9654, + "step": 8131 + }, + { + "epoch": 0.7266066522215024, + "grad_norm": 0.4632214903831482, + "learning_rate": 1.834784973636804e-05, + "loss": 0.9408, + "step": 8132 + }, + { + "epoch": 0.7266960037527643, + "grad_norm": 0.4368852972984314, + "learning_rate": 1.8336649039506004e-05, + "loss": 0.9919, + "step": 8133 + }, + { + "epoch": 0.7267853552840262, + "grad_norm": 0.4225790798664093, + "learning_rate": 1.8325450994787124e-05, + "loss": 0.9568, + "step": 8134 + }, + { + "epoch": 0.7268747068152881, + "grad_norm": 0.40643781423568726, + "learning_rate": 1.8314255603149345e-05, + "loss": 0.9723, + "step": 8135 + }, + { + "epoch": 0.7269640583465499, + "grad_norm": 0.47153598070144653, + "learning_rate": 1.8303062865530406e-05, + "loss": 0.8922, + "step": 8136 + }, + { + "epoch": 0.7270534098778118, + "grad_norm": 0.45243874192237854, + "learning_rate": 1.8291872782867813e-05, + "loss": 1.0081, + "step": 8137 + }, + { + "epoch": 0.7271427614090736, + "grad_norm": 0.4166507124900818, + "learning_rate": 1.8280685356098863e-05, + "loss": 0.8909, + "step": 8138 + }, + { + "epoch": 0.7272321129403355, + "grad_norm": 0.5738488435745239, + "learning_rate": 1.826950058616062e-05, + "loss": 0.9549, + "step": 8139 + }, + { + "epoch": 0.7273214644715974, + "grad_norm": 0.5272175073623657, + "learning_rate": 1.825831847398992e-05, + "loss": 0.9165, + "step": 8140 + }, + { + "epoch": 0.7274108160028593, + "grad_norm": 0.38618120551109314, + "learning_rate": 1.8247139020523412e-05, + "loss": 0.9462, + "step": 8141 + }, + { + "epoch": 0.7275001675341212, + "grad_norm": 0.459420382976532, + "learning_rate": 1.8235962226697457e-05, + "loss": 0.9248, + "step": 8142 + }, + { + "epoch": 0.7275895190653829, + "grad_norm": 0.5098559260368347, + "learning_rate": 1.822478809344824e-05, + "loss": 0.9542, + "step": 8143 + }, + { + "epoch": 0.7276788705966448, + "grad_norm": 0.5070344805717468, + "learning_rate": 1.8213616621711722e-05, + "loss": 1.0229, + "step": 8144 + }, + { + "epoch": 0.7277682221279067, + "grad_norm": 0.434377521276474, + "learning_rate": 1.8202447812423634e-05, + "loss": 0.9772, + "step": 8145 + }, + { + "epoch": 0.7278575736591686, + "grad_norm": 0.5484136939048767, + "learning_rate": 1.8191281666519473e-05, + "loss": 0.9173, + "step": 8146 + }, + { + "epoch": 0.7279469251904305, + "grad_norm": 0.4595695734024048, + "learning_rate": 1.8180118184934548e-05, + "loss": 0.9066, + "step": 8147 + }, + { + "epoch": 0.7280362767216924, + "grad_norm": 0.5228673219680786, + "learning_rate": 1.8168957368603884e-05, + "loss": 0.8998, + "step": 8148 + }, + { + "epoch": 0.7281256282529542, + "grad_norm": 0.5042482018470764, + "learning_rate": 1.8157799218462335e-05, + "loss": 0.9067, + "step": 8149 + }, + { + "epoch": 0.728214979784216, + "grad_norm": 0.5893993973731995, + "learning_rate": 1.814664373544452e-05, + "loss": 0.8643, + "step": 8150 + }, + { + "epoch": 0.7283043313154779, + "grad_norm": 0.4281430244445801, + "learning_rate": 1.813549092048483e-05, + "loss": 0.8613, + "step": 8151 + }, + { + "epoch": 0.7283936828467398, + "grad_norm": 0.5029839873313904, + "learning_rate": 1.8124340774517418e-05, + "loss": 0.9724, + "step": 8152 + }, + { + "epoch": 0.7284830343780017, + "grad_norm": 0.49639925360679626, + "learning_rate": 1.8113193298476232e-05, + "loss": 0.9772, + "step": 8153 + }, + { + "epoch": 0.7285723859092635, + "grad_norm": 0.44179990887641907, + "learning_rate": 1.8102048493295006e-05, + "loss": 0.9583, + "step": 8154 + }, + { + "epoch": 0.7286617374405254, + "grad_norm": 0.45038869976997375, + "learning_rate": 1.8090906359907216e-05, + "loss": 0.993, + "step": 8155 + }, + { + "epoch": 0.7287510889717873, + "grad_norm": 0.5203961133956909, + "learning_rate": 1.8079766899246148e-05, + "loss": 0.8979, + "step": 8156 + }, + { + "epoch": 0.7288404405030491, + "grad_norm": 0.5539426803588867, + "learning_rate": 1.8068630112244843e-05, + "loss": 0.999, + "step": 8157 + }, + { + "epoch": 0.728929792034311, + "grad_norm": 0.48159733414649963, + "learning_rate": 1.8057495999836137e-05, + "loss": 1.0195, + "step": 8158 + }, + { + "epoch": 0.7290191435655728, + "grad_norm": 0.45767495036125183, + "learning_rate": 1.8046364562952624e-05, + "loss": 0.9883, + "step": 8159 + }, + { + "epoch": 0.7291084950968347, + "grad_norm": 0.5231384038925171, + "learning_rate": 1.8035235802526674e-05, + "loss": 0.8853, + "step": 8160 + }, + { + "epoch": 0.7291978466280966, + "grad_norm": 0.46192824840545654, + "learning_rate": 1.802410971949045e-05, + "loss": 0.9542, + "step": 8161 + }, + { + "epoch": 0.7292871981593585, + "grad_norm": 0.40954524278640747, + "learning_rate": 1.801298631477589e-05, + "loss": 0.9083, + "step": 8162 + }, + { + "epoch": 0.7293765496906203, + "grad_norm": 0.5644052624702454, + "learning_rate": 1.8001865589314664e-05, + "loss": 0.8587, + "step": 8163 + }, + { + "epoch": 0.7294659012218822, + "grad_norm": 0.4511362612247467, + "learning_rate": 1.7990747544038277e-05, + "loss": 0.9583, + "step": 8164 + }, + { + "epoch": 0.729555252753144, + "grad_norm": 0.4605756103992462, + "learning_rate": 1.7979632179877974e-05, + "loss": 0.942, + "step": 8165 + }, + { + "epoch": 0.7296446042844059, + "grad_norm": 0.49011021852493286, + "learning_rate": 1.7968519497764784e-05, + "loss": 0.9492, + "step": 8166 + }, + { + "epoch": 0.7297339558156678, + "grad_norm": 0.48554643988609314, + "learning_rate": 1.7957409498629522e-05, + "loss": 0.9447, + "step": 8167 + }, + { + "epoch": 0.7298233073469297, + "grad_norm": 0.43161845207214355, + "learning_rate": 1.794630218340277e-05, + "loss": 0.9717, + "step": 8168 + }, + { + "epoch": 0.7299126588781916, + "grad_norm": 0.47446951270103455, + "learning_rate": 1.793519755301487e-05, + "loss": 0.8648, + "step": 8169 + }, + { + "epoch": 0.7300020104094533, + "grad_norm": 0.41278836131095886, + "learning_rate": 1.7924095608395963e-05, + "loss": 0.9728, + "step": 8170 + }, + { + "epoch": 0.7300913619407152, + "grad_norm": 0.4663233160972595, + "learning_rate": 1.7912996350475954e-05, + "loss": 0.9122, + "step": 8171 + }, + { + "epoch": 0.7301807134719771, + "grad_norm": 0.6070008873939514, + "learning_rate": 1.7901899780184537e-05, + "loss": 0.8931, + "step": 8172 + }, + { + "epoch": 0.730270065003239, + "grad_norm": 0.49870383739471436, + "learning_rate": 1.789080589845114e-05, + "loss": 0.8633, + "step": 8173 + }, + { + "epoch": 0.7303594165345009, + "grad_norm": 0.4976552426815033, + "learning_rate": 1.7879714706205008e-05, + "loss": 0.9654, + "step": 8174 + }, + { + "epoch": 0.7304487680657628, + "grad_norm": 0.43188008666038513, + "learning_rate": 1.786862620437515e-05, + "loss": 0.9809, + "step": 8175 + }, + { + "epoch": 0.7305381195970246, + "grad_norm": 0.4651055335998535, + "learning_rate": 1.7857540393890337e-05, + "loss": 0.9153, + "step": 8176 + }, + { + "epoch": 0.7306274711282864, + "grad_norm": 0.42811328172683716, + "learning_rate": 1.7846457275679136e-05, + "loss": 0.8451, + "step": 8177 + }, + { + "epoch": 0.7307168226595483, + "grad_norm": 0.4533650875091553, + "learning_rate": 1.7835376850669866e-05, + "loss": 0.9642, + "step": 8178 + }, + { + "epoch": 0.7308061741908102, + "grad_norm": 0.4616091549396515, + "learning_rate": 1.7824299119790637e-05, + "loss": 0.9361, + "step": 8179 + }, + { + "epoch": 0.7308955257220721, + "grad_norm": 0.4673561155796051, + "learning_rate": 1.7813224083969344e-05, + "loss": 0.8883, + "step": 8180 + }, + { + "epoch": 0.7309848772533339, + "grad_norm": 0.4838724136352539, + "learning_rate": 1.78021517441336e-05, + "loss": 0.8978, + "step": 8181 + }, + { + "epoch": 0.7310742287845958, + "grad_norm": 0.5045418739318848, + "learning_rate": 1.7791082101210853e-05, + "loss": 0.9195, + "step": 8182 + }, + { + "epoch": 0.7311635803158577, + "grad_norm": 0.40996941924095154, + "learning_rate": 1.7780015156128305e-05, + "loss": 0.9633, + "step": 8183 + }, + { + "epoch": 0.7312529318471195, + "grad_norm": 0.577971875667572, + "learning_rate": 1.776895090981294e-05, + "loss": 1.0122, + "step": 8184 + }, + { + "epoch": 0.7313422833783814, + "grad_norm": 0.4948059916496277, + "learning_rate": 1.7757889363191483e-05, + "loss": 0.8988, + "step": 8185 + }, + { + "epoch": 0.7314316349096432, + "grad_norm": 0.4757169187068939, + "learning_rate": 1.7746830517190467e-05, + "loss": 0.9198, + "step": 8186 + }, + { + "epoch": 0.7315209864409051, + "grad_norm": 0.5007458925247192, + "learning_rate": 1.7735774372736187e-05, + "loss": 0.9536, + "step": 8187 + }, + { + "epoch": 0.731610337972167, + "grad_norm": 0.5878704190254211, + "learning_rate": 1.772472093075471e-05, + "loss": 0.9794, + "step": 8188 + }, + { + "epoch": 0.7316996895034289, + "grad_norm": 0.4866276979446411, + "learning_rate": 1.7713670192171895e-05, + "loss": 0.9257, + "step": 8189 + }, + { + "epoch": 0.7317890410346908, + "grad_norm": 0.4084816575050354, + "learning_rate": 1.7702622157913344e-05, + "loss": 0.9616, + "step": 8190 + }, + { + "epoch": 0.7318783925659526, + "grad_norm": 0.5077846050262451, + "learning_rate": 1.7691576828904456e-05, + "loss": 0.9011, + "step": 8191 + }, + { + "epoch": 0.7319677440972144, + "grad_norm": 0.46192851662635803, + "learning_rate": 1.7680534206070405e-05, + "loss": 0.9729, + "step": 8192 + }, + { + "epoch": 0.7320570956284763, + "grad_norm": 0.42796364426612854, + "learning_rate": 1.76694942903361e-05, + "loss": 0.9862, + "step": 8193 + }, + { + "epoch": 0.7321464471597382, + "grad_norm": 0.555330753326416, + "learning_rate": 1.765845708262626e-05, + "loss": 0.9163, + "step": 8194 + }, + { + "epoch": 0.7322357986910001, + "grad_norm": 0.456575870513916, + "learning_rate": 1.764742258386538e-05, + "loss": 0.9215, + "step": 8195 + }, + { + "epoch": 0.732325150222262, + "grad_norm": 0.4735051989555359, + "learning_rate": 1.7636390794977713e-05, + "loss": 0.9277, + "step": 8196 + }, + { + "epoch": 0.7324145017535238, + "grad_norm": 0.4335395097732544, + "learning_rate": 1.762536171688729e-05, + "loss": 0.9389, + "step": 8197 + }, + { + "epoch": 0.7325038532847856, + "grad_norm": 0.5177332758903503, + "learning_rate": 1.7614335350517915e-05, + "loss": 0.9101, + "step": 8198 + }, + { + "epoch": 0.7325932048160475, + "grad_norm": 0.5252842903137207, + "learning_rate": 1.760331169679315e-05, + "loss": 0.885, + "step": 8199 + }, + { + "epoch": 0.7326825563473094, + "grad_norm": 0.48446813225746155, + "learning_rate": 1.7592290756636365e-05, + "loss": 0.9211, + "step": 8200 + }, + { + "epoch": 0.7327719078785713, + "grad_norm": 0.5630969405174255, + "learning_rate": 1.7581272530970667e-05, + "loss": 0.8899, + "step": 8201 + }, + { + "epoch": 0.7328612594098332, + "grad_norm": 0.5070595741271973, + "learning_rate": 1.7570257020718967e-05, + "loss": 0.9564, + "step": 8202 + }, + { + "epoch": 0.732950610941095, + "grad_norm": 0.4257377088069916, + "learning_rate": 1.75592442268039e-05, + "loss": 1.001, + "step": 8203 + }, + { + "epoch": 0.7330399624723569, + "grad_norm": 0.40094104409217834, + "learning_rate": 1.7548234150147925e-05, + "loss": 0.9957, + "step": 8204 + }, + { + "epoch": 0.7331293140036187, + "grad_norm": 0.44128090143203735, + "learning_rate": 1.7537226791673256e-05, + "loss": 0.9949, + "step": 8205 + }, + { + "epoch": 0.7332186655348806, + "grad_norm": 0.46732866764068604, + "learning_rate": 1.7526222152301862e-05, + "loss": 0.9562, + "step": 8206 + }, + { + "epoch": 0.7333080170661425, + "grad_norm": 0.41671401262283325, + "learning_rate": 1.7515220232955513e-05, + "loss": 0.9315, + "step": 8207 + }, + { + "epoch": 0.7333973685974043, + "grad_norm": 0.5439326167106628, + "learning_rate": 1.7504221034555734e-05, + "loss": 0.9628, + "step": 8208 + }, + { + "epoch": 0.7334867201286662, + "grad_norm": 0.4819839894771576, + "learning_rate": 1.7493224558023825e-05, + "loss": 0.9094, + "step": 8209 + }, + { + "epoch": 0.7335760716599281, + "grad_norm": 0.4984486699104309, + "learning_rate": 1.7482230804280852e-05, + "loss": 0.9014, + "step": 8210 + }, + { + "epoch": 0.73366542319119, + "grad_norm": 0.5585265159606934, + "learning_rate": 1.7471239774247667e-05, + "loss": 0.9843, + "step": 8211 + }, + { + "epoch": 0.7337547747224518, + "grad_norm": 0.4539826214313507, + "learning_rate": 1.7460251468844895e-05, + "loss": 0.9753, + "step": 8212 + }, + { + "epoch": 0.7338441262537136, + "grad_norm": 0.44110792875289917, + "learning_rate": 1.74492658889929e-05, + "loss": 0.9038, + "step": 8213 + }, + { + "epoch": 0.7339334777849755, + "grad_norm": 0.44374075531959534, + "learning_rate": 1.7438283035611846e-05, + "loss": 0.9506, + "step": 8214 + }, + { + "epoch": 0.7340228293162374, + "grad_norm": 0.43943607807159424, + "learning_rate": 1.7427302909621672e-05, + "loss": 0.9645, + "step": 8215 + }, + { + "epoch": 0.7341121808474993, + "grad_norm": 0.5023413300514221, + "learning_rate": 1.7416325511942085e-05, + "loss": 0.8913, + "step": 8216 + }, + { + "epoch": 0.7342015323787612, + "grad_norm": 0.43534350395202637, + "learning_rate": 1.7405350843492567e-05, + "loss": 0.907, + "step": 8217 + }, + { + "epoch": 0.7342908839100231, + "grad_norm": 0.46004897356033325, + "learning_rate": 1.7394378905192334e-05, + "loss": 0.9859, + "step": 8218 + }, + { + "epoch": 0.7343802354412848, + "grad_norm": 0.49676865339279175, + "learning_rate": 1.738340969796042e-05, + "loss": 0.9193, + "step": 8219 + }, + { + "epoch": 0.7344695869725467, + "grad_norm": 0.47538769245147705, + "learning_rate": 1.7372443222715605e-05, + "loss": 0.9194, + "step": 8220 + }, + { + "epoch": 0.7345589385038086, + "grad_norm": 0.477346271276474, + "learning_rate": 1.736147948037646e-05, + "loss": 0.9213, + "step": 8221 + }, + { + "epoch": 0.7346482900350705, + "grad_norm": 0.3966813087463379, + "learning_rate": 1.7350518471861328e-05, + "loss": 0.9935, + "step": 8222 + }, + { + "epoch": 0.7347376415663324, + "grad_norm": 0.46678826212882996, + "learning_rate": 1.7339560198088273e-05, + "loss": 0.923, + "step": 8223 + }, + { + "epoch": 0.7348269930975942, + "grad_norm": 0.42243048548698425, + "learning_rate": 1.7328604659975184e-05, + "loss": 1.0018, + "step": 8224 + }, + { + "epoch": 0.734916344628856, + "grad_norm": 0.4616301655769348, + "learning_rate": 1.7317651858439714e-05, + "loss": 0.9319, + "step": 8225 + }, + { + "epoch": 0.7350056961601179, + "grad_norm": 0.644029438495636, + "learning_rate": 1.7306701794399266e-05, + "loss": 0.9191, + "step": 8226 + }, + { + "epoch": 0.7350950476913798, + "grad_norm": 0.4394361078739166, + "learning_rate": 1.7295754468771024e-05, + "loss": 0.9644, + "step": 8227 + }, + { + "epoch": 0.7351843992226417, + "grad_norm": 0.4633764922618866, + "learning_rate": 1.7284809882471954e-05, + "loss": 0.9198, + "step": 8228 + }, + { + "epoch": 0.7352737507539036, + "grad_norm": 0.49035075306892395, + "learning_rate": 1.727386803641877e-05, + "loss": 0.9324, + "step": 8229 + }, + { + "epoch": 0.7353631022851654, + "grad_norm": 0.45196330547332764, + "learning_rate": 1.7262928931527977e-05, + "loss": 0.9823, + "step": 8230 + }, + { + "epoch": 0.7354524538164273, + "grad_norm": 0.459506630897522, + "learning_rate": 1.7251992568715842e-05, + "loss": 0.9723, + "step": 8231 + }, + { + "epoch": 0.7355418053476891, + "grad_norm": 0.5117012858390808, + "learning_rate": 1.724105894889841e-05, + "loss": 0.9468, + "step": 8232 + }, + { + "epoch": 0.735631156878951, + "grad_norm": 0.43561848998069763, + "learning_rate": 1.7230128072991458e-05, + "loss": 0.933, + "step": 8233 + }, + { + "epoch": 0.7357205084102129, + "grad_norm": 0.5399268865585327, + "learning_rate": 1.721919994191058e-05, + "loss": 0.9901, + "step": 8234 + }, + { + "epoch": 0.7358098599414747, + "grad_norm": 0.4303232729434967, + "learning_rate": 1.720827455657113e-05, + "loss": 1.0334, + "step": 8235 + }, + { + "epoch": 0.7358992114727366, + "grad_norm": 0.5467885732650757, + "learning_rate": 1.719735191788822e-05, + "loss": 0.9286, + "step": 8236 + }, + { + "epoch": 0.7359885630039985, + "grad_norm": 0.43079063296318054, + "learning_rate": 1.7186432026776734e-05, + "loss": 0.8944, + "step": 8237 + }, + { + "epoch": 0.7360779145352604, + "grad_norm": 0.5494388341903687, + "learning_rate": 1.717551488415134e-05, + "loss": 0.8537, + "step": 8238 + }, + { + "epoch": 0.7361672660665222, + "grad_norm": 0.5178488492965698, + "learning_rate": 1.7164600490926454e-05, + "loss": 0.8705, + "step": 8239 + }, + { + "epoch": 0.736256617597784, + "grad_norm": 0.4486089050769806, + "learning_rate": 1.7153688848016277e-05, + "loss": 1.0209, + "step": 8240 + }, + { + "epoch": 0.7363459691290459, + "grad_norm": 0.4773981273174286, + "learning_rate": 1.7142779956334777e-05, + "loss": 0.9607, + "step": 8241 + }, + { + "epoch": 0.7364353206603078, + "grad_norm": 0.5104852318763733, + "learning_rate": 1.7131873816795684e-05, + "loss": 0.958, + "step": 8242 + }, + { + "epoch": 0.7365246721915697, + "grad_norm": 0.461008220911026, + "learning_rate": 1.7120970430312526e-05, + "loss": 0.9717, + "step": 8243 + }, + { + "epoch": 0.7366140237228316, + "grad_norm": 0.44321146607398987, + "learning_rate": 1.711006979779855e-05, + "loss": 0.9139, + "step": 8244 + }, + { + "epoch": 0.7367033752540935, + "grad_norm": 0.49801915884017944, + "learning_rate": 1.70991719201668e-05, + "loss": 0.8756, + "step": 8245 + }, + { + "epoch": 0.7367927267853552, + "grad_norm": 0.5032793283462524, + "learning_rate": 1.7088276798330106e-05, + "loss": 1.0369, + "step": 8246 + }, + { + "epoch": 0.7368820783166171, + "grad_norm": 0.5026294589042664, + "learning_rate": 1.7077384433201045e-05, + "loss": 0.9135, + "step": 8247 + }, + { + "epoch": 0.736971429847879, + "grad_norm": 0.4539667069911957, + "learning_rate": 1.7066494825691966e-05, + "loss": 0.9861, + "step": 8248 + }, + { + "epoch": 0.7370607813791409, + "grad_norm": 0.4462776184082031, + "learning_rate": 1.7055607976714988e-05, + "loss": 0.9653, + "step": 8249 + }, + { + "epoch": 0.7371501329104028, + "grad_norm": 0.4230305552482605, + "learning_rate": 1.7044723887182017e-05, + "loss": 0.9694, + "step": 8250 + }, + { + "epoch": 0.7372394844416646, + "grad_norm": 0.4417788088321686, + "learning_rate": 1.7033842558004693e-05, + "loss": 0.9562, + "step": 8251 + }, + { + "epoch": 0.7373288359729265, + "grad_norm": 0.4916762113571167, + "learning_rate": 1.7022963990094442e-05, + "loss": 0.8921, + "step": 8252 + }, + { + "epoch": 0.7374181875041883, + "grad_norm": 0.44411367177963257, + "learning_rate": 1.7012088184362467e-05, + "loss": 0.961, + "step": 8253 + }, + { + "epoch": 0.7375075390354502, + "grad_norm": 0.4935406446456909, + "learning_rate": 1.700121514171975e-05, + "loss": 0.9968, + "step": 8254 + }, + { + "epoch": 0.7375968905667121, + "grad_norm": 0.5243939757347107, + "learning_rate": 1.699034486307699e-05, + "loss": 0.9191, + "step": 8255 + }, + { + "epoch": 0.737686242097974, + "grad_norm": 0.5002910494804382, + "learning_rate": 1.69794773493447e-05, + "loss": 0.9762, + "step": 8256 + }, + { + "epoch": 0.7377755936292358, + "grad_norm": 0.42303359508514404, + "learning_rate": 1.6968612601433164e-05, + "loss": 0.9327, + "step": 8257 + }, + { + "epoch": 0.7378649451604977, + "grad_norm": 0.47965705394744873, + "learning_rate": 1.695775062025241e-05, + "loss": 0.906, + "step": 8258 + }, + { + "epoch": 0.7379542966917596, + "grad_norm": 0.44880691170692444, + "learning_rate": 1.6946891406712245e-05, + "loss": 0.9013, + "step": 8259 + }, + { + "epoch": 0.7380436482230214, + "grad_norm": 0.48894819617271423, + "learning_rate": 1.6936034961722247e-05, + "loss": 0.9125, + "step": 8260 + }, + { + "epoch": 0.7381329997542833, + "grad_norm": 0.4461333751678467, + "learning_rate": 1.6925181286191755e-05, + "loss": 0.9559, + "step": 8261 + }, + { + "epoch": 0.7382223512855451, + "grad_norm": 0.4503779411315918, + "learning_rate": 1.6914330381029888e-05, + "loss": 0.9575, + "step": 8262 + }, + { + "epoch": 0.738311702816807, + "grad_norm": 0.4670964181423187, + "learning_rate": 1.690348224714553e-05, + "loss": 0.9779, + "step": 8263 + }, + { + "epoch": 0.7384010543480689, + "grad_norm": 0.6177948713302612, + "learning_rate": 1.68926368854473e-05, + "loss": 0.994, + "step": 8264 + }, + { + "epoch": 0.7384904058793308, + "grad_norm": 0.4183749854564667, + "learning_rate": 1.6881794296843633e-05, + "loss": 0.9373, + "step": 8265 + }, + { + "epoch": 0.7385797574105927, + "grad_norm": 0.42233920097351074, + "learning_rate": 1.6870954482242707e-05, + "loss": 0.979, + "step": 8266 + }, + { + "epoch": 0.7386691089418544, + "grad_norm": 0.45563188195228577, + "learning_rate": 1.6860117442552477e-05, + "loss": 0.9276, + "step": 8267 + }, + { + "epoch": 0.7387584604731163, + "grad_norm": 0.43523088097572327, + "learning_rate": 1.6849283178680653e-05, + "loss": 1.0035, + "step": 8268 + }, + { + "epoch": 0.7388478120043782, + "grad_norm": 0.6393901109695435, + "learning_rate": 1.6838451691534724e-05, + "loss": 0.9238, + "step": 8269 + }, + { + "epoch": 0.7389371635356401, + "grad_norm": 0.5803185105323792, + "learning_rate": 1.6827622982021947e-05, + "loss": 0.8818, + "step": 8270 + }, + { + "epoch": 0.739026515066902, + "grad_norm": 0.3935569226741791, + "learning_rate": 1.6816797051049334e-05, + "loss": 0.914, + "step": 8271 + }, + { + "epoch": 0.7391158665981639, + "grad_norm": 0.43459293246269226, + "learning_rate": 1.6805973899523675e-05, + "loss": 0.9829, + "step": 8272 + }, + { + "epoch": 0.7392052181294257, + "grad_norm": 0.46992024779319763, + "learning_rate": 1.679515352835154e-05, + "loss": 0.9613, + "step": 8273 + }, + { + "epoch": 0.7392945696606875, + "grad_norm": 0.4815920889377594, + "learning_rate": 1.678433593843922e-05, + "loss": 0.9408, + "step": 8274 + }, + { + "epoch": 0.7393839211919494, + "grad_norm": 0.5394062399864197, + "learning_rate": 1.6773521130692822e-05, + "loss": 0.8696, + "step": 8275 + }, + { + "epoch": 0.7394732727232113, + "grad_norm": 0.4539199769496918, + "learning_rate": 1.6762709106018194e-05, + "loss": 0.9607, + "step": 8276 + }, + { + "epoch": 0.7395626242544732, + "grad_norm": 0.3883989751338959, + "learning_rate": 1.6751899865320963e-05, + "loss": 0.957, + "step": 8277 + }, + { + "epoch": 0.739651975785735, + "grad_norm": 0.4848140776157379, + "learning_rate": 1.674109340950652e-05, + "loss": 0.9505, + "step": 8278 + }, + { + "epoch": 0.7397413273169969, + "grad_norm": 0.42425286769866943, + "learning_rate": 1.6730289739480015e-05, + "loss": 0.9207, + "step": 8279 + }, + { + "epoch": 0.7398306788482588, + "grad_norm": 0.5808948278427124, + "learning_rate": 1.6719488856146377e-05, + "loss": 0.9163, + "step": 8280 + }, + { + "epoch": 0.7399200303795206, + "grad_norm": 0.481012761592865, + "learning_rate": 1.670869076041029e-05, + "loss": 0.9699, + "step": 8281 + }, + { + "epoch": 0.7400093819107825, + "grad_norm": 0.5220696926116943, + "learning_rate": 1.669789545317621e-05, + "loss": 0.912, + "step": 8282 + }, + { + "epoch": 0.7400987334420444, + "grad_norm": 0.46939072012901306, + "learning_rate": 1.668710293534838e-05, + "loss": 1.0163, + "step": 8283 + }, + { + "epoch": 0.7401880849733062, + "grad_norm": 0.4594574570655823, + "learning_rate": 1.6676313207830752e-05, + "loss": 0.9666, + "step": 8284 + }, + { + "epoch": 0.7402774365045681, + "grad_norm": 0.36939460039138794, + "learning_rate": 1.66655262715271e-05, + "loss": 0.9669, + "step": 8285 + }, + { + "epoch": 0.74036678803583, + "grad_norm": 0.45765185356140137, + "learning_rate": 1.6654742127340938e-05, + "loss": 0.8751, + "step": 8286 + }, + { + "epoch": 0.7404561395670918, + "grad_norm": 0.5784175992012024, + "learning_rate": 1.6643960776175577e-05, + "loss": 0.8804, + "step": 8287 + }, + { + "epoch": 0.7405454910983537, + "grad_norm": 0.4556187689304352, + "learning_rate": 1.6633182218934035e-05, + "loss": 0.9594, + "step": 8288 + }, + { + "epoch": 0.7406348426296155, + "grad_norm": 0.6318187117576599, + "learning_rate": 1.6622406456519146e-05, + "loss": 0.9063, + "step": 8289 + }, + { + "epoch": 0.7407241941608774, + "grad_norm": 0.5511077046394348, + "learning_rate": 1.66116334898335e-05, + "loss": 0.8661, + "step": 8290 + }, + { + "epoch": 0.7408135456921393, + "grad_norm": 0.5508573651313782, + "learning_rate": 1.6600863319779435e-05, + "loss": 0.8953, + "step": 8291 + }, + { + "epoch": 0.7409028972234012, + "grad_norm": 0.5418042540550232, + "learning_rate": 1.659009594725908e-05, + "loss": 0.8662, + "step": 8292 + }, + { + "epoch": 0.7409922487546631, + "grad_norm": 0.45847129821777344, + "learning_rate": 1.6579331373174335e-05, + "loss": 0.9173, + "step": 8293 + }, + { + "epoch": 0.7410816002859248, + "grad_norm": 0.45850539207458496, + "learning_rate": 1.6568569598426803e-05, + "loss": 0.8913, + "step": 8294 + }, + { + "epoch": 0.7411709518171867, + "grad_norm": 0.46910685300827026, + "learning_rate": 1.6557810623917923e-05, + "loss": 0.9585, + "step": 8295 + }, + { + "epoch": 0.7412603033484486, + "grad_norm": 0.5492746829986572, + "learning_rate": 1.654705445054887e-05, + "loss": 0.9454, + "step": 8296 + }, + { + "epoch": 0.7413496548797105, + "grad_norm": 0.42353761196136475, + "learning_rate": 1.6536301079220595e-05, + "loss": 0.9887, + "step": 8297 + }, + { + "epoch": 0.7414390064109724, + "grad_norm": 0.38787001371383667, + "learning_rate": 1.65255505108338e-05, + "loss": 0.9821, + "step": 8298 + }, + { + "epoch": 0.7415283579422343, + "grad_norm": 0.45528730750083923, + "learning_rate": 1.6514802746288955e-05, + "loss": 0.9411, + "step": 8299 + }, + { + "epoch": 0.7416177094734961, + "grad_norm": 0.4377180337905884, + "learning_rate": 1.6504057786486316e-05, + "loss": 0.9134, + "step": 8300 + }, + { + "epoch": 0.7417070610047579, + "grad_norm": 0.4238099753856659, + "learning_rate": 1.649331563232587e-05, + "loss": 0.9492, + "step": 8301 + }, + { + "epoch": 0.7417964125360198, + "grad_norm": 0.6428069472312927, + "learning_rate": 1.6482576284707402e-05, + "loss": 0.8924, + "step": 8302 + }, + { + "epoch": 0.7418857640672817, + "grad_norm": 0.5138502717018127, + "learning_rate": 1.6471839744530455e-05, + "loss": 1.0355, + "step": 8303 + }, + { + "epoch": 0.7419751155985436, + "grad_norm": 0.4536551833152771, + "learning_rate": 1.6461106012694293e-05, + "loss": 0.9953, + "step": 8304 + }, + { + "epoch": 0.7420644671298054, + "grad_norm": 0.45389628410339355, + "learning_rate": 1.6450375090098003e-05, + "loss": 0.9809, + "step": 8305 + }, + { + "epoch": 0.7421538186610673, + "grad_norm": 0.42163825035095215, + "learning_rate": 1.6439646977640417e-05, + "loss": 0.9368, + "step": 8306 + }, + { + "epoch": 0.7422431701923292, + "grad_norm": 0.48852425813674927, + "learning_rate": 1.6428921676220122e-05, + "loss": 0.9138, + "step": 8307 + }, + { + "epoch": 0.742332521723591, + "grad_norm": 0.4827854633331299, + "learning_rate": 1.6418199186735476e-05, + "loss": 0.9123, + "step": 8308 + }, + { + "epoch": 0.7424218732548529, + "grad_norm": 0.4420393705368042, + "learning_rate": 1.6407479510084607e-05, + "loss": 0.9481, + "step": 8309 + }, + { + "epoch": 0.7425112247861148, + "grad_norm": 0.47318053245544434, + "learning_rate": 1.6396762647165398e-05, + "loss": 0.9103, + "step": 8310 + }, + { + "epoch": 0.7426005763173766, + "grad_norm": 0.5290870070457458, + "learning_rate": 1.6386048598875502e-05, + "loss": 0.9708, + "step": 8311 + }, + { + "epoch": 0.7426899278486385, + "grad_norm": 0.52885901927948, + "learning_rate": 1.6375337366112336e-05, + "loss": 0.9811, + "step": 8312 + }, + { + "epoch": 0.7427792793799004, + "grad_norm": 0.49882233142852783, + "learning_rate": 1.6364628949773096e-05, + "loss": 0.9632, + "step": 8313 + }, + { + "epoch": 0.7428686309111623, + "grad_norm": 0.46031755208969116, + "learning_rate": 1.6353923350754692e-05, + "loss": 1.0104, + "step": 8314 + }, + { + "epoch": 0.7429579824424241, + "grad_norm": 0.49714195728302, + "learning_rate": 1.6343220569953848e-05, + "loss": 0.9287, + "step": 8315 + }, + { + "epoch": 0.7430473339736859, + "grad_norm": 0.5016975998878479, + "learning_rate": 1.6332520608267032e-05, + "loss": 0.9117, + "step": 8316 + }, + { + "epoch": 0.7431366855049478, + "grad_norm": 0.5067397952079773, + "learning_rate": 1.632182346659049e-05, + "loss": 0.9105, + "step": 8317 + }, + { + "epoch": 0.7432260370362097, + "grad_norm": 0.4985998272895813, + "learning_rate": 1.6311129145820218e-05, + "loss": 0.8446, + "step": 8318 + }, + { + "epoch": 0.7433153885674716, + "grad_norm": 0.4276972711086273, + "learning_rate": 1.6300437646851972e-05, + "loss": 0.9542, + "step": 8319 + }, + { + "epoch": 0.7434047400987335, + "grad_norm": 0.4463093876838684, + "learning_rate": 1.6289748970581307e-05, + "loss": 0.9895, + "step": 8320 + }, + { + "epoch": 0.7434940916299954, + "grad_norm": 0.4791722297668457, + "learning_rate": 1.627906311790347e-05, + "loss": 0.9321, + "step": 8321 + }, + { + "epoch": 0.7435834431612571, + "grad_norm": 0.4845844805240631, + "learning_rate": 1.6268380089713542e-05, + "loss": 0.9468, + "step": 8322 + }, + { + "epoch": 0.743672794692519, + "grad_norm": 0.49456390738487244, + "learning_rate": 1.625769988690633e-05, + "loss": 0.9305, + "step": 8323 + }, + { + "epoch": 0.7437621462237809, + "grad_norm": 0.5137072205543518, + "learning_rate": 1.6247022510376435e-05, + "loss": 0.9164, + "step": 8324 + }, + { + "epoch": 0.7438514977550428, + "grad_norm": 0.5762333869934082, + "learning_rate": 1.6236347961018172e-05, + "loss": 0.9437, + "step": 8325 + }, + { + "epoch": 0.7439408492863047, + "grad_norm": 0.43072283267974854, + "learning_rate": 1.6225676239725663e-05, + "loss": 0.927, + "step": 8326 + }, + { + "epoch": 0.7440302008175665, + "grad_norm": 0.42828169465065, + "learning_rate": 1.6215007347392775e-05, + "loss": 0.9505, + "step": 8327 + }, + { + "epoch": 0.7441195523488284, + "grad_norm": 0.44435444474220276, + "learning_rate": 1.6204341284913144e-05, + "loss": 0.9176, + "step": 8328 + }, + { + "epoch": 0.7442089038800902, + "grad_norm": 0.38234448432922363, + "learning_rate": 1.6193678053180168e-05, + "loss": 1.0191, + "step": 8329 + }, + { + "epoch": 0.7442982554113521, + "grad_norm": 0.5488905310630798, + "learning_rate": 1.6183017653087e-05, + "loss": 0.9291, + "step": 8330 + }, + { + "epoch": 0.744387606942614, + "grad_norm": 0.4858629107475281, + "learning_rate": 1.6172360085526565e-05, + "loss": 0.9887, + "step": 8331 + }, + { + "epoch": 0.7444769584738758, + "grad_norm": 0.517011821269989, + "learning_rate": 1.616170535139156e-05, + "loss": 0.9611, + "step": 8332 + }, + { + "epoch": 0.7445663100051377, + "grad_norm": 0.5602929592132568, + "learning_rate": 1.6151053451574416e-05, + "loss": 0.9426, + "step": 8333 + }, + { + "epoch": 0.7446556615363996, + "grad_norm": 0.4301270842552185, + "learning_rate": 1.614040438696736e-05, + "loss": 1.0005, + "step": 8334 + }, + { + "epoch": 0.7447450130676615, + "grad_norm": 0.42185020446777344, + "learning_rate": 1.612975815846235e-05, + "loss": 0.9301, + "step": 8335 + }, + { + "epoch": 0.7448343645989233, + "grad_norm": 0.5149445533752441, + "learning_rate": 1.6119114766951116e-05, + "loss": 0.9543, + "step": 8336 + }, + { + "epoch": 0.7449237161301852, + "grad_norm": 0.48916956782341003, + "learning_rate": 1.6108474213325165e-05, + "loss": 0.8676, + "step": 8337 + }, + { + "epoch": 0.745013067661447, + "grad_norm": 0.4371699392795563, + "learning_rate": 1.609783649847576e-05, + "loss": 0.9421, + "step": 8338 + }, + { + "epoch": 0.7451024191927089, + "grad_norm": 0.6675881147384644, + "learning_rate": 1.6087201623293917e-05, + "loss": 0.8675, + "step": 8339 + }, + { + "epoch": 0.7451917707239708, + "grad_norm": 0.5161626935005188, + "learning_rate": 1.6076569588670425e-05, + "loss": 0.9024, + "step": 8340 + }, + { + "epoch": 0.7452811222552327, + "grad_norm": 0.4626888334751129, + "learning_rate": 1.6065940395495825e-05, + "loss": 0.9111, + "step": 8341 + }, + { + "epoch": 0.7453704737864946, + "grad_norm": 0.45520034432411194, + "learning_rate": 1.605531404466043e-05, + "loss": 0.9532, + "step": 8342 + }, + { + "epoch": 0.7454598253177563, + "grad_norm": 0.5057812929153442, + "learning_rate": 1.6044690537054306e-05, + "loss": 0.8732, + "step": 8343 + }, + { + "epoch": 0.7455491768490182, + "grad_norm": 0.48415088653564453, + "learning_rate": 1.6034069873567303e-05, + "loss": 0.8654, + "step": 8344 + }, + { + "epoch": 0.7456385283802801, + "grad_norm": 0.520176351070404, + "learning_rate": 1.6023452055088982e-05, + "loss": 0.9233, + "step": 8345 + }, + { + "epoch": 0.745727879911542, + "grad_norm": 0.4946928024291992, + "learning_rate": 1.601283708250872e-05, + "loss": 0.9015, + "step": 8346 + }, + { + "epoch": 0.7458172314428039, + "grad_norm": 0.4103570580482483, + "learning_rate": 1.600222495671563e-05, + "loss": 0.9471, + "step": 8347 + }, + { + "epoch": 0.7459065829740658, + "grad_norm": 0.41291913390159607, + "learning_rate": 1.599161567859858e-05, + "loss": 0.9129, + "step": 8348 + }, + { + "epoch": 0.7459959345053276, + "grad_norm": 0.4548855721950531, + "learning_rate": 1.598100924904623e-05, + "loss": 0.932, + "step": 8349 + }, + { + "epoch": 0.7460852860365894, + "grad_norm": 0.43719008564949036, + "learning_rate": 1.597040566894697e-05, + "loss": 0.9768, + "step": 8350 + }, + { + "epoch": 0.7461746375678513, + "grad_norm": 0.4432275593280792, + "learning_rate": 1.595980493918896e-05, + "loss": 0.9684, + "step": 8351 + }, + { + "epoch": 0.7462639890991132, + "grad_norm": 0.5939651727676392, + "learning_rate": 1.5949207060660138e-05, + "loss": 0.9425, + "step": 8352 + }, + { + "epoch": 0.7463533406303751, + "grad_norm": 0.45562806725502014, + "learning_rate": 1.5938612034248184e-05, + "loss": 0.9775, + "step": 8353 + }, + { + "epoch": 0.746442692161637, + "grad_norm": 0.4708113670349121, + "learning_rate": 1.5928019860840532e-05, + "loss": 0.975, + "step": 8354 + }, + { + "epoch": 0.7465320436928988, + "grad_norm": 0.5317729115486145, + "learning_rate": 1.5917430541324398e-05, + "loss": 0.9226, + "step": 8355 + }, + { + "epoch": 0.7466213952241606, + "grad_norm": 0.5121520757675171, + "learning_rate": 1.5906844076586746e-05, + "loss": 0.9363, + "step": 8356 + }, + { + "epoch": 0.7467107467554225, + "grad_norm": 0.629950225353241, + "learning_rate": 1.5896260467514336e-05, + "loss": 0.8352, + "step": 8357 + }, + { + "epoch": 0.7468000982866844, + "grad_norm": 0.3894347846508026, + "learning_rate": 1.58856797149936e-05, + "loss": 0.9527, + "step": 8358 + }, + { + "epoch": 0.7468894498179462, + "grad_norm": 0.5037431716918945, + "learning_rate": 1.5875101819910833e-05, + "loss": 0.9559, + "step": 8359 + }, + { + "epoch": 0.7469788013492081, + "grad_norm": 0.46914321184158325, + "learning_rate": 1.5864526783152028e-05, + "loss": 0.9593, + "step": 8360 + }, + { + "epoch": 0.74706815288047, + "grad_norm": 0.42743295431137085, + "learning_rate": 1.5853954605602965e-05, + "loss": 0.9583, + "step": 8361 + }, + { + "epoch": 0.7471575044117319, + "grad_norm": 0.5981160402297974, + "learning_rate": 1.584338528814917e-05, + "loss": 0.831, + "step": 8362 + }, + { + "epoch": 0.7472468559429937, + "grad_norm": 0.45480284094810486, + "learning_rate": 1.5832818831675943e-05, + "loss": 0.9134, + "step": 8363 + }, + { + "epoch": 0.7473362074742556, + "grad_norm": 0.4186408817768097, + "learning_rate": 1.5822255237068357e-05, + "loss": 0.9619, + "step": 8364 + }, + { + "epoch": 0.7474255590055174, + "grad_norm": 0.5457665920257568, + "learning_rate": 1.5811694505211182e-05, + "loss": 0.9438, + "step": 8365 + }, + { + "epoch": 0.7475149105367793, + "grad_norm": 0.485774427652359, + "learning_rate": 1.5801136636989012e-05, + "loss": 0.9763, + "step": 8366 + }, + { + "epoch": 0.7476042620680412, + "grad_norm": 0.42799893021583557, + "learning_rate": 1.5790581633286184e-05, + "loss": 0.9938, + "step": 8367 + }, + { + "epoch": 0.7476936135993031, + "grad_norm": 0.45776692032814026, + "learning_rate": 1.5780029494986794e-05, + "loss": 0.9387, + "step": 8368 + }, + { + "epoch": 0.747782965130565, + "grad_norm": 0.5079134702682495, + "learning_rate": 1.5769480222974685e-05, + "loss": 0.9021, + "step": 8369 + }, + { + "epoch": 0.7478723166618267, + "grad_norm": 0.4531920850276947, + "learning_rate": 1.575893381813348e-05, + "loss": 0.9242, + "step": 8370 + }, + { + "epoch": 0.7479616681930886, + "grad_norm": 0.5917310118675232, + "learning_rate": 1.5748390281346553e-05, + "loss": 0.8605, + "step": 8371 + }, + { + "epoch": 0.7480510197243505, + "grad_norm": 0.4818684756755829, + "learning_rate": 1.573784961349704e-05, + "loss": 0.9327, + "step": 8372 + }, + { + "epoch": 0.7481403712556124, + "grad_norm": 0.4459517300128937, + "learning_rate": 1.5727311815467825e-05, + "loss": 0.9717, + "step": 8373 + }, + { + "epoch": 0.7482297227868743, + "grad_norm": 0.4536815583705902, + "learning_rate": 1.5716776888141583e-05, + "loss": 0.9551, + "step": 8374 + }, + { + "epoch": 0.7483190743181362, + "grad_norm": 0.7309669256210327, + "learning_rate": 1.5706244832400696e-05, + "loss": 0.8589, + "step": 8375 + }, + { + "epoch": 0.748408425849398, + "grad_norm": 0.45956024527549744, + "learning_rate": 1.5695715649127345e-05, + "loss": 0.9848, + "step": 8376 + }, + { + "epoch": 0.7484977773806598, + "grad_norm": 0.450041264295578, + "learning_rate": 1.568518933920347e-05, + "loss": 0.938, + "step": 8377 + }, + { + "epoch": 0.7485871289119217, + "grad_norm": 0.45489928126335144, + "learning_rate": 1.5674665903510755e-05, + "loss": 1.0276, + "step": 8378 + }, + { + "epoch": 0.7486764804431836, + "grad_norm": 0.42867419123649597, + "learning_rate": 1.566414534293065e-05, + "loss": 0.9341, + "step": 8379 + }, + { + "epoch": 0.7487658319744455, + "grad_norm": 0.44365933537483215, + "learning_rate": 1.5653627658344374e-05, + "loss": 0.943, + "step": 8380 + }, + { + "epoch": 0.7488551835057073, + "grad_norm": 0.4565197229385376, + "learning_rate": 1.5643112850632884e-05, + "loss": 0.8627, + "step": 8381 + }, + { + "epoch": 0.7489445350369692, + "grad_norm": 0.4930249750614166, + "learning_rate": 1.563260092067691e-05, + "loss": 0.9596, + "step": 8382 + }, + { + "epoch": 0.7490338865682311, + "grad_norm": 0.5552300214767456, + "learning_rate": 1.5622091869356937e-05, + "loss": 0.9906, + "step": 8383 + }, + { + "epoch": 0.7491232380994929, + "grad_norm": 0.44053712487220764, + "learning_rate": 1.5611585697553232e-05, + "loss": 0.9295, + "step": 8384 + }, + { + "epoch": 0.7492125896307548, + "grad_norm": 0.4973897635936737, + "learning_rate": 1.5601082406145762e-05, + "loss": 0.9121, + "step": 8385 + }, + { + "epoch": 0.7493019411620166, + "grad_norm": 0.4836944043636322, + "learning_rate": 1.5590581996014304e-05, + "loss": 0.8984, + "step": 8386 + }, + { + "epoch": 0.7493912926932785, + "grad_norm": 0.4505099654197693, + "learning_rate": 1.5580084468038382e-05, + "loss": 0.9337, + "step": 8387 + }, + { + "epoch": 0.7494806442245404, + "grad_norm": 0.42860540747642517, + "learning_rate": 1.556958982309728e-05, + "loss": 0.9484, + "step": 8388 + }, + { + "epoch": 0.7495699957558023, + "grad_norm": 0.5451123714447021, + "learning_rate": 1.5559098062070028e-05, + "loss": 0.9509, + "step": 8389 + }, + { + "epoch": 0.7496593472870642, + "grad_norm": 0.5574720501899719, + "learning_rate": 1.5548609185835444e-05, + "loss": 0.8854, + "step": 8390 + }, + { + "epoch": 0.749748698818326, + "grad_norm": 0.42226511240005493, + "learning_rate": 1.5538123195272054e-05, + "loss": 0.9356, + "step": 8391 + }, + { + "epoch": 0.7498380503495878, + "grad_norm": 0.4949922561645508, + "learning_rate": 1.5527640091258177e-05, + "loss": 0.9313, + "step": 8392 + }, + { + "epoch": 0.7499274018808497, + "grad_norm": 0.49576741456985474, + "learning_rate": 1.5517159874671892e-05, + "loss": 0.8921, + "step": 8393 + }, + { + "epoch": 0.7500167534121116, + "grad_norm": 0.47223761677742004, + "learning_rate": 1.550668254639105e-05, + "loss": 0.9675, + "step": 8394 + }, + { + "epoch": 0.7501061049433735, + "grad_norm": 0.4839581847190857, + "learning_rate": 1.5496208107293197e-05, + "loss": 0.982, + "step": 8395 + }, + { + "epoch": 0.7501954564746354, + "grad_norm": 0.539761483669281, + "learning_rate": 1.5485736558255697e-05, + "loss": 0.9059, + "step": 8396 + }, + { + "epoch": 0.7502848080058973, + "grad_norm": 0.4613410532474518, + "learning_rate": 1.547526790015566e-05, + "loss": 0.9384, + "step": 8397 + }, + { + "epoch": 0.750374159537159, + "grad_norm": 0.47453439235687256, + "learning_rate": 1.5464802133869942e-05, + "loss": 0.9226, + "step": 8398 + }, + { + "epoch": 0.7504635110684209, + "grad_norm": 0.5073251128196716, + "learning_rate": 1.5454339260275165e-05, + "loss": 0.9383, + "step": 8399 + }, + { + "epoch": 0.7505528625996828, + "grad_norm": 0.6021450757980347, + "learning_rate": 1.5443879280247704e-05, + "loss": 0.9338, + "step": 8400 + }, + { + "epoch": 0.7506422141309447, + "grad_norm": 0.4408600330352783, + "learning_rate": 1.5433422194663693e-05, + "loss": 0.965, + "step": 8401 + }, + { + "epoch": 0.7507315656622066, + "grad_norm": 0.5570371747016907, + "learning_rate": 1.542296800439903e-05, + "loss": 0.9716, + "step": 8402 + }, + { + "epoch": 0.7508209171934684, + "grad_norm": 0.48043861985206604, + "learning_rate": 1.541251671032936e-05, + "loss": 0.9822, + "step": 8403 + }, + { + "epoch": 0.7509102687247303, + "grad_norm": 0.5600388646125793, + "learning_rate": 1.540206831333011e-05, + "loss": 0.9544, + "step": 8404 + }, + { + "epoch": 0.7509996202559921, + "grad_norm": 0.48118168115615845, + "learning_rate": 1.5391622814276408e-05, + "loss": 0.9213, + "step": 8405 + }, + { + "epoch": 0.751088971787254, + "grad_norm": 0.5037118196487427, + "learning_rate": 1.538118021404319e-05, + "loss": 0.8725, + "step": 8406 + }, + { + "epoch": 0.7511783233185159, + "grad_norm": 0.5096782445907593, + "learning_rate": 1.5370740513505143e-05, + "loss": 0.9062, + "step": 8407 + }, + { + "epoch": 0.7512676748497777, + "grad_norm": 0.450232595205307, + "learning_rate": 1.53603037135367e-05, + "loss": 0.9839, + "step": 8408 + }, + { + "epoch": 0.7513570263810396, + "grad_norm": 0.4577409625053406, + "learning_rate": 1.534986981501205e-05, + "loss": 0.946, + "step": 8409 + }, + { + "epoch": 0.7514463779123015, + "grad_norm": 0.4218578338623047, + "learning_rate": 1.5339438818805152e-05, + "loss": 0.9404, + "step": 8410 + }, + { + "epoch": 0.7515357294435634, + "grad_norm": 0.5897673964500427, + "learning_rate": 1.5329010725789704e-05, + "loss": 0.9004, + "step": 8411 + }, + { + "epoch": 0.7516250809748252, + "grad_norm": 0.4546933174133301, + "learning_rate": 1.531858553683918e-05, + "loss": 0.9354, + "step": 8412 + }, + { + "epoch": 0.751714432506087, + "grad_norm": 0.48495376110076904, + "learning_rate": 1.530816325282679e-05, + "loss": 1.0102, + "step": 8413 + }, + { + "epoch": 0.7518037840373489, + "grad_norm": 0.5461007952690125, + "learning_rate": 1.5297743874625515e-05, + "loss": 0.9229, + "step": 8414 + }, + { + "epoch": 0.7518931355686108, + "grad_norm": 0.42798981070518494, + "learning_rate": 1.5287327403108108e-05, + "loss": 0.9756, + "step": 8415 + }, + { + "epoch": 0.7519824870998727, + "grad_norm": 0.4769188463687897, + "learning_rate": 1.527691383914702e-05, + "loss": 0.9252, + "step": 8416 + }, + { + "epoch": 0.7520718386311346, + "grad_norm": 0.5491362810134888, + "learning_rate": 1.526650318361453e-05, + "loss": 0.912, + "step": 8417 + }, + { + "epoch": 0.7521611901623964, + "grad_norm": 0.4550434648990631, + "learning_rate": 1.5256095437382622e-05, + "loss": 0.9414, + "step": 8418 + }, + { + "epoch": 0.7522505416936582, + "grad_norm": 0.5997094511985779, + "learning_rate": 1.524569060132307e-05, + "loss": 0.9087, + "step": 8419 + }, + { + "epoch": 0.7523398932249201, + "grad_norm": 0.4672020971775055, + "learning_rate": 1.523528867630738e-05, + "loss": 0.8856, + "step": 8420 + }, + { + "epoch": 0.752429244756182, + "grad_norm": 0.5284818410873413, + "learning_rate": 1.5224889663206832e-05, + "loss": 0.7954, + "step": 8421 + }, + { + "epoch": 0.7525185962874439, + "grad_norm": 0.6233134269714355, + "learning_rate": 1.521449356289245e-05, + "loss": 0.9528, + "step": 8422 + }, + { + "epoch": 0.7526079478187058, + "grad_norm": 0.5407315492630005, + "learning_rate": 1.5204100376235036e-05, + "loss": 0.9614, + "step": 8423 + }, + { + "epoch": 0.7526972993499677, + "grad_norm": 0.5749227404594421, + "learning_rate": 1.5193710104105092e-05, + "loss": 0.8443, + "step": 8424 + }, + { + "epoch": 0.7527866508812294, + "grad_norm": 0.4009944796562195, + "learning_rate": 1.518332274737294e-05, + "loss": 0.9648, + "step": 8425 + }, + { + "epoch": 0.7528760024124913, + "grad_norm": 0.44395560026168823, + "learning_rate": 1.5172938306908623e-05, + "loss": 0.972, + "step": 8426 + }, + { + "epoch": 0.7529653539437532, + "grad_norm": 0.5940225720405579, + "learning_rate": 1.5162556783581971e-05, + "loss": 0.9004, + "step": 8427 + }, + { + "epoch": 0.7530547054750151, + "grad_norm": 0.4562346339225769, + "learning_rate": 1.5152178178262516e-05, + "loss": 0.9244, + "step": 8428 + }, + { + "epoch": 0.753144057006277, + "grad_norm": 0.483516663312912, + "learning_rate": 1.5141802491819584e-05, + "loss": 0.9087, + "step": 8429 + }, + { + "epoch": 0.7532334085375388, + "grad_norm": 0.668719470500946, + "learning_rate": 1.513142972512226e-05, + "loss": 0.7838, + "step": 8430 + }, + { + "epoch": 0.7533227600688007, + "grad_norm": 0.5353193879127502, + "learning_rate": 1.5121059879039367e-05, + "loss": 0.8938, + "step": 8431 + }, + { + "epoch": 0.7534121116000625, + "grad_norm": 0.5128493309020996, + "learning_rate": 1.5110692954439492e-05, + "loss": 0.9597, + "step": 8432 + }, + { + "epoch": 0.7535014631313244, + "grad_norm": 0.4552728831768036, + "learning_rate": 1.5100328952190973e-05, + "loss": 0.9204, + "step": 8433 + }, + { + "epoch": 0.7535908146625863, + "grad_norm": 0.5088723301887512, + "learning_rate": 1.5089967873161909e-05, + "loss": 0.9603, + "step": 8434 + }, + { + "epoch": 0.7536801661938481, + "grad_norm": 0.6546921133995056, + "learning_rate": 1.5079609718220166e-05, + "loss": 0.9088, + "step": 8435 + }, + { + "epoch": 0.75376951772511, + "grad_norm": 0.41693705320358276, + "learning_rate": 1.506925448823332e-05, + "loss": 0.9853, + "step": 8436 + }, + { + "epoch": 0.7538588692563719, + "grad_norm": 0.5113435387611389, + "learning_rate": 1.5058902184068741e-05, + "loss": 0.8789, + "step": 8437 + }, + { + "epoch": 0.7539482207876338, + "grad_norm": 0.4677006006240845, + "learning_rate": 1.5048552806593552e-05, + "loss": 0.9137, + "step": 8438 + }, + { + "epoch": 0.7540375723188956, + "grad_norm": 0.45884403586387634, + "learning_rate": 1.5038206356674623e-05, + "loss": 0.9903, + "step": 8439 + }, + { + "epoch": 0.7541269238501574, + "grad_norm": 0.49759143590927124, + "learning_rate": 1.5027862835178574e-05, + "loss": 0.905, + "step": 8440 + }, + { + "epoch": 0.7542162753814193, + "grad_norm": 0.4578782916069031, + "learning_rate": 1.5017522242971794e-05, + "loss": 0.9329, + "step": 8441 + }, + { + "epoch": 0.7543056269126812, + "grad_norm": 0.40164539217948914, + "learning_rate": 1.5007184580920408e-05, + "loss": 1.0047, + "step": 8442 + }, + { + "epoch": 0.7543949784439431, + "grad_norm": 0.4076503813266754, + "learning_rate": 1.499684984989031e-05, + "loss": 0.9041, + "step": 8443 + }, + { + "epoch": 0.754484329975205, + "grad_norm": 0.5915509462356567, + "learning_rate": 1.4986518050747145e-05, + "loss": 0.9639, + "step": 8444 + }, + { + "epoch": 0.7545736815064669, + "grad_norm": 0.3953361511230469, + "learning_rate": 1.4976189184356327e-05, + "loss": 0.9684, + "step": 8445 + }, + { + "epoch": 0.7546630330377286, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.4965863251582974e-05, + "loss": 0.9368, + "step": 8446 + }, + { + "epoch": 0.7547523845689905, + "grad_norm": 0.4337480962276459, + "learning_rate": 1.495554025329201e-05, + "loss": 0.907, + "step": 8447 + }, + { + "epoch": 0.7548417361002524, + "grad_norm": 0.5515368580818176, + "learning_rate": 1.4945220190348102e-05, + "loss": 0.9356, + "step": 8448 + }, + { + "epoch": 0.7549310876315143, + "grad_norm": 0.5478143692016602, + "learning_rate": 1.4934903063615657e-05, + "loss": 0.967, + "step": 8449 + }, + { + "epoch": 0.7550204391627762, + "grad_norm": 0.5199536085128784, + "learning_rate": 1.4924588873958844e-05, + "loss": 0.8784, + "step": 8450 + }, + { + "epoch": 0.755109790694038, + "grad_norm": 0.667113184928894, + "learning_rate": 1.4914277622241597e-05, + "loss": 0.9662, + "step": 8451 + }, + { + "epoch": 0.7551991422252999, + "grad_norm": 0.4980166256427765, + "learning_rate": 1.4903969309327581e-05, + "loss": 0.9127, + "step": 8452 + }, + { + "epoch": 0.7552884937565617, + "grad_norm": 0.48468539118766785, + "learning_rate": 1.4893663936080232e-05, + "loss": 0.9079, + "step": 8453 + }, + { + "epoch": 0.7553778452878236, + "grad_norm": 0.43085187673568726, + "learning_rate": 1.4883361503362736e-05, + "loss": 1.0368, + "step": 8454 + }, + { + "epoch": 0.7554671968190855, + "grad_norm": 0.5685251355171204, + "learning_rate": 1.4873062012038047e-05, + "loss": 0.8219, + "step": 8455 + }, + { + "epoch": 0.7555565483503474, + "grad_norm": 0.5110030174255371, + "learning_rate": 1.4862765462968826e-05, + "loss": 0.9895, + "step": 8456 + }, + { + "epoch": 0.7556458998816092, + "grad_norm": 0.5514484643936157, + "learning_rate": 1.485247185701753e-05, + "loss": 0.9602, + "step": 8457 + }, + { + "epoch": 0.7557352514128711, + "grad_norm": 0.47170817852020264, + "learning_rate": 1.4842181195046361e-05, + "loss": 0.9424, + "step": 8458 + }, + { + "epoch": 0.755824602944133, + "grad_norm": 0.42950955033302307, + "learning_rate": 1.483189347791728e-05, + "loss": 0.9495, + "step": 8459 + }, + { + "epoch": 0.7559139544753948, + "grad_norm": 0.4732508659362793, + "learning_rate": 1.4821608706491996e-05, + "loss": 0.9515, + "step": 8460 + }, + { + "epoch": 0.7560033060066567, + "grad_norm": 0.47231292724609375, + "learning_rate": 1.4811326881631937e-05, + "loss": 0.9472, + "step": 8461 + }, + { + "epoch": 0.7560926575379185, + "grad_norm": 0.4015584886074066, + "learning_rate": 1.4801048004198342e-05, + "loss": 0.947, + "step": 8462 + }, + { + "epoch": 0.7561820090691804, + "grad_norm": 0.530529797077179, + "learning_rate": 1.4790772075052173e-05, + "loss": 0.9262, + "step": 8463 + }, + { + "epoch": 0.7562713606004423, + "grad_norm": 0.5107260942459106, + "learning_rate": 1.478049909505414e-05, + "loss": 0.9101, + "step": 8464 + }, + { + "epoch": 0.7563607121317042, + "grad_norm": 0.4164203703403473, + "learning_rate": 1.4770229065064738e-05, + "loss": 1.0175, + "step": 8465 + }, + { + "epoch": 0.7564500636629661, + "grad_norm": 0.4908786714076996, + "learning_rate": 1.475996198594416e-05, + "loss": 0.9977, + "step": 8466 + }, + { + "epoch": 0.7565394151942278, + "grad_norm": 0.5216073393821716, + "learning_rate": 1.4749697858552398e-05, + "loss": 0.9263, + "step": 8467 + }, + { + "epoch": 0.7566287667254897, + "grad_norm": 0.48185113072395325, + "learning_rate": 1.4739436683749181e-05, + "loss": 0.8621, + "step": 8468 + }, + { + "epoch": 0.7567181182567516, + "grad_norm": 0.49778178334236145, + "learning_rate": 1.472917846239399e-05, + "loss": 0.8806, + "step": 8469 + }, + { + "epoch": 0.7568074697880135, + "grad_norm": 0.5073883533477783, + "learning_rate": 1.4718923195346062e-05, + "loss": 0.9601, + "step": 8470 + }, + { + "epoch": 0.7568968213192754, + "grad_norm": 0.4837455153465271, + "learning_rate": 1.4708670883464393e-05, + "loss": 0.9467, + "step": 8471 + }, + { + "epoch": 0.7569861728505373, + "grad_norm": 0.38693663477897644, + "learning_rate": 1.469842152760771e-05, + "loss": 0.9904, + "step": 8472 + }, + { + "epoch": 0.7570755243817991, + "grad_norm": 0.46666041016578674, + "learning_rate": 1.4688175128634512e-05, + "loss": 0.9895, + "step": 8473 + }, + { + "epoch": 0.7571648759130609, + "grad_norm": 0.42904338240623474, + "learning_rate": 1.4677931687403046e-05, + "loss": 0.999, + "step": 8474 + }, + { + "epoch": 0.7572542274443228, + "grad_norm": 0.4167872369289398, + "learning_rate": 1.4667691204771322e-05, + "loss": 1.0134, + "step": 8475 + }, + { + "epoch": 0.7573435789755847, + "grad_norm": 0.43804195523262024, + "learning_rate": 1.4657453681597055e-05, + "loss": 0.9359, + "step": 8476 + }, + { + "epoch": 0.7574329305068466, + "grad_norm": 0.5330228805541992, + "learning_rate": 1.4647219118737771e-05, + "loss": 0.842, + "step": 8477 + }, + { + "epoch": 0.7575222820381085, + "grad_norm": 0.4981614649295807, + "learning_rate": 1.463698751705072e-05, + "loss": 0.9647, + "step": 8478 + }, + { + "epoch": 0.7576116335693703, + "grad_norm": 0.4153805077075958, + "learning_rate": 1.4626758877392904e-05, + "loss": 0.9969, + "step": 8479 + }, + { + "epoch": 0.7577009851006321, + "grad_norm": 0.4333755671977997, + "learning_rate": 1.461653320062109e-05, + "loss": 0.9647, + "step": 8480 + }, + { + "epoch": 0.757790336631894, + "grad_norm": 0.5431153178215027, + "learning_rate": 1.4606310487591773e-05, + "loss": 0.9157, + "step": 8481 + }, + { + "epoch": 0.7578796881631559, + "grad_norm": 0.4706348180770874, + "learning_rate": 1.4596090739161228e-05, + "loss": 0.8865, + "step": 8482 + }, + { + "epoch": 0.7579690396944178, + "grad_norm": 0.5356218814849854, + "learning_rate": 1.4585873956185458e-05, + "loss": 0.9127, + "step": 8483 + }, + { + "epoch": 0.7580583912256796, + "grad_norm": 0.45213526487350464, + "learning_rate": 1.4575660139520237e-05, + "loss": 0.9477, + "step": 8484 + }, + { + "epoch": 0.7581477427569415, + "grad_norm": 0.5066707134246826, + "learning_rate": 1.4565449290021088e-05, + "loss": 0.9715, + "step": 8485 + }, + { + "epoch": 0.7582370942882034, + "grad_norm": 0.4998951554298401, + "learning_rate": 1.4555241408543252e-05, + "loss": 0.9402, + "step": 8486 + }, + { + "epoch": 0.7583264458194652, + "grad_norm": 0.46853405237197876, + "learning_rate": 1.454503649594176e-05, + "loss": 0.9371, + "step": 8487 + }, + { + "epoch": 0.7584157973507271, + "grad_norm": 0.5197616219520569, + "learning_rate": 1.4534834553071386e-05, + "loss": 0.9598, + "step": 8488 + }, + { + "epoch": 0.7585051488819889, + "grad_norm": 0.4408440589904785, + "learning_rate": 1.452463558078665e-05, + "loss": 0.9645, + "step": 8489 + }, + { + "epoch": 0.7585945004132508, + "grad_norm": 0.6004998087882996, + "learning_rate": 1.4514439579941818e-05, + "loss": 0.9181, + "step": 8490 + }, + { + "epoch": 0.7586838519445127, + "grad_norm": 0.4586280286312103, + "learning_rate": 1.4504246551390927e-05, + "loss": 0.9704, + "step": 8491 + }, + { + "epoch": 0.7587732034757746, + "grad_norm": 0.4923453629016876, + "learning_rate": 1.4494056495987746e-05, + "loss": 0.9155, + "step": 8492 + }, + { + "epoch": 0.7588625550070365, + "grad_norm": 0.5735955238342285, + "learning_rate": 1.448386941458581e-05, + "loss": 0.9033, + "step": 8493 + }, + { + "epoch": 0.7589519065382982, + "grad_norm": 0.40873488783836365, + "learning_rate": 1.447368530803837e-05, + "loss": 0.9818, + "step": 8494 + }, + { + "epoch": 0.7590412580695601, + "grad_norm": 0.5386642217636108, + "learning_rate": 1.4463504177198473e-05, + "loss": 0.8663, + "step": 8495 + }, + { + "epoch": 0.759130609600822, + "grad_norm": 0.43717023730278015, + "learning_rate": 1.445332602291889e-05, + "loss": 0.9881, + "step": 8496 + }, + { + "epoch": 0.7592199611320839, + "grad_norm": 0.4573476314544678, + "learning_rate": 1.4443150846052167e-05, + "loss": 0.9784, + "step": 8497 + }, + { + "epoch": 0.7593093126633458, + "grad_norm": 0.46079909801483154, + "learning_rate": 1.4432978647450557e-05, + "loss": 0.9627, + "step": 8498 + }, + { + "epoch": 0.7593986641946077, + "grad_norm": 0.4422478973865509, + "learning_rate": 1.44228094279661e-05, + "loss": 0.9779, + "step": 8499 + }, + { + "epoch": 0.7594880157258695, + "grad_norm": 0.42531049251556396, + "learning_rate": 1.4412643188450581e-05, + "loss": 0.9694, + "step": 8500 + }, + { + "epoch": 0.7595773672571313, + "grad_norm": 0.5261896848678589, + "learning_rate": 1.440247992975553e-05, + "loss": 0.8767, + "step": 8501 + }, + { + "epoch": 0.7596667187883932, + "grad_norm": 0.4981927275657654, + "learning_rate": 1.4392319652732222e-05, + "loss": 0.9391, + "step": 8502 + }, + { + "epoch": 0.7597560703196551, + "grad_norm": 0.42882490158081055, + "learning_rate": 1.43821623582317e-05, + "loss": 0.9519, + "step": 8503 + }, + { + "epoch": 0.759845421850917, + "grad_norm": 0.558502197265625, + "learning_rate": 1.4372008047104735e-05, + "loss": 0.9239, + "step": 8504 + }, + { + "epoch": 0.7599347733821789, + "grad_norm": 0.4893379509449005, + "learning_rate": 1.4361856720201866e-05, + "loss": 0.952, + "step": 8505 + }, + { + "epoch": 0.7600241249134407, + "grad_norm": 0.5038098096847534, + "learning_rate": 1.4351708378373386e-05, + "loss": 0.9585, + "step": 8506 + }, + { + "epoch": 0.7601134764447026, + "grad_norm": 0.46212783455848694, + "learning_rate": 1.4341563022469296e-05, + "loss": 0.9344, + "step": 8507 + }, + { + "epoch": 0.7602028279759644, + "grad_norm": 0.43376246094703674, + "learning_rate": 1.4331420653339395e-05, + "loss": 0.9815, + "step": 8508 + }, + { + "epoch": 0.7602921795072263, + "grad_norm": 0.5722975730895996, + "learning_rate": 1.4321281271833214e-05, + "loss": 0.9366, + "step": 8509 + }, + { + "epoch": 0.7603815310384882, + "grad_norm": 0.5890293717384338, + "learning_rate": 1.4311144878800037e-05, + "loss": 0.842, + "step": 8510 + }, + { + "epoch": 0.76047088256975, + "grad_norm": 0.5008702278137207, + "learning_rate": 1.4301011475088889e-05, + "loss": 0.9603, + "step": 8511 + }, + { + "epoch": 0.7605602341010119, + "grad_norm": 0.4864904284477234, + "learning_rate": 1.4290881061548555e-05, + "loss": 0.9232, + "step": 8512 + }, + { + "epoch": 0.7606495856322738, + "grad_norm": 0.4470934271812439, + "learning_rate": 1.4280753639027566e-05, + "loss": 0.949, + "step": 8513 + }, + { + "epoch": 0.7607389371635357, + "grad_norm": 0.5684083700180054, + "learning_rate": 1.4270629208374197e-05, + "loss": 0.8836, + "step": 8514 + }, + { + "epoch": 0.7608282886947975, + "grad_norm": 0.4778299033641815, + "learning_rate": 1.4260507770436482e-05, + "loss": 0.9575, + "step": 8515 + }, + { + "epoch": 0.7609176402260593, + "grad_norm": 0.4669052064418793, + "learning_rate": 1.4250389326062207e-05, + "loss": 0.9437, + "step": 8516 + }, + { + "epoch": 0.7610069917573212, + "grad_norm": 0.4639929533004761, + "learning_rate": 1.424027387609888e-05, + "loss": 0.9677, + "step": 8517 + }, + { + "epoch": 0.7610963432885831, + "grad_norm": 0.40526002645492554, + "learning_rate": 1.4230161421393783e-05, + "loss": 0.9132, + "step": 8518 + }, + { + "epoch": 0.761185694819845, + "grad_norm": 0.45086291432380676, + "learning_rate": 1.422005196279395e-05, + "loss": 0.9638, + "step": 8519 + }, + { + "epoch": 0.7612750463511069, + "grad_norm": 0.48102667927742004, + "learning_rate": 1.420994550114615e-05, + "loss": 0.94, + "step": 8520 + }, + { + "epoch": 0.7613643978823688, + "grad_norm": 0.48567044734954834, + "learning_rate": 1.4199842037296906e-05, + "loss": 0.958, + "step": 8521 + }, + { + "epoch": 0.7614537494136305, + "grad_norm": 0.398173063993454, + "learning_rate": 1.4189741572092496e-05, + "loss": 1.0515, + "step": 8522 + }, + { + "epoch": 0.7615431009448924, + "grad_norm": 0.4758659601211548, + "learning_rate": 1.4179644106378942e-05, + "loss": 0.9731, + "step": 8523 + }, + { + "epoch": 0.7616324524761543, + "grad_norm": 0.5288426280021667, + "learning_rate": 1.4169549641002006e-05, + "loss": 0.9146, + "step": 8524 + }, + { + "epoch": 0.7617218040074162, + "grad_norm": 0.46394291520118713, + "learning_rate": 1.4159458176807217e-05, + "loss": 0.9605, + "step": 8525 + }, + { + "epoch": 0.7618111555386781, + "grad_norm": 0.4080126881599426, + "learning_rate": 1.4149369714639853e-05, + "loss": 0.9506, + "step": 8526 + }, + { + "epoch": 0.76190050706994, + "grad_norm": 0.5326384902000427, + "learning_rate": 1.4139284255344897e-05, + "loss": 0.8812, + "step": 8527 + }, + { + "epoch": 0.7619898586012018, + "grad_norm": 0.3904532194137573, + "learning_rate": 1.412920179976714e-05, + "loss": 0.9381, + "step": 8528 + }, + { + "epoch": 0.7620792101324636, + "grad_norm": 0.4524264931678772, + "learning_rate": 1.411912234875108e-05, + "loss": 1.029, + "step": 8529 + }, + { + "epoch": 0.7621685616637255, + "grad_norm": 0.4898856580257416, + "learning_rate": 1.4109045903141006e-05, + "loss": 0.9016, + "step": 8530 + }, + { + "epoch": 0.7622579131949874, + "grad_norm": 0.5062960982322693, + "learning_rate": 1.4098972463780885e-05, + "loss": 0.9609, + "step": 8531 + }, + { + "epoch": 0.7623472647262493, + "grad_norm": 0.5530799627304077, + "learning_rate": 1.4088902031514507e-05, + "loss": 0.9659, + "step": 8532 + }, + { + "epoch": 0.7624366162575111, + "grad_norm": 0.4216810464859009, + "learning_rate": 1.4078834607185364e-05, + "loss": 0.9497, + "step": 8533 + }, + { + "epoch": 0.762525967788773, + "grad_norm": 0.6185227036476135, + "learning_rate": 1.4068770191636716e-05, + "loss": 0.8765, + "step": 8534 + }, + { + "epoch": 0.7626153193200349, + "grad_norm": 0.5121302008628845, + "learning_rate": 1.405870878571156e-05, + "loss": 0.914, + "step": 8535 + }, + { + "epoch": 0.7627046708512967, + "grad_norm": 0.42362403869628906, + "learning_rate": 1.4048650390252671e-05, + "loss": 0.9526, + "step": 8536 + }, + { + "epoch": 0.7627940223825586, + "grad_norm": 0.4262167513370514, + "learning_rate": 1.4038595006102506e-05, + "loss": 0.9338, + "step": 8537 + }, + { + "epoch": 0.7628833739138204, + "grad_norm": 0.655556857585907, + "learning_rate": 1.4028542634103331e-05, + "loss": 0.9118, + "step": 8538 + }, + { + "epoch": 0.7629727254450823, + "grad_norm": 0.5181341767311096, + "learning_rate": 1.4018493275097139e-05, + "loss": 0.8456, + "step": 8539 + }, + { + "epoch": 0.7630620769763442, + "grad_norm": 0.46238502860069275, + "learning_rate": 1.4008446929925672e-05, + "loss": 0.9437, + "step": 8540 + }, + { + "epoch": 0.7631514285076061, + "grad_norm": 0.5332851409912109, + "learning_rate": 1.3998403599430415e-05, + "loss": 0.9058, + "step": 8541 + }, + { + "epoch": 0.7632407800388679, + "grad_norm": 0.47178998589515686, + "learning_rate": 1.398836328445261e-05, + "loss": 0.9194, + "step": 8542 + }, + { + "epoch": 0.7633301315701297, + "grad_norm": 0.5274223685264587, + "learning_rate": 1.3978325985833229e-05, + "loss": 0.9471, + "step": 8543 + }, + { + "epoch": 0.7634194831013916, + "grad_norm": 0.5024911165237427, + "learning_rate": 1.3968291704413018e-05, + "loss": 1.0011, + "step": 8544 + }, + { + "epoch": 0.7635088346326535, + "grad_norm": 0.5870459675788879, + "learning_rate": 1.3958260441032445e-05, + "loss": 0.8727, + "step": 8545 + }, + { + "epoch": 0.7635981861639154, + "grad_norm": 0.592512309551239, + "learning_rate": 1.3948232196531746e-05, + "loss": 0.8981, + "step": 8546 + }, + { + "epoch": 0.7636875376951773, + "grad_norm": 0.511677086353302, + "learning_rate": 1.3938206971750878e-05, + "loss": 0.9192, + "step": 8547 + }, + { + "epoch": 0.7637768892264392, + "grad_norm": 0.4113241732120514, + "learning_rate": 1.3928184767529562e-05, + "loss": 0.912, + "step": 8548 + }, + { + "epoch": 0.7638662407577009, + "grad_norm": 0.4762282967567444, + "learning_rate": 1.3918165584707271e-05, + "loss": 0.9661, + "step": 8549 + }, + { + "epoch": 0.7639555922889628, + "grad_norm": 0.5495911836624146, + "learning_rate": 1.3908149424123217e-05, + "loss": 1.0346, + "step": 8550 + }, + { + "epoch": 0.7640449438202247, + "grad_norm": 0.4995447099208832, + "learning_rate": 1.3898136286616364e-05, + "loss": 0.9112, + "step": 8551 + }, + { + "epoch": 0.7641342953514866, + "grad_norm": 0.48613351583480835, + "learning_rate": 1.3888126173025412e-05, + "loss": 0.9372, + "step": 8552 + }, + { + "epoch": 0.7642236468827485, + "grad_norm": 0.502858579158783, + "learning_rate": 1.3878119084188818e-05, + "loss": 0.9525, + "step": 8553 + }, + { + "epoch": 0.7643129984140103, + "grad_norm": 0.4795863628387451, + "learning_rate": 1.3868115020944783e-05, + "loss": 0.9097, + "step": 8554 + }, + { + "epoch": 0.7644023499452722, + "grad_norm": 0.49957504868507385, + "learning_rate": 1.385811398413125e-05, + "loss": 0.8775, + "step": 8555 + }, + { + "epoch": 0.764491701476534, + "grad_norm": 0.5270972847938538, + "learning_rate": 1.3848115974585934e-05, + "loss": 0.9626, + "step": 8556 + }, + { + "epoch": 0.7645810530077959, + "grad_norm": 0.4184163510799408, + "learning_rate": 1.3838120993146243e-05, + "loss": 0.8799, + "step": 8557 + }, + { + "epoch": 0.7646704045390578, + "grad_norm": 0.5301498770713806, + "learning_rate": 1.3828129040649374e-05, + "loss": 0.8824, + "step": 8558 + }, + { + "epoch": 0.7647597560703197, + "grad_norm": 0.4511061906814575, + "learning_rate": 1.381814011793226e-05, + "loss": 0.916, + "step": 8559 + }, + { + "epoch": 0.7648491076015815, + "grad_norm": 0.4901033043861389, + "learning_rate": 1.3808154225831583e-05, + "loss": 0.8872, + "step": 8560 + }, + { + "epoch": 0.7649384591328434, + "grad_norm": 0.45118212699890137, + "learning_rate": 1.3798171365183771e-05, + "loss": 0.9832, + "step": 8561 + }, + { + "epoch": 0.7650278106641053, + "grad_norm": 0.42388466000556946, + "learning_rate": 1.3788191536824984e-05, + "loss": 0.9311, + "step": 8562 + }, + { + "epoch": 0.7651171621953671, + "grad_norm": 0.4067128598690033, + "learning_rate": 1.3778214741591167e-05, + "loss": 0.9677, + "step": 8563 + }, + { + "epoch": 0.765206513726629, + "grad_norm": 0.5313117504119873, + "learning_rate": 1.3768240980317948e-05, + "loss": 0.8713, + "step": 8564 + }, + { + "epoch": 0.7652958652578908, + "grad_norm": 0.40129441022872925, + "learning_rate": 1.3758270253840744e-05, + "loss": 0.9213, + "step": 8565 + }, + { + "epoch": 0.7653852167891527, + "grad_norm": 0.5199376940727234, + "learning_rate": 1.374830256299472e-05, + "loss": 0.9241, + "step": 8566 + }, + { + "epoch": 0.7654745683204146, + "grad_norm": 0.4660845696926117, + "learning_rate": 1.3738337908614768e-05, + "loss": 0.9697, + "step": 8567 + }, + { + "epoch": 0.7655639198516765, + "grad_norm": 0.5757974982261658, + "learning_rate": 1.3728376291535555e-05, + "loss": 0.8874, + "step": 8568 + }, + { + "epoch": 0.7656532713829384, + "grad_norm": 0.45287203788757324, + "learning_rate": 1.3718417712591441e-05, + "loss": 0.9749, + "step": 8569 + }, + { + "epoch": 0.7657426229142001, + "grad_norm": 0.4740961492061615, + "learning_rate": 1.3708462172616577e-05, + "loss": 0.9843, + "step": 8570 + }, + { + "epoch": 0.765831974445462, + "grad_norm": 0.48708444833755493, + "learning_rate": 1.3698509672444843e-05, + "loss": 0.9231, + "step": 8571 + }, + { + "epoch": 0.7659213259767239, + "grad_norm": 0.4899809956550598, + "learning_rate": 1.3688560212909873e-05, + "loss": 0.9621, + "step": 8572 + }, + { + "epoch": 0.7660106775079858, + "grad_norm": 0.4719412922859192, + "learning_rate": 1.3678613794845035e-05, + "loss": 0.9742, + "step": 8573 + }, + { + "epoch": 0.7661000290392477, + "grad_norm": 0.44776198267936707, + "learning_rate": 1.3668670419083457e-05, + "loss": 0.8969, + "step": 8574 + }, + { + "epoch": 0.7661893805705096, + "grad_norm": 0.46503016352653503, + "learning_rate": 1.3658730086457988e-05, + "loss": 0.9827, + "step": 8575 + }, + { + "epoch": 0.7662787321017714, + "grad_norm": 0.5194110870361328, + "learning_rate": 1.3648792797801263e-05, + "loss": 0.959, + "step": 8576 + }, + { + "epoch": 0.7663680836330332, + "grad_norm": 0.46184584498405457, + "learning_rate": 1.36388585539456e-05, + "loss": 0.9375, + "step": 8577 + }, + { + "epoch": 0.7664574351642951, + "grad_norm": 0.416530579328537, + "learning_rate": 1.3628927355723114e-05, + "loss": 0.988, + "step": 8578 + }, + { + "epoch": 0.766546786695557, + "grad_norm": 0.4688471257686615, + "learning_rate": 1.3618999203965654e-05, + "loss": 0.9492, + "step": 8579 + }, + { + "epoch": 0.7666361382268189, + "grad_norm": 0.635256826877594, + "learning_rate": 1.3609074099504798e-05, + "loss": 0.8742, + "step": 8580 + }, + { + "epoch": 0.7667254897580807, + "grad_norm": 0.446486234664917, + "learning_rate": 1.3599152043171893e-05, + "loss": 0.9117, + "step": 8581 + }, + { + "epoch": 0.7668148412893426, + "grad_norm": 0.5363548994064331, + "learning_rate": 1.3589233035798005e-05, + "loss": 0.9194, + "step": 8582 + }, + { + "epoch": 0.7669041928206045, + "grad_norm": 0.5018768906593323, + "learning_rate": 1.357931707821396e-05, + "loss": 0.9498, + "step": 8583 + }, + { + "epoch": 0.7669935443518663, + "grad_norm": 0.43810418248176575, + "learning_rate": 1.3569404171250328e-05, + "loss": 0.9512, + "step": 8584 + }, + { + "epoch": 0.7670828958831282, + "grad_norm": 0.45606479048728943, + "learning_rate": 1.355949431573742e-05, + "loss": 0.9449, + "step": 8585 + }, + { + "epoch": 0.76717224741439, + "grad_norm": 0.5644077062606812, + "learning_rate": 1.354958751250529e-05, + "loss": 0.9936, + "step": 8586 + }, + { + "epoch": 0.7672615989456519, + "grad_norm": 0.483189195394516, + "learning_rate": 1.3539683762383753e-05, + "loss": 0.8959, + "step": 8587 + }, + { + "epoch": 0.7673509504769138, + "grad_norm": 0.497502863407135, + "learning_rate": 1.3529783066202329e-05, + "loss": 0.9242, + "step": 8588 + }, + { + "epoch": 0.7674403020081757, + "grad_norm": 0.5695610046386719, + "learning_rate": 1.3519885424790313e-05, + "loss": 0.8977, + "step": 8589 + }, + { + "epoch": 0.7675296535394376, + "grad_norm": 0.5756702423095703, + "learning_rate": 1.3509990838976744e-05, + "loss": 0.8939, + "step": 8590 + }, + { + "epoch": 0.7676190050706994, + "grad_norm": 0.5338248014450073, + "learning_rate": 1.3500099309590397e-05, + "loss": 0.9609, + "step": 8591 + }, + { + "epoch": 0.7677083566019612, + "grad_norm": 0.46391648054122925, + "learning_rate": 1.3490210837459799e-05, + "loss": 0.9912, + "step": 8592 + }, + { + "epoch": 0.7677977081332231, + "grad_norm": 0.5242441892623901, + "learning_rate": 1.3480325423413204e-05, + "loss": 0.8846, + "step": 8593 + }, + { + "epoch": 0.767887059664485, + "grad_norm": 0.45362526178359985, + "learning_rate": 1.3470443068278626e-05, + "loss": 0.9602, + "step": 8594 + }, + { + "epoch": 0.7679764111957469, + "grad_norm": 0.40796563029289246, + "learning_rate": 1.3460563772883822e-05, + "loss": 0.9368, + "step": 8595 + }, + { + "epoch": 0.7680657627270088, + "grad_norm": 0.481330931186676, + "learning_rate": 1.34506875380563e-05, + "loss": 0.9077, + "step": 8596 + }, + { + "epoch": 0.7681551142582707, + "grad_norm": 0.40942972898483276, + "learning_rate": 1.3440814364623267e-05, + "loss": 1.0025, + "step": 8597 + }, + { + "epoch": 0.7682444657895324, + "grad_norm": 0.533697247505188, + "learning_rate": 1.3430944253411727e-05, + "loss": 0.8807, + "step": 8598 + }, + { + "epoch": 0.7683338173207943, + "grad_norm": 0.4914191961288452, + "learning_rate": 1.34210772052484e-05, + "loss": 0.909, + "step": 8599 + }, + { + "epoch": 0.7684231688520562, + "grad_norm": 0.4288891851902008, + "learning_rate": 1.3411213220959773e-05, + "loss": 0.9436, + "step": 8600 + }, + { + "epoch": 0.7685125203833181, + "grad_norm": 0.4495335519313812, + "learning_rate": 1.340135230137204e-05, + "loss": 0.9656, + "step": 8601 + }, + { + "epoch": 0.76860187191458, + "grad_norm": 0.46838313341140747, + "learning_rate": 1.339149444731116e-05, + "loss": 0.9274, + "step": 8602 + }, + { + "epoch": 0.7686912234458418, + "grad_norm": 0.5495761632919312, + "learning_rate": 1.3381639659602841e-05, + "loss": 0.9106, + "step": 8603 + }, + { + "epoch": 0.7687805749771037, + "grad_norm": 0.4932359755039215, + "learning_rate": 1.3371787939072522e-05, + "loss": 0.9227, + "step": 8604 + }, + { + "epoch": 0.7688699265083655, + "grad_norm": 0.5239490270614624, + "learning_rate": 1.3361939286545388e-05, + "loss": 0.9407, + "step": 8605 + }, + { + "epoch": 0.7689592780396274, + "grad_norm": 0.46050575375556946, + "learning_rate": 1.335209370284638e-05, + "loss": 0.966, + "step": 8606 + }, + { + "epoch": 0.7690486295708893, + "grad_norm": 0.49059468507766724, + "learning_rate": 1.3342251188800175e-05, + "loss": 0.9366, + "step": 8607 + }, + { + "epoch": 0.7691379811021511, + "grad_norm": 0.46661099791526794, + "learning_rate": 1.333241174523116e-05, + "loss": 0.9926, + "step": 8608 + }, + { + "epoch": 0.769227332633413, + "grad_norm": 0.4436168372631073, + "learning_rate": 1.3322575372963515e-05, + "loss": 0.969, + "step": 8609 + }, + { + "epoch": 0.7693166841646749, + "grad_norm": 0.4760691225528717, + "learning_rate": 1.3312742072821127e-05, + "loss": 0.943, + "step": 8610 + }, + { + "epoch": 0.7694060356959367, + "grad_norm": 0.4735669195652008, + "learning_rate": 1.3302911845627658e-05, + "loss": 0.9113, + "step": 8611 + }, + { + "epoch": 0.7694953872271986, + "grad_norm": 0.515501081943512, + "learning_rate": 1.3293084692206476e-05, + "loss": 0.8752, + "step": 8612 + }, + { + "epoch": 0.7695847387584605, + "grad_norm": 0.46182894706726074, + "learning_rate": 1.3283260613380727e-05, + "loss": 0.9165, + "step": 8613 + }, + { + "epoch": 0.7696740902897223, + "grad_norm": 0.5648226737976074, + "learning_rate": 1.3273439609973271e-05, + "loss": 0.9357, + "step": 8614 + }, + { + "epoch": 0.7697634418209842, + "grad_norm": 0.5595985054969788, + "learning_rate": 1.326362168280672e-05, + "loss": 0.8938, + "step": 8615 + }, + { + "epoch": 0.7698527933522461, + "grad_norm": 0.5155600309371948, + "learning_rate": 1.3253806832703437e-05, + "loss": 0.9146, + "step": 8616 + }, + { + "epoch": 0.769942144883508, + "grad_norm": 0.4387071430683136, + "learning_rate": 1.3243995060485537e-05, + "loss": 0.9607, + "step": 8617 + }, + { + "epoch": 0.7700314964147698, + "grad_norm": 0.45591017603874207, + "learning_rate": 1.3234186366974822e-05, + "loss": 0.9637, + "step": 8618 + }, + { + "epoch": 0.7701208479460316, + "grad_norm": 0.47090432047843933, + "learning_rate": 1.3224380752992898e-05, + "loss": 0.9757, + "step": 8619 + }, + { + "epoch": 0.7702101994772935, + "grad_norm": 0.45392322540283203, + "learning_rate": 1.3214578219361085e-05, + "loss": 0.961, + "step": 8620 + }, + { + "epoch": 0.7702995510085554, + "grad_norm": 0.4720221161842346, + "learning_rate": 1.3204778766900445e-05, + "loss": 0.9789, + "step": 8621 + }, + { + "epoch": 0.7703889025398173, + "grad_norm": 0.674929141998291, + "learning_rate": 1.3194982396431798e-05, + "loss": 0.8242, + "step": 8622 + }, + { + "epoch": 0.7704782540710792, + "grad_norm": 0.5851225256919861, + "learning_rate": 1.3185189108775687e-05, + "loss": 0.87, + "step": 8623 + }, + { + "epoch": 0.770567605602341, + "grad_norm": 0.5186543464660645, + "learning_rate": 1.3175398904752407e-05, + "loss": 0.8479, + "step": 8624 + }, + { + "epoch": 0.7706569571336028, + "grad_norm": 0.5100897550582886, + "learning_rate": 1.3165611785181986e-05, + "loss": 0.9498, + "step": 8625 + }, + { + "epoch": 0.7707463086648647, + "grad_norm": 0.43524983525276184, + "learning_rate": 1.3155827750884209e-05, + "loss": 1.0058, + "step": 8626 + }, + { + "epoch": 0.7708356601961266, + "grad_norm": 0.6012685298919678, + "learning_rate": 1.3146046802678602e-05, + "loss": 0.8363, + "step": 8627 + }, + { + "epoch": 0.7709250117273885, + "grad_norm": 0.5322076082229614, + "learning_rate": 1.31362689413844e-05, + "loss": 0.9146, + "step": 8628 + }, + { + "epoch": 0.7710143632586504, + "grad_norm": 0.5446600317955017, + "learning_rate": 1.3126494167820607e-05, + "loss": 0.8362, + "step": 8629 + }, + { + "epoch": 0.7711037147899122, + "grad_norm": 0.4060341417789459, + "learning_rate": 1.311672248280597e-05, + "loss": 1.0057, + "step": 8630 + }, + { + "epoch": 0.7711930663211741, + "grad_norm": 0.45972713828086853, + "learning_rate": 1.310695388715898e-05, + "loss": 0.9607, + "step": 8631 + }, + { + "epoch": 0.7712824178524359, + "grad_norm": 0.4327675402164459, + "learning_rate": 1.3097188381697845e-05, + "loss": 0.9479, + "step": 8632 + }, + { + "epoch": 0.7713717693836978, + "grad_norm": 0.514522135257721, + "learning_rate": 1.3087425967240557e-05, + "loss": 0.9165, + "step": 8633 + }, + { + "epoch": 0.7714611209149597, + "grad_norm": 0.531057596206665, + "learning_rate": 1.307766664460479e-05, + "loss": 0.8906, + "step": 8634 + }, + { + "epoch": 0.7715504724462215, + "grad_norm": 0.5103472471237183, + "learning_rate": 1.3067910414608003e-05, + "loss": 1.0145, + "step": 8635 + }, + { + "epoch": 0.7716398239774834, + "grad_norm": 0.519164502620697, + "learning_rate": 1.305815727806739e-05, + "loss": 0.9374, + "step": 8636 + }, + { + "epoch": 0.7717291755087453, + "grad_norm": 0.5133655071258545, + "learning_rate": 1.3048407235799876e-05, + "loss": 0.9235, + "step": 8637 + }, + { + "epoch": 0.7718185270400072, + "grad_norm": 0.5075699687004089, + "learning_rate": 1.3038660288622145e-05, + "loss": 0.9772, + "step": 8638 + }, + { + "epoch": 0.771907878571269, + "grad_norm": 0.545677900314331, + "learning_rate": 1.302891643735058e-05, + "loss": 0.8661, + "step": 8639 + }, + { + "epoch": 0.7719972301025309, + "grad_norm": 0.5071326494216919, + "learning_rate": 1.3019175682801349e-05, + "loss": 0.8474, + "step": 8640 + }, + { + "epoch": 0.7720865816337927, + "grad_norm": 0.6671762466430664, + "learning_rate": 1.3009438025790337e-05, + "loss": 0.8427, + "step": 8641 + }, + { + "epoch": 0.7721759331650546, + "grad_norm": 0.4839807450771332, + "learning_rate": 1.2999703467133183e-05, + "loss": 0.9872, + "step": 8642 + }, + { + "epoch": 0.7722652846963165, + "grad_norm": 0.5609176754951477, + "learning_rate": 1.2989972007645263e-05, + "loss": 0.9293, + "step": 8643 + }, + { + "epoch": 0.7723546362275784, + "grad_norm": 0.5578341484069824, + "learning_rate": 1.2980243648141682e-05, + "loss": 0.9176, + "step": 8644 + }, + { + "epoch": 0.7724439877588403, + "grad_norm": 0.5111382007598877, + "learning_rate": 1.2970518389437297e-05, + "loss": 0.8884, + "step": 8645 + }, + { + "epoch": 0.772533339290102, + "grad_norm": 0.6080954670906067, + "learning_rate": 1.2960796232346706e-05, + "loss": 0.9019, + "step": 8646 + }, + { + "epoch": 0.7726226908213639, + "grad_norm": 0.4767414927482605, + "learning_rate": 1.295107717768425e-05, + "loss": 0.9232, + "step": 8647 + }, + { + "epoch": 0.7727120423526258, + "grad_norm": 0.4408529996871948, + "learning_rate": 1.2941361226263982e-05, + "loss": 0.9483, + "step": 8648 + }, + { + "epoch": 0.7728013938838877, + "grad_norm": 0.493099182844162, + "learning_rate": 1.293164837889973e-05, + "loss": 1.0169, + "step": 8649 + }, + { + "epoch": 0.7728907454151496, + "grad_norm": 0.554325520992279, + "learning_rate": 1.2921938636405045e-05, + "loss": 0.8812, + "step": 8650 + }, + { + "epoch": 0.7729800969464115, + "grad_norm": 0.513780951499939, + "learning_rate": 1.2912231999593222e-05, + "loss": 0.8577, + "step": 8651 + }, + { + "epoch": 0.7730694484776733, + "grad_norm": 0.4900115132331848, + "learning_rate": 1.2902528469277297e-05, + "loss": 0.9635, + "step": 8652 + }, + { + "epoch": 0.7731588000089351, + "grad_norm": 0.47862446308135986, + "learning_rate": 1.2892828046270038e-05, + "loss": 0.9151, + "step": 8653 + }, + { + "epoch": 0.773248151540197, + "grad_norm": 0.4748653173446655, + "learning_rate": 1.2883130731383969e-05, + "loss": 0.9588, + "step": 8654 + }, + { + "epoch": 0.7733375030714589, + "grad_norm": 0.4709342122077942, + "learning_rate": 1.2873436525431342e-05, + "loss": 0.9934, + "step": 8655 + }, + { + "epoch": 0.7734268546027208, + "grad_norm": 0.4540732800960541, + "learning_rate": 1.2863745429224144e-05, + "loss": 0.9338, + "step": 8656 + }, + { + "epoch": 0.7735162061339826, + "grad_norm": 0.5187547206878662, + "learning_rate": 1.2854057443574124e-05, + "loss": 0.8713, + "step": 8657 + }, + { + "epoch": 0.7736055576652445, + "grad_norm": 0.522544801235199, + "learning_rate": 1.2844372569292723e-05, + "loss": 0.8775, + "step": 8658 + }, + { + "epoch": 0.7736949091965064, + "grad_norm": 0.6091160774230957, + "learning_rate": 1.2834690807191174e-05, + "loss": 0.9253, + "step": 8659 + }, + { + "epoch": 0.7737842607277682, + "grad_norm": 0.5314405560493469, + "learning_rate": 1.2825012158080424e-05, + "loss": 0.9408, + "step": 8660 + }, + { + "epoch": 0.7738736122590301, + "grad_norm": 0.507613480091095, + "learning_rate": 1.2815336622771157e-05, + "loss": 0.9501, + "step": 8661 + }, + { + "epoch": 0.773962963790292, + "grad_norm": 0.4508688151836395, + "learning_rate": 1.2805664202073814e-05, + "loss": 1.0068, + "step": 8662 + }, + { + "epoch": 0.7740523153215538, + "grad_norm": 0.4795742332935333, + "learning_rate": 1.2795994896798551e-05, + "loss": 0.958, + "step": 8663 + }, + { + "epoch": 0.7741416668528157, + "grad_norm": 0.6187158226966858, + "learning_rate": 1.278632870775529e-05, + "loss": 0.8805, + "step": 8664 + }, + { + "epoch": 0.7742310183840776, + "grad_norm": 0.6057202816009521, + "learning_rate": 1.2776665635753665e-05, + "loss": 0.8507, + "step": 8665 + }, + { + "epoch": 0.7743203699153395, + "grad_norm": 0.43509769439697266, + "learning_rate": 1.2767005681603078e-05, + "loss": 0.9527, + "step": 8666 + }, + { + "epoch": 0.7744097214466013, + "grad_norm": 0.5171564817428589, + "learning_rate": 1.2757348846112626e-05, + "loss": 1.0171, + "step": 8667 + }, + { + "epoch": 0.7744990729778631, + "grad_norm": 0.5768764615058899, + "learning_rate": 1.2747695130091185e-05, + "loss": 0.912, + "step": 8668 + }, + { + "epoch": 0.774588424509125, + "grad_norm": 0.6042607426643372, + "learning_rate": 1.2738044534347365e-05, + "loss": 0.943, + "step": 8669 + }, + { + "epoch": 0.7746777760403869, + "grad_norm": 0.4685840606689453, + "learning_rate": 1.2728397059689495e-05, + "loss": 0.942, + "step": 8670 + }, + { + "epoch": 0.7747671275716488, + "grad_norm": 0.626491367816925, + "learning_rate": 1.271875270692567e-05, + "loss": 0.8608, + "step": 8671 + }, + { + "epoch": 0.7748564791029107, + "grad_norm": 0.5503062605857849, + "learning_rate": 1.2709111476863683e-05, + "loss": 0.8855, + "step": 8672 + }, + { + "epoch": 0.7749458306341724, + "grad_norm": 0.41604703664779663, + "learning_rate": 1.2699473370311099e-05, + "loss": 0.9734, + "step": 8673 + }, + { + "epoch": 0.7750351821654343, + "grad_norm": 0.48552244901657104, + "learning_rate": 1.268983838807522e-05, + "loss": 0.9216, + "step": 8674 + }, + { + "epoch": 0.7751245336966962, + "grad_norm": 0.45172011852264404, + "learning_rate": 1.2680206530963073e-05, + "loss": 0.9134, + "step": 8675 + }, + { + "epoch": 0.7752138852279581, + "grad_norm": 0.4474288523197174, + "learning_rate": 1.267057779978143e-05, + "loss": 1.0127, + "step": 8676 + }, + { + "epoch": 0.77530323675922, + "grad_norm": 0.5544458031654358, + "learning_rate": 1.2660952195336795e-05, + "loss": 0.8255, + "step": 8677 + }, + { + "epoch": 0.7753925882904819, + "grad_norm": 0.4368109703063965, + "learning_rate": 1.265132971843544e-05, + "loss": 0.9037, + "step": 8678 + }, + { + "epoch": 0.7754819398217437, + "grad_norm": 0.5210118889808655, + "learning_rate": 1.2641710369883308e-05, + "loss": 0.915, + "step": 8679 + }, + { + "epoch": 0.7755712913530055, + "grad_norm": 0.5763956308364868, + "learning_rate": 1.2632094150486146e-05, + "loss": 1.0095, + "step": 8680 + }, + { + "epoch": 0.7756606428842674, + "grad_norm": 0.5526300072669983, + "learning_rate": 1.2622481061049413e-05, + "loss": 0.9442, + "step": 8681 + }, + { + "epoch": 0.7757499944155293, + "grad_norm": 0.44834303855895996, + "learning_rate": 1.2612871102378304e-05, + "loss": 0.898, + "step": 8682 + }, + { + "epoch": 0.7758393459467912, + "grad_norm": 0.5319566130638123, + "learning_rate": 1.2603264275277766e-05, + "loss": 0.8959, + "step": 8683 + }, + { + "epoch": 0.775928697478053, + "grad_norm": 0.45926499366760254, + "learning_rate": 1.2593660580552457e-05, + "loss": 0.9541, + "step": 8684 + }, + { + "epoch": 0.7760180490093149, + "grad_norm": 0.4919742941856384, + "learning_rate": 1.2584060019006799e-05, + "loss": 0.8658, + "step": 8685 + }, + { + "epoch": 0.7761074005405768, + "grad_norm": 0.42770183086395264, + "learning_rate": 1.257446259144494e-05, + "loss": 0.9669, + "step": 8686 + }, + { + "epoch": 0.7761967520718386, + "grad_norm": 0.626804530620575, + "learning_rate": 1.2564868298670773e-05, + "loss": 0.8515, + "step": 8687 + }, + { + "epoch": 0.7762861036031005, + "grad_norm": 0.5289289355278015, + "learning_rate": 1.2555277141487925e-05, + "loss": 0.8717, + "step": 8688 + }, + { + "epoch": 0.7763754551343623, + "grad_norm": 0.3943426311016083, + "learning_rate": 1.2545689120699733e-05, + "loss": 0.9667, + "step": 8689 + }, + { + "epoch": 0.7764648066656242, + "grad_norm": 0.44126003980636597, + "learning_rate": 1.2536104237109314e-05, + "loss": 0.9189, + "step": 8690 + }, + { + "epoch": 0.7765541581968861, + "grad_norm": 0.5304616689682007, + "learning_rate": 1.2526522491519499e-05, + "loss": 0.891, + "step": 8691 + }, + { + "epoch": 0.776643509728148, + "grad_norm": 0.5561332106590271, + "learning_rate": 1.2516943884732862e-05, + "loss": 0.9357, + "step": 8692 + }, + { + "epoch": 0.7767328612594099, + "grad_norm": 0.48594895005226135, + "learning_rate": 1.2507368417551717e-05, + "loss": 0.8873, + "step": 8693 + }, + { + "epoch": 0.7768222127906717, + "grad_norm": 0.48896369338035583, + "learning_rate": 1.2497796090778113e-05, + "loss": 0.9618, + "step": 8694 + }, + { + "epoch": 0.7769115643219335, + "grad_norm": 0.5779229998588562, + "learning_rate": 1.2488226905213829e-05, + "loss": 0.9589, + "step": 8695 + }, + { + "epoch": 0.7770009158531954, + "grad_norm": 0.4492420256137848, + "learning_rate": 1.247866086166039e-05, + "loss": 0.9654, + "step": 8696 + }, + { + "epoch": 0.7770902673844573, + "grad_norm": 0.45466622710227966, + "learning_rate": 1.2469097960919052e-05, + "loss": 0.9783, + "step": 8697 + }, + { + "epoch": 0.7771796189157192, + "grad_norm": 0.4977153241634369, + "learning_rate": 1.2459538203790822e-05, + "loss": 0.9013, + "step": 8698 + }, + { + "epoch": 0.7772689704469811, + "grad_norm": 0.5779181122779846, + "learning_rate": 1.244998159107641e-05, + "loss": 0.8386, + "step": 8699 + }, + { + "epoch": 0.777358321978243, + "grad_norm": 0.4691259264945984, + "learning_rate": 1.2440428123576286e-05, + "loss": 1.0294, + "step": 8700 + }, + { + "epoch": 0.7774476735095047, + "grad_norm": 0.4812089502811432, + "learning_rate": 1.2430877802090674e-05, + "loss": 0.9459, + "step": 8701 + }, + { + "epoch": 0.7775370250407666, + "grad_norm": 0.5185266137123108, + "learning_rate": 1.2421330627419498e-05, + "loss": 0.8558, + "step": 8702 + }, + { + "epoch": 0.7776263765720285, + "grad_norm": 0.4775238037109375, + "learning_rate": 1.2411786600362457e-05, + "loss": 0.8715, + "step": 8703 + }, + { + "epoch": 0.7777157281032904, + "grad_norm": 0.5433192253112793, + "learning_rate": 1.2402245721718935e-05, + "loss": 0.9181, + "step": 8704 + }, + { + "epoch": 0.7778050796345523, + "grad_norm": 0.47637078166007996, + "learning_rate": 1.2392707992288095e-05, + "loss": 0.9268, + "step": 8705 + }, + { + "epoch": 0.7778944311658141, + "grad_norm": 0.45217010378837585, + "learning_rate": 1.2383173412868832e-05, + "loss": 0.9737, + "step": 8706 + }, + { + "epoch": 0.777983782697076, + "grad_norm": 0.40835416316986084, + "learning_rate": 1.2373641984259754e-05, + "loss": 1.0122, + "step": 8707 + }, + { + "epoch": 0.7780731342283378, + "grad_norm": 0.43482106924057007, + "learning_rate": 1.2364113707259251e-05, + "loss": 1.0017, + "step": 8708 + }, + { + "epoch": 0.7781624857595997, + "grad_norm": 0.4076036214828491, + "learning_rate": 1.2354588582665371e-05, + "loss": 0.9542, + "step": 8709 + }, + { + "epoch": 0.7782518372908616, + "grad_norm": 0.5018858313560486, + "learning_rate": 1.2345066611275973e-05, + "loss": 0.9303, + "step": 8710 + }, + { + "epoch": 0.7783411888221234, + "grad_norm": 0.45527756214141846, + "learning_rate": 1.2335547793888619e-05, + "loss": 1.0528, + "step": 8711 + }, + { + "epoch": 0.7784305403533853, + "grad_norm": 0.46973609924316406, + "learning_rate": 1.2326032131300613e-05, + "loss": 0.9795, + "step": 8712 + }, + { + "epoch": 0.7785198918846472, + "grad_norm": 0.46930214762687683, + "learning_rate": 1.2316519624308991e-05, + "loss": 0.8755, + "step": 8713 + }, + { + "epoch": 0.7786092434159091, + "grad_norm": 0.4111507534980774, + "learning_rate": 1.2307010273710528e-05, + "loss": 0.9699, + "step": 8714 + }, + { + "epoch": 0.7786985949471709, + "grad_norm": 0.5797132849693298, + "learning_rate": 1.229750408030173e-05, + "loss": 0.9161, + "step": 8715 + }, + { + "epoch": 0.7787879464784327, + "grad_norm": 0.5236135721206665, + "learning_rate": 1.2288001044878849e-05, + "loss": 0.9461, + "step": 8716 + }, + { + "epoch": 0.7788772980096946, + "grad_norm": 0.42989420890808105, + "learning_rate": 1.227850116823786e-05, + "loss": 0.973, + "step": 8717 + }, + { + "epoch": 0.7789666495409565, + "grad_norm": 0.5021857619285583, + "learning_rate": 1.2269004451174493e-05, + "loss": 0.9027, + "step": 8718 + }, + { + "epoch": 0.7790560010722184, + "grad_norm": 0.5190892219543457, + "learning_rate": 1.2259510894484173e-05, + "loss": 0.8612, + "step": 8719 + }, + { + "epoch": 0.7791453526034803, + "grad_norm": 0.4393077492713928, + "learning_rate": 1.22500204989621e-05, + "loss": 0.8991, + "step": 8720 + }, + { + "epoch": 0.7792347041347422, + "grad_norm": 0.5465923547744751, + "learning_rate": 1.2240533265403198e-05, + "loss": 0.8785, + "step": 8721 + }, + { + "epoch": 0.7793240556660039, + "grad_norm": 0.5483127236366272, + "learning_rate": 1.2231049194602122e-05, + "loss": 0.8618, + "step": 8722 + }, + { + "epoch": 0.7794134071972658, + "grad_norm": 0.4444792866706848, + "learning_rate": 1.2221568287353263e-05, + "loss": 0.9536, + "step": 8723 + }, + { + "epoch": 0.7795027587285277, + "grad_norm": 0.5139243006706238, + "learning_rate": 1.221209054445075e-05, + "loss": 0.8762, + "step": 8724 + }, + { + "epoch": 0.7795921102597896, + "grad_norm": 0.49245816469192505, + "learning_rate": 1.2202615966688442e-05, + "loss": 0.9376, + "step": 8725 + }, + { + "epoch": 0.7796814617910515, + "grad_norm": 0.5652595162391663, + "learning_rate": 1.2193144554859937e-05, + "loss": 0.8839, + "step": 8726 + }, + { + "epoch": 0.7797708133223133, + "grad_norm": 0.679634153842926, + "learning_rate": 1.2183676309758574e-05, + "loss": 0.8982, + "step": 8727 + }, + { + "epoch": 0.7798601648535752, + "grad_norm": 0.4609330892562866, + "learning_rate": 1.2174211232177419e-05, + "loss": 0.9649, + "step": 8728 + }, + { + "epoch": 0.779949516384837, + "grad_norm": 0.5099778771400452, + "learning_rate": 1.2164749322909257e-05, + "loss": 0.8654, + "step": 8729 + }, + { + "epoch": 0.7800388679160989, + "grad_norm": 0.5093337297439575, + "learning_rate": 1.2155290582746636e-05, + "loss": 0.9262, + "step": 8730 + }, + { + "epoch": 0.7801282194473608, + "grad_norm": 0.5493488907814026, + "learning_rate": 1.214583501248182e-05, + "loss": 0.9284, + "step": 8731 + }, + { + "epoch": 0.7802175709786227, + "grad_norm": 0.4473147392272949, + "learning_rate": 1.2136382612906822e-05, + "loss": 1.0044, + "step": 8732 + }, + { + "epoch": 0.7803069225098845, + "grad_norm": 0.4741600751876831, + "learning_rate": 1.2126933384813378e-05, + "loss": 0.913, + "step": 8733 + }, + { + "epoch": 0.7803962740411464, + "grad_norm": 0.42708808183670044, + "learning_rate": 1.2117487328992955e-05, + "loss": 0.9394, + "step": 8734 + }, + { + "epoch": 0.7804856255724082, + "grad_norm": 0.5554439425468445, + "learning_rate": 1.210804444623677e-05, + "loss": 0.9425, + "step": 8735 + }, + { + "epoch": 0.7805749771036701, + "grad_norm": 0.5091964602470398, + "learning_rate": 1.2098604737335778e-05, + "loss": 0.9622, + "step": 8736 + }, + { + "epoch": 0.780664328634932, + "grad_norm": 0.5323701500892639, + "learning_rate": 1.2089168203080625e-05, + "loss": 0.9593, + "step": 8737 + }, + { + "epoch": 0.7807536801661938, + "grad_norm": 0.5752742290496826, + "learning_rate": 1.207973484426173e-05, + "loss": 0.8755, + "step": 8738 + }, + { + "epoch": 0.7808430316974557, + "grad_norm": 0.4725722074508667, + "learning_rate": 1.2070304661669251e-05, + "loss": 1.0083, + "step": 8739 + }, + { + "epoch": 0.7809323832287176, + "grad_norm": 0.5769376754760742, + "learning_rate": 1.2060877656093051e-05, + "loss": 0.9272, + "step": 8740 + }, + { + "epoch": 0.7810217347599795, + "grad_norm": 0.5163832902908325, + "learning_rate": 1.2051453828322768e-05, + "loss": 0.9612, + "step": 8741 + }, + { + "epoch": 0.7811110862912413, + "grad_norm": 0.5350446105003357, + "learning_rate": 1.204203317914771e-05, + "loss": 0.9202, + "step": 8742 + }, + { + "epoch": 0.7812004378225031, + "grad_norm": 0.5735968351364136, + "learning_rate": 1.2032615709356981e-05, + "loss": 0.9389, + "step": 8743 + }, + { + "epoch": 0.781289789353765, + "grad_norm": 0.5735962390899658, + "learning_rate": 1.2023201419739389e-05, + "loss": 0.9112, + "step": 8744 + }, + { + "epoch": 0.7813791408850269, + "grad_norm": 0.5810607671737671, + "learning_rate": 1.2013790311083478e-05, + "loss": 0.9489, + "step": 8745 + }, + { + "epoch": 0.7814684924162888, + "grad_norm": 0.554716944694519, + "learning_rate": 1.2004382384177537e-05, + "loss": 0.9159, + "step": 8746 + }, + { + "epoch": 0.7815578439475507, + "grad_norm": 0.5520256757736206, + "learning_rate": 1.1994977639809574e-05, + "loss": 0.8593, + "step": 8747 + }, + { + "epoch": 0.7816471954788126, + "grad_norm": 0.4506448805332184, + "learning_rate": 1.1985576078767352e-05, + "loss": 0.9468, + "step": 8748 + }, + { + "epoch": 0.7817365470100743, + "grad_norm": 0.40401577949523926, + "learning_rate": 1.197617770183832e-05, + "loss": 0.9659, + "step": 8749 + }, + { + "epoch": 0.7818258985413362, + "grad_norm": 0.4281572699546814, + "learning_rate": 1.1966782509809715e-05, + "loss": 0.9421, + "step": 8750 + }, + { + "epoch": 0.7819152500725981, + "grad_norm": 0.4709114134311676, + "learning_rate": 1.195739050346848e-05, + "loss": 0.975, + "step": 8751 + }, + { + "epoch": 0.78200460160386, + "grad_norm": 0.4489923119544983, + "learning_rate": 1.1948001683601295e-05, + "loss": 1.0054, + "step": 8752 + }, + { + "epoch": 0.7820939531351219, + "grad_norm": 0.4930829107761383, + "learning_rate": 1.1938616050994572e-05, + "loss": 0.9004, + "step": 8753 + }, + { + "epoch": 0.7821833046663837, + "grad_norm": 0.5570724606513977, + "learning_rate": 1.192923360643446e-05, + "loss": 0.8782, + "step": 8754 + }, + { + "epoch": 0.7822726561976456, + "grad_norm": 0.44626376032829285, + "learning_rate": 1.1919854350706838e-05, + "loss": 0.9346, + "step": 8755 + }, + { + "epoch": 0.7823620077289074, + "grad_norm": 0.39036697149276733, + "learning_rate": 1.1910478284597321e-05, + "loss": 0.9688, + "step": 8756 + }, + { + "epoch": 0.7824513592601693, + "grad_norm": 0.5225417017936707, + "learning_rate": 1.1901105408891256e-05, + "loss": 0.8374, + "step": 8757 + }, + { + "epoch": 0.7825407107914312, + "grad_norm": 0.43829452991485596, + "learning_rate": 1.1891735724373726e-05, + "loss": 0.9661, + "step": 8758 + }, + { + "epoch": 0.782630062322693, + "grad_norm": 0.5867989659309387, + "learning_rate": 1.1882369231829526e-05, + "loss": 0.9218, + "step": 8759 + }, + { + "epoch": 0.7827194138539549, + "grad_norm": 0.46479299664497375, + "learning_rate": 1.1873005932043202e-05, + "loss": 0.94, + "step": 8760 + }, + { + "epoch": 0.7828087653852168, + "grad_norm": 0.543161153793335, + "learning_rate": 1.1863645825799042e-05, + "loss": 0.9706, + "step": 8761 + }, + { + "epoch": 0.7828981169164787, + "grad_norm": 0.4378313720226288, + "learning_rate": 1.185428891388104e-05, + "loss": 0.9379, + "step": 8762 + }, + { + "epoch": 0.7829874684477405, + "grad_norm": 0.44382184743881226, + "learning_rate": 1.1844935197072954e-05, + "loss": 0.9562, + "step": 8763 + }, + { + "epoch": 0.7830768199790024, + "grad_norm": 0.5598951578140259, + "learning_rate": 1.183558467615824e-05, + "loss": 0.8841, + "step": 8764 + }, + { + "epoch": 0.7831661715102642, + "grad_norm": 0.4546869695186615, + "learning_rate": 1.1826237351920123e-05, + "loss": 0.8926, + "step": 8765 + }, + { + "epoch": 0.7832555230415261, + "grad_norm": 0.3868406414985657, + "learning_rate": 1.1816893225141523e-05, + "loss": 1.0242, + "step": 8766 + }, + { + "epoch": 0.783344874572788, + "grad_norm": 0.4522659480571747, + "learning_rate": 1.1807552296605118e-05, + "loss": 0.9198, + "step": 8767 + }, + { + "epoch": 0.7834342261040499, + "grad_norm": 0.47773608565330505, + "learning_rate": 1.1798214567093313e-05, + "loss": 0.96, + "step": 8768 + }, + { + "epoch": 0.7835235776353118, + "grad_norm": 0.42124301195144653, + "learning_rate": 1.1788880037388256e-05, + "loss": 1.0142, + "step": 8769 + }, + { + "epoch": 0.7836129291665735, + "grad_norm": 0.4496397376060486, + "learning_rate": 1.1779548708271782e-05, + "loss": 0.964, + "step": 8770 + }, + { + "epoch": 0.7837022806978354, + "grad_norm": 0.4569949209690094, + "learning_rate": 1.1770220580525504e-05, + "loss": 0.9092, + "step": 8771 + }, + { + "epoch": 0.7837916322290973, + "grad_norm": 0.6700246334075928, + "learning_rate": 1.1760895654930748e-05, + "loss": 0.8237, + "step": 8772 + }, + { + "epoch": 0.7838809837603592, + "grad_norm": 0.4912089407444, + "learning_rate": 1.175157393226859e-05, + "loss": 0.8981, + "step": 8773 + }, + { + "epoch": 0.7839703352916211, + "grad_norm": 0.5014065504074097, + "learning_rate": 1.174225541331982e-05, + "loss": 0.9595, + "step": 8774 + }, + { + "epoch": 0.784059686822883, + "grad_norm": 0.5120930671691895, + "learning_rate": 1.1732940098864947e-05, + "loss": 0.9345, + "step": 8775 + }, + { + "epoch": 0.7841490383541448, + "grad_norm": 0.45890259742736816, + "learning_rate": 1.1723627989684239e-05, + "loss": 1.0086, + "step": 8776 + }, + { + "epoch": 0.7842383898854066, + "grad_norm": 0.43002086877822876, + "learning_rate": 1.171431908655768e-05, + "loss": 1.0457, + "step": 8777 + }, + { + "epoch": 0.7843277414166685, + "grad_norm": 0.4677255153656006, + "learning_rate": 1.1705013390264995e-05, + "loss": 0.8648, + "step": 8778 + }, + { + "epoch": 0.7844170929479304, + "grad_norm": 0.5233368277549744, + "learning_rate": 1.169571090158565e-05, + "loss": 0.8994, + "step": 8779 + }, + { + "epoch": 0.7845064444791923, + "grad_norm": 0.4638941287994385, + "learning_rate": 1.1686411621298793e-05, + "loss": 0.9106, + "step": 8780 + }, + { + "epoch": 0.7845957960104541, + "grad_norm": 0.4682579040527344, + "learning_rate": 1.167711555018336e-05, + "loss": 0.9533, + "step": 8781 + }, + { + "epoch": 0.784685147541716, + "grad_norm": 0.47554928064346313, + "learning_rate": 1.1667822689017988e-05, + "loss": 0.9157, + "step": 8782 + }, + { + "epoch": 0.7847744990729779, + "grad_norm": 0.5877280235290527, + "learning_rate": 1.165853303858106e-05, + "loss": 0.926, + "step": 8783 + }, + { + "epoch": 0.7848638506042397, + "grad_norm": 0.47542765736579895, + "learning_rate": 1.1649246599650681e-05, + "loss": 0.952, + "step": 8784 + }, + { + "epoch": 0.7849532021355016, + "grad_norm": 0.5808351635932922, + "learning_rate": 1.1639963373004691e-05, + "loss": 0.8716, + "step": 8785 + }, + { + "epoch": 0.7850425536667635, + "grad_norm": 0.4650845527648926, + "learning_rate": 1.1630683359420652e-05, + "loss": 0.9614, + "step": 8786 + }, + { + "epoch": 0.7851319051980253, + "grad_norm": 0.43621885776519775, + "learning_rate": 1.1621406559675873e-05, + "loss": 0.8881, + "step": 8787 + }, + { + "epoch": 0.7852212567292872, + "grad_norm": 0.47639355063438416, + "learning_rate": 1.1612132974547379e-05, + "loss": 0.9544, + "step": 8788 + }, + { + "epoch": 0.7853106082605491, + "grad_norm": 0.6018533706665039, + "learning_rate": 1.1602862604811955e-05, + "loss": 0.8597, + "step": 8789 + }, + { + "epoch": 0.785399959791811, + "grad_norm": 0.44231539964675903, + "learning_rate": 1.1593595451246047e-05, + "loss": 0.9587, + "step": 8790 + }, + { + "epoch": 0.7854893113230728, + "grad_norm": 0.5324105024337769, + "learning_rate": 1.1584331514625912e-05, + "loss": 0.9463, + "step": 8791 + }, + { + "epoch": 0.7855786628543346, + "grad_norm": 0.5201995372772217, + "learning_rate": 1.1575070795727489e-05, + "loss": 0.9776, + "step": 8792 + }, + { + "epoch": 0.7856680143855965, + "grad_norm": 0.45011529326438904, + "learning_rate": 1.1565813295326466e-05, + "loss": 0.9372, + "step": 8793 + }, + { + "epoch": 0.7857573659168584, + "grad_norm": 0.5564170479774475, + "learning_rate": 1.155655901419826e-05, + "loss": 0.9709, + "step": 8794 + }, + { + "epoch": 0.7858467174481203, + "grad_norm": 0.4589083790779114, + "learning_rate": 1.1547307953118014e-05, + "loss": 0.9465, + "step": 8795 + }, + { + "epoch": 0.7859360689793822, + "grad_norm": 0.5712124705314636, + "learning_rate": 1.1538060112860604e-05, + "loss": 0.8866, + "step": 8796 + }, + { + "epoch": 0.786025420510644, + "grad_norm": 0.45000290870666504, + "learning_rate": 1.1528815494200634e-05, + "loss": 0.9641, + "step": 8797 + }, + { + "epoch": 0.7861147720419058, + "grad_norm": 0.5889281630516052, + "learning_rate": 1.1519574097912444e-05, + "loss": 0.8468, + "step": 8798 + }, + { + "epoch": 0.7862041235731677, + "grad_norm": 0.5337108373641968, + "learning_rate": 1.1510335924770105e-05, + "loss": 0.9437, + "step": 8799 + }, + { + "epoch": 0.7862934751044296, + "grad_norm": 0.4237595498561859, + "learning_rate": 1.1501100975547385e-05, + "loss": 0.9497, + "step": 8800 + }, + { + "epoch": 0.7863828266356915, + "grad_norm": 0.4925912916660309, + "learning_rate": 1.1491869251017834e-05, + "loss": 0.945, + "step": 8801 + }, + { + "epoch": 0.7864721781669534, + "grad_norm": 0.6141911745071411, + "learning_rate": 1.1482640751954699e-05, + "loss": 0.8556, + "step": 8802 + }, + { + "epoch": 0.7865615296982152, + "grad_norm": 0.43323883414268494, + "learning_rate": 1.1473415479130962e-05, + "loss": 0.9278, + "step": 8803 + }, + { + "epoch": 0.786650881229477, + "grad_norm": 0.44862401485443115, + "learning_rate": 1.1464193433319347e-05, + "loss": 0.9978, + "step": 8804 + }, + { + "epoch": 0.7867402327607389, + "grad_norm": 0.46563851833343506, + "learning_rate": 1.145497461529229e-05, + "loss": 0.9032, + "step": 8805 + }, + { + "epoch": 0.7868295842920008, + "grad_norm": 0.6122609972953796, + "learning_rate": 1.144575902582199e-05, + "loss": 0.953, + "step": 8806 + }, + { + "epoch": 0.7869189358232627, + "grad_norm": 0.629047691822052, + "learning_rate": 1.1436546665680309e-05, + "loss": 0.945, + "step": 8807 + }, + { + "epoch": 0.7870082873545245, + "grad_norm": 0.47389715909957886, + "learning_rate": 1.1427337535638905e-05, + "loss": 0.9537, + "step": 8808 + }, + { + "epoch": 0.7870976388857864, + "grad_norm": 0.4402306079864502, + "learning_rate": 1.1418131636469137e-05, + "loss": 1.0046, + "step": 8809 + }, + { + "epoch": 0.7871869904170483, + "grad_norm": 0.4482784867286682, + "learning_rate": 1.1408928968942095e-05, + "loss": 0.8711, + "step": 8810 + }, + { + "epoch": 0.7872763419483101, + "grad_norm": 0.4512721598148346, + "learning_rate": 1.1399729533828623e-05, + "loss": 0.9093, + "step": 8811 + }, + { + "epoch": 0.787365693479572, + "grad_norm": 0.5375701189041138, + "learning_rate": 1.1390533331899234e-05, + "loss": 0.892, + "step": 8812 + }, + { + "epoch": 0.7874550450108339, + "grad_norm": 0.44732967019081116, + "learning_rate": 1.1381340363924226e-05, + "loss": 0.9631, + "step": 8813 + }, + { + "epoch": 0.7875443965420957, + "grad_norm": 0.5341693758964539, + "learning_rate": 1.1372150630673606e-05, + "loss": 0.8947, + "step": 8814 + }, + { + "epoch": 0.7876337480733576, + "grad_norm": 0.5837507247924805, + "learning_rate": 1.136296413291712e-05, + "loss": 0.9357, + "step": 8815 + }, + { + "epoch": 0.7877230996046195, + "grad_norm": 0.49844542145729065, + "learning_rate": 1.1353780871424225e-05, + "loss": 0.9402, + "step": 8816 + }, + { + "epoch": 0.7878124511358814, + "grad_norm": 0.5053272247314453, + "learning_rate": 1.134460084696412e-05, + "loss": 0.916, + "step": 8817 + }, + { + "epoch": 0.7879018026671432, + "grad_norm": 0.47650039196014404, + "learning_rate": 1.1335424060305733e-05, + "loss": 0.9513, + "step": 8818 + }, + { + "epoch": 0.787991154198405, + "grad_norm": 0.5099322199821472, + "learning_rate": 1.1326250512217728e-05, + "loss": 0.9191, + "step": 8819 + }, + { + "epoch": 0.7880805057296669, + "grad_norm": 0.6279272437095642, + "learning_rate": 1.1317080203468462e-05, + "loss": 0.935, + "step": 8820 + }, + { + "epoch": 0.7881698572609288, + "grad_norm": 0.5648776888847351, + "learning_rate": 1.1307913134826059e-05, + "loss": 0.8982, + "step": 8821 + }, + { + "epoch": 0.7882592087921907, + "grad_norm": 0.6467058062553406, + "learning_rate": 1.1298749307058359e-05, + "loss": 0.9648, + "step": 8822 + }, + { + "epoch": 0.7883485603234526, + "grad_norm": 0.5014479756355286, + "learning_rate": 1.1289588720932931e-05, + "loss": 0.928, + "step": 8823 + }, + { + "epoch": 0.7884379118547145, + "grad_norm": 0.5449494123458862, + "learning_rate": 1.1280431377217071e-05, + "loss": 1.0022, + "step": 8824 + }, + { + "epoch": 0.7885272633859762, + "grad_norm": 0.46424680948257446, + "learning_rate": 1.1271277276677805e-05, + "loss": 0.8855, + "step": 8825 + }, + { + "epoch": 0.7886166149172381, + "grad_norm": 0.548555314540863, + "learning_rate": 1.1262126420081887e-05, + "loss": 0.8627, + "step": 8826 + }, + { + "epoch": 0.7887059664485, + "grad_norm": 0.44086822867393494, + "learning_rate": 1.125297880819579e-05, + "loss": 0.9702, + "step": 8827 + }, + { + "epoch": 0.7887953179797619, + "grad_norm": 0.4535292088985443, + "learning_rate": 1.124383444178574e-05, + "loss": 0.8851, + "step": 8828 + }, + { + "epoch": 0.7888846695110238, + "grad_norm": 0.5612303614616394, + "learning_rate": 1.1234693321617673e-05, + "loss": 0.9546, + "step": 8829 + }, + { + "epoch": 0.7889740210422856, + "grad_norm": 0.467670738697052, + "learning_rate": 1.1225555448457242e-05, + "loss": 0.9503, + "step": 8830 + }, + { + "epoch": 0.7890633725735475, + "grad_norm": 0.4849298298358917, + "learning_rate": 1.1216420823069846e-05, + "loss": 0.9047, + "step": 8831 + }, + { + "epoch": 0.7891527241048093, + "grad_norm": 0.41470497846603394, + "learning_rate": 1.1207289446220604e-05, + "loss": 0.9613, + "step": 8832 + }, + { + "epoch": 0.7892420756360712, + "grad_norm": 0.4530922472476959, + "learning_rate": 1.1198161318674377e-05, + "loss": 0.9708, + "step": 8833 + }, + { + "epoch": 0.7893314271673331, + "grad_norm": 0.422997385263443, + "learning_rate": 1.1189036441195733e-05, + "loss": 0.9678, + "step": 8834 + }, + { + "epoch": 0.789420778698595, + "grad_norm": 0.5136018395423889, + "learning_rate": 1.1179914814548986e-05, + "loss": 0.9182, + "step": 8835 + }, + { + "epoch": 0.7895101302298568, + "grad_norm": 0.529691755771637, + "learning_rate": 1.1170796439498166e-05, + "loss": 0.8935, + "step": 8836 + }, + { + "epoch": 0.7895994817611187, + "grad_norm": 0.47432875633239746, + "learning_rate": 1.1161681316807032e-05, + "loss": 0.9239, + "step": 8837 + }, + { + "epoch": 0.7896888332923806, + "grad_norm": 0.563213586807251, + "learning_rate": 1.1152569447239075e-05, + "loss": 0.9184, + "step": 8838 + }, + { + "epoch": 0.7897781848236424, + "grad_norm": 0.4394233524799347, + "learning_rate": 1.1143460831557523e-05, + "loss": 0.9732, + "step": 8839 + }, + { + "epoch": 0.7898675363549043, + "grad_norm": 0.46250519156455994, + "learning_rate": 1.1134355470525293e-05, + "loss": 0.9073, + "step": 8840 + }, + { + "epoch": 0.7899568878861661, + "grad_norm": 0.43355268239974976, + "learning_rate": 1.1125253364905075e-05, + "loss": 0.9525, + "step": 8841 + }, + { + "epoch": 0.790046239417428, + "grad_norm": 0.4754622280597687, + "learning_rate": 1.111615451545926e-05, + "loss": 0.946, + "step": 8842 + }, + { + "epoch": 0.7901355909486899, + "grad_norm": 0.47392573952674866, + "learning_rate": 1.1107058922949975e-05, + "loss": 0.9907, + "step": 8843 + }, + { + "epoch": 0.7902249424799518, + "grad_norm": 0.4125828742980957, + "learning_rate": 1.1097966588139092e-05, + "loss": 0.9411, + "step": 8844 + }, + { + "epoch": 0.7903142940112137, + "grad_norm": 0.5055714249610901, + "learning_rate": 1.1088877511788154e-05, + "loss": 0.9096, + "step": 8845 + }, + { + "epoch": 0.7904036455424754, + "grad_norm": 0.5957999229431152, + "learning_rate": 1.1079791694658487e-05, + "loss": 0.922, + "step": 8846 + }, + { + "epoch": 0.7904929970737373, + "grad_norm": 0.48999646306037903, + "learning_rate": 1.1070709137511132e-05, + "loss": 0.9453, + "step": 8847 + }, + { + "epoch": 0.7905823486049992, + "grad_norm": 0.40712419152259827, + "learning_rate": 1.1061629841106836e-05, + "loss": 0.9806, + "step": 8848 + }, + { + "epoch": 0.7906717001362611, + "grad_norm": 0.5603836178779602, + "learning_rate": 1.1052553806206094e-05, + "loss": 1.014, + "step": 8849 + }, + { + "epoch": 0.790761051667523, + "grad_norm": 0.467892587184906, + "learning_rate": 1.1043481033569142e-05, + "loss": 0.9254, + "step": 8850 + }, + { + "epoch": 0.7908504031987849, + "grad_norm": 0.5536115169525146, + "learning_rate": 1.103441152395588e-05, + "loss": 0.9557, + "step": 8851 + }, + { + "epoch": 0.7909397547300467, + "grad_norm": 0.4258303940296173, + "learning_rate": 1.1025345278125998e-05, + "loss": 0.9526, + "step": 8852 + }, + { + "epoch": 0.7910291062613085, + "grad_norm": 0.4268374443054199, + "learning_rate": 1.1016282296838887e-05, + "loss": 0.9622, + "step": 8853 + }, + { + "epoch": 0.7911184577925704, + "grad_norm": 0.525230348110199, + "learning_rate": 1.100722258085367e-05, + "loss": 0.9293, + "step": 8854 + }, + { + "epoch": 0.7912078093238323, + "grad_norm": 0.49180522561073303, + "learning_rate": 1.0998166130929199e-05, + "loss": 1.0217, + "step": 8855 + }, + { + "epoch": 0.7912971608550942, + "grad_norm": 0.4953739643096924, + "learning_rate": 1.0989112947824043e-05, + "loss": 0.9168, + "step": 8856 + }, + { + "epoch": 0.791386512386356, + "grad_norm": 0.4349702298641205, + "learning_rate": 1.0980063032296501e-05, + "loss": 0.9029, + "step": 8857 + }, + { + "epoch": 0.7914758639176179, + "grad_norm": 0.4538852870464325, + "learning_rate": 1.0971016385104604e-05, + "loss": 0.9189, + "step": 8858 + }, + { + "epoch": 0.7915652154488797, + "grad_norm": 0.466751366853714, + "learning_rate": 1.0961973007006105e-05, + "loss": 0.941, + "step": 8859 + }, + { + "epoch": 0.7916545669801416, + "grad_norm": 0.5299864411354065, + "learning_rate": 1.0952932898758494e-05, + "loss": 0.9471, + "step": 8860 + }, + { + "epoch": 0.7917439185114035, + "grad_norm": 0.4934852719306946, + "learning_rate": 1.0943896061118953e-05, + "loss": 0.8567, + "step": 8861 + }, + { + "epoch": 0.7918332700426653, + "grad_norm": 0.442080557346344, + "learning_rate": 1.0934862494844427e-05, + "loss": 0.9867, + "step": 8862 + }, + { + "epoch": 0.7919226215739272, + "grad_norm": 0.5051554441452026, + "learning_rate": 1.0925832200691566e-05, + "loss": 0.9596, + "step": 8863 + }, + { + "epoch": 0.7920119731051891, + "grad_norm": 0.4397631883621216, + "learning_rate": 1.0916805179416761e-05, + "loss": 0.9463, + "step": 8864 + }, + { + "epoch": 0.792101324636451, + "grad_norm": 0.4761650264263153, + "learning_rate": 1.0907781431776121e-05, + "loss": 0.9427, + "step": 8865 + }, + { + "epoch": 0.7921906761677128, + "grad_norm": 0.5251712203025818, + "learning_rate": 1.0898760958525479e-05, + "loss": 0.9143, + "step": 8866 + }, + { + "epoch": 0.7922800276989747, + "grad_norm": 0.4629380404949188, + "learning_rate": 1.0889743760420396e-05, + "loss": 0.9407, + "step": 8867 + }, + { + "epoch": 0.7923693792302365, + "grad_norm": 0.3857233226299286, + "learning_rate": 1.0880729838216153e-05, + "loss": 0.9492, + "step": 8868 + }, + { + "epoch": 0.7924587307614984, + "grad_norm": 0.5180653929710388, + "learning_rate": 1.0871719192667773e-05, + "loss": 0.8831, + "step": 8869 + }, + { + "epoch": 0.7925480822927603, + "grad_norm": 0.49814748764038086, + "learning_rate": 1.0862711824530003e-05, + "loss": 0.9292, + "step": 8870 + }, + { + "epoch": 0.7926374338240222, + "grad_norm": 0.4596767723560333, + "learning_rate": 1.0853707734557272e-05, + "loss": 0.9822, + "step": 8871 + }, + { + "epoch": 0.7927267853552841, + "grad_norm": 0.4747507870197296, + "learning_rate": 1.084470692350379e-05, + "loss": 0.9528, + "step": 8872 + }, + { + "epoch": 0.7928161368865458, + "grad_norm": 0.5321727395057678, + "learning_rate": 1.0835709392123472e-05, + "loss": 0.9908, + "step": 8873 + }, + { + "epoch": 0.7929054884178077, + "grad_norm": 0.529675304889679, + "learning_rate": 1.0826715141169947e-05, + "loss": 0.9526, + "step": 8874 + }, + { + "epoch": 0.7929948399490696, + "grad_norm": 0.4251299798488617, + "learning_rate": 1.0817724171396592e-05, + "loss": 0.9465, + "step": 8875 + }, + { + "epoch": 0.7930841914803315, + "grad_norm": 0.44601455330848694, + "learning_rate": 1.0808736483556487e-05, + "loss": 0.9579, + "step": 8876 + }, + { + "epoch": 0.7931735430115934, + "grad_norm": 0.5946657061576843, + "learning_rate": 1.0799752078402469e-05, + "loss": 0.9116, + "step": 8877 + }, + { + "epoch": 0.7932628945428553, + "grad_norm": 0.460638165473938, + "learning_rate": 1.079077095668704e-05, + "loss": 0.9722, + "step": 8878 + }, + { + "epoch": 0.7933522460741171, + "grad_norm": 0.4943525493144989, + "learning_rate": 1.0781793119162487e-05, + "loss": 0.8539, + "step": 8879 + }, + { + "epoch": 0.7934415976053789, + "grad_norm": 0.45845288038253784, + "learning_rate": 1.0772818566580795e-05, + "loss": 0.9393, + "step": 8880 + }, + { + "epoch": 0.7935309491366408, + "grad_norm": 0.515739917755127, + "learning_rate": 1.0763847299693691e-05, + "loss": 0.8696, + "step": 8881 + }, + { + "epoch": 0.7936203006679027, + "grad_norm": 0.5935304164886475, + "learning_rate": 1.0754879319252591e-05, + "loss": 1.0198, + "step": 8882 + }, + { + "epoch": 0.7937096521991646, + "grad_norm": 0.4660301208496094, + "learning_rate": 1.0745914626008675e-05, + "loss": 0.8689, + "step": 8883 + }, + { + "epoch": 0.7937990037304264, + "grad_norm": 0.47837138175964355, + "learning_rate": 1.0736953220712826e-05, + "loss": 0.8852, + "step": 8884 + }, + { + "epoch": 0.7938883552616883, + "grad_norm": 0.4837789535522461, + "learning_rate": 1.0727995104115656e-05, + "loss": 0.8776, + "step": 8885 + }, + { + "epoch": 0.7939777067929502, + "grad_norm": 0.6152971386909485, + "learning_rate": 1.0719040276967507e-05, + "loss": 0.8065, + "step": 8886 + }, + { + "epoch": 0.794067058324212, + "grad_norm": 0.532853364944458, + "learning_rate": 1.0710088740018437e-05, + "loss": 0.8949, + "step": 8887 + }, + { + "epoch": 0.7941564098554739, + "grad_norm": 0.4451427161693573, + "learning_rate": 1.0701140494018242e-05, + "loss": 0.9532, + "step": 8888 + }, + { + "epoch": 0.7942457613867357, + "grad_norm": 0.4042229652404785, + "learning_rate": 1.0692195539716421e-05, + "loss": 0.9972, + "step": 8889 + }, + { + "epoch": 0.7943351129179976, + "grad_norm": 0.47671109437942505, + "learning_rate": 1.0683253877862225e-05, + "loss": 0.9228, + "step": 8890 + }, + { + "epoch": 0.7944244644492595, + "grad_norm": 0.4329127073287964, + "learning_rate": 1.0674315509204596e-05, + "loss": 0.93, + "step": 8891 + }, + { + "epoch": 0.7945138159805214, + "grad_norm": 0.47998568415641785, + "learning_rate": 1.0665380434492223e-05, + "loss": 0.9536, + "step": 8892 + }, + { + "epoch": 0.7946031675117833, + "grad_norm": 0.5138132572174072, + "learning_rate": 1.0656448654473517e-05, + "loss": 0.9208, + "step": 8893 + }, + { + "epoch": 0.794692519043045, + "grad_norm": 0.48235902190208435, + "learning_rate": 1.0647520169896607e-05, + "loss": 0.8611, + "step": 8894 + }, + { + "epoch": 0.7947818705743069, + "grad_norm": 0.394228458404541, + "learning_rate": 1.0638594981509348e-05, + "loss": 0.974, + "step": 8895 + }, + { + "epoch": 0.7948712221055688, + "grad_norm": 0.4859960675239563, + "learning_rate": 1.0629673090059322e-05, + "loss": 0.9407, + "step": 8896 + }, + { + "epoch": 0.7949605736368307, + "grad_norm": 0.5173121690750122, + "learning_rate": 1.0620754496293833e-05, + "loss": 1.0492, + "step": 8897 + }, + { + "epoch": 0.7950499251680926, + "grad_norm": 0.453870564699173, + "learning_rate": 1.061183920095991e-05, + "loss": 0.9226, + "step": 8898 + }, + { + "epoch": 0.7951392766993545, + "grad_norm": 0.44933760166168213, + "learning_rate": 1.0602927204804296e-05, + "loss": 0.9376, + "step": 8899 + }, + { + "epoch": 0.7952286282306164, + "grad_norm": 0.48174723982810974, + "learning_rate": 1.0594018508573489e-05, + "loss": 0.9127, + "step": 8900 + }, + { + "epoch": 0.7953179797618781, + "grad_norm": 0.5078961253166199, + "learning_rate": 1.0585113113013657e-05, + "loss": 0.8392, + "step": 8901 + }, + { + "epoch": 0.79540733129314, + "grad_norm": 0.4766399562358856, + "learning_rate": 1.0576211018870736e-05, + "loss": 0.9298, + "step": 8902 + }, + { + "epoch": 0.7954966828244019, + "grad_norm": 0.5817358493804932, + "learning_rate": 1.0567312226890364e-05, + "loss": 0.9257, + "step": 8903 + }, + { + "epoch": 0.7955860343556638, + "grad_norm": 0.49117544293403625, + "learning_rate": 1.0558416737817916e-05, + "loss": 0.9214, + "step": 8904 + }, + { + "epoch": 0.7956753858869257, + "grad_norm": 0.46288153529167175, + "learning_rate": 1.0549524552398488e-05, + "loss": 0.9006, + "step": 8905 + }, + { + "epoch": 0.7957647374181875, + "grad_norm": 0.4867618978023529, + "learning_rate": 1.0540635671376885e-05, + "loss": 0.9344, + "step": 8906 + }, + { + "epoch": 0.7958540889494494, + "grad_norm": 0.5538333058357239, + "learning_rate": 1.053175009549765e-05, + "loss": 0.7893, + "step": 8907 + }, + { + "epoch": 0.7959434404807112, + "grad_norm": 0.4785383939743042, + "learning_rate": 1.0522867825505051e-05, + "loss": 0.9209, + "step": 8908 + }, + { + "epoch": 0.7960327920119731, + "grad_norm": 0.5016849637031555, + "learning_rate": 1.0513988862143082e-05, + "loss": 0.9328, + "step": 8909 + }, + { + "epoch": 0.796122143543235, + "grad_norm": 0.4911724030971527, + "learning_rate": 1.050511320615542e-05, + "loss": 0.9874, + "step": 8910 + }, + { + "epoch": 0.7962114950744968, + "grad_norm": 0.5558311939239502, + "learning_rate": 1.0496240858285511e-05, + "loss": 0.9098, + "step": 8911 + }, + { + "epoch": 0.7963008466057587, + "grad_norm": 0.6585413217544556, + "learning_rate": 1.0487371819276509e-05, + "loss": 0.9099, + "step": 8912 + }, + { + "epoch": 0.7963901981370206, + "grad_norm": 0.5155404806137085, + "learning_rate": 1.0478506089871293e-05, + "loss": 0.9004, + "step": 8913 + }, + { + "epoch": 0.7964795496682825, + "grad_norm": 0.4853888154029846, + "learning_rate": 1.0469643670812479e-05, + "loss": 0.9659, + "step": 8914 + }, + { + "epoch": 0.7965689011995443, + "grad_norm": 0.5303243398666382, + "learning_rate": 1.0460784562842351e-05, + "loss": 0.8745, + "step": 8915 + }, + { + "epoch": 0.7966582527308061, + "grad_norm": 0.40654778480529785, + "learning_rate": 1.0451928766702979e-05, + "loss": 0.9293, + "step": 8916 + }, + { + "epoch": 0.796747604262068, + "grad_norm": 0.47125494480133057, + "learning_rate": 1.0443076283136122e-05, + "loss": 0.9285, + "step": 8917 + }, + { + "epoch": 0.7968369557933299, + "grad_norm": 0.5250342488288879, + "learning_rate": 1.043422711288327e-05, + "loss": 0.9306, + "step": 8918 + }, + { + "epoch": 0.7969263073245918, + "grad_norm": 0.495063841342926, + "learning_rate": 1.042538125668564e-05, + "loss": 0.9515, + "step": 8919 + }, + { + "epoch": 0.7970156588558537, + "grad_norm": 0.5038367509841919, + "learning_rate": 1.041653871528418e-05, + "loss": 0.9353, + "step": 8920 + }, + { + "epoch": 0.7971050103871156, + "grad_norm": 0.40401169657707214, + "learning_rate": 1.040769948941952e-05, + "loss": 0.9591, + "step": 8921 + }, + { + "epoch": 0.7971943619183773, + "grad_norm": 0.5238271951675415, + "learning_rate": 1.0398863579832047e-05, + "loss": 0.9321, + "step": 8922 + }, + { + "epoch": 0.7972837134496392, + "grad_norm": 0.5171613097190857, + "learning_rate": 1.0390030987261866e-05, + "loss": 0.957, + "step": 8923 + }, + { + "epoch": 0.7973730649809011, + "grad_norm": 0.49921807646751404, + "learning_rate": 1.0381201712448806e-05, + "loss": 0.94, + "step": 8924 + }, + { + "epoch": 0.797462416512163, + "grad_norm": 0.6377806067466736, + "learning_rate": 1.0372375756132408e-05, + "loss": 0.8859, + "step": 8925 + }, + { + "epoch": 0.7975517680434249, + "grad_norm": 0.4824712872505188, + "learning_rate": 1.036355311905194e-05, + "loss": 0.9774, + "step": 8926 + }, + { + "epoch": 0.7976411195746868, + "grad_norm": 0.5878000855445862, + "learning_rate": 1.0354733801946393e-05, + "loss": 0.91, + "step": 8927 + }, + { + "epoch": 0.7977304711059485, + "grad_norm": 0.5659178495407104, + "learning_rate": 1.0345917805554478e-05, + "loss": 0.9632, + "step": 8928 + }, + { + "epoch": 0.7978198226372104, + "grad_norm": 0.5241888165473938, + "learning_rate": 1.0337105130614627e-05, + "loss": 0.9609, + "step": 8929 + }, + { + "epoch": 0.7979091741684723, + "grad_norm": 0.4879510998725891, + "learning_rate": 1.0328295777865016e-05, + "loss": 0.8878, + "step": 8930 + }, + { + "epoch": 0.7979985256997342, + "grad_norm": 0.5221325159072876, + "learning_rate": 1.0319489748043486e-05, + "loss": 0.8727, + "step": 8931 + }, + { + "epoch": 0.7980878772309961, + "grad_norm": 0.4458564519882202, + "learning_rate": 1.0310687041887652e-05, + "loss": 0.8967, + "step": 8932 + }, + { + "epoch": 0.7981772287622579, + "grad_norm": 0.6000443696975708, + "learning_rate": 1.0301887660134841e-05, + "loss": 0.91, + "step": 8933 + }, + { + "epoch": 0.7982665802935198, + "grad_norm": 0.4376446008682251, + "learning_rate": 1.0293091603522081e-05, + "loss": 0.9492, + "step": 8934 + }, + { + "epoch": 0.7983559318247816, + "grad_norm": 0.4281294643878937, + "learning_rate": 1.0284298872786153e-05, + "loss": 0.9444, + "step": 8935 + }, + { + "epoch": 0.7984452833560435, + "grad_norm": 0.5830131769180298, + "learning_rate": 1.027550946866353e-05, + "loss": 0.8794, + "step": 8936 + }, + { + "epoch": 0.7985346348873054, + "grad_norm": 0.45756039023399353, + "learning_rate": 1.026672339189042e-05, + "loss": 0.9598, + "step": 8937 + }, + { + "epoch": 0.7986239864185672, + "grad_norm": 0.42528459429740906, + "learning_rate": 1.0257940643202757e-05, + "loss": 1.0198, + "step": 8938 + }, + { + "epoch": 0.7987133379498291, + "grad_norm": 0.4692322015762329, + "learning_rate": 1.0249161223336185e-05, + "loss": 1.0496, + "step": 8939 + }, + { + "epoch": 0.798802689481091, + "grad_norm": 0.5667924284934998, + "learning_rate": 1.024038513302607e-05, + "loss": 0.913, + "step": 8940 + }, + { + "epoch": 0.7988920410123529, + "grad_norm": 0.47331857681274414, + "learning_rate": 1.0231612373007521e-05, + "loss": 0.9722, + "step": 8941 + }, + { + "epoch": 0.7989813925436147, + "grad_norm": 0.4151710867881775, + "learning_rate": 1.0222842944015326e-05, + "loss": 0.9374, + "step": 8942 + }, + { + "epoch": 0.7990707440748765, + "grad_norm": 0.582118570804596, + "learning_rate": 1.0214076846784027e-05, + "loss": 0.9451, + "step": 8943 + }, + { + "epoch": 0.7991600956061384, + "grad_norm": 0.46437790989875793, + "learning_rate": 1.0205314082047879e-05, + "loss": 0.9203, + "step": 8944 + }, + { + "epoch": 0.7992494471374003, + "grad_norm": 0.4480501711368561, + "learning_rate": 1.0196554650540857e-05, + "loss": 0.9466, + "step": 8945 + }, + { + "epoch": 0.7993387986686622, + "grad_norm": 0.5746645331382751, + "learning_rate": 1.0187798552996653e-05, + "loss": 0.8555, + "step": 8946 + }, + { + "epoch": 0.7994281501999241, + "grad_norm": 0.4533185064792633, + "learning_rate": 1.0179045790148707e-05, + "loss": 0.9209, + "step": 8947 + }, + { + "epoch": 0.799517501731186, + "grad_norm": 0.4475330412387848, + "learning_rate": 1.0170296362730125e-05, + "loss": 0.9418, + "step": 8948 + }, + { + "epoch": 0.7996068532624477, + "grad_norm": 0.4576277434825897, + "learning_rate": 1.0161550271473774e-05, + "loss": 0.9764, + "step": 8949 + }, + { + "epoch": 0.7996962047937096, + "grad_norm": 0.5483040809631348, + "learning_rate": 1.0152807517112233e-05, + "loss": 0.9551, + "step": 8950 + }, + { + "epoch": 0.7997855563249715, + "grad_norm": 0.49113574624061584, + "learning_rate": 1.0144068100377818e-05, + "loss": 0.9638, + "step": 8951 + }, + { + "epoch": 0.7998749078562334, + "grad_norm": 0.4609823226928711, + "learning_rate": 1.013533202200252e-05, + "loss": 0.949, + "step": 8952 + }, + { + "epoch": 0.7999642593874953, + "grad_norm": 0.3998883068561554, + "learning_rate": 1.0126599282718096e-05, + "loss": 1.0431, + "step": 8953 + }, + { + "epoch": 0.8000536109187572, + "grad_norm": 0.6044648289680481, + "learning_rate": 1.0117869883255998e-05, + "loss": 0.8911, + "step": 8954 + }, + { + "epoch": 0.800142962450019, + "grad_norm": 0.4625280499458313, + "learning_rate": 1.0109143824347411e-05, + "loss": 0.9561, + "step": 8955 + }, + { + "epoch": 0.8002323139812808, + "grad_norm": 0.45291417837142944, + "learning_rate": 1.0100421106723234e-05, + "loss": 0.9568, + "step": 8956 + }, + { + "epoch": 0.8003216655125427, + "grad_norm": 0.4366665780544281, + "learning_rate": 1.009170173111409e-05, + "loss": 1.0054, + "step": 8957 + }, + { + "epoch": 0.8004110170438046, + "grad_norm": 0.5513303875923157, + "learning_rate": 1.0082985698250313e-05, + "loss": 0.9255, + "step": 8958 + }, + { + "epoch": 0.8005003685750665, + "grad_norm": 0.42455315589904785, + "learning_rate": 1.0074273008861973e-05, + "loss": 0.9084, + "step": 8959 + }, + { + "epoch": 0.8005897201063283, + "grad_norm": 0.4331103563308716, + "learning_rate": 1.0065563663678845e-05, + "loss": 0.9705, + "step": 8960 + }, + { + "epoch": 0.8006790716375902, + "grad_norm": 0.4532991349697113, + "learning_rate": 1.005685766343044e-05, + "loss": 0.8854, + "step": 8961 + }, + { + "epoch": 0.8007684231688521, + "grad_norm": 0.4681682288646698, + "learning_rate": 1.0048155008845962e-05, + "loss": 0.9396, + "step": 8962 + }, + { + "epoch": 0.8008577747001139, + "grad_norm": 0.4511670172214508, + "learning_rate": 1.003945570065436e-05, + "loss": 0.8475, + "step": 8963 + }, + { + "epoch": 0.8009471262313758, + "grad_norm": 0.475967675447464, + "learning_rate": 1.0030759739584284e-05, + "loss": 0.9374, + "step": 8964 + }, + { + "epoch": 0.8010364777626376, + "grad_norm": 0.547709047794342, + "learning_rate": 1.0022067126364126e-05, + "loss": 1.0038, + "step": 8965 + }, + { + "epoch": 0.8011258292938995, + "grad_norm": 0.44402167201042175, + "learning_rate": 1.001337786172198e-05, + "loss": 0.9718, + "step": 8966 + }, + { + "epoch": 0.8012151808251614, + "grad_norm": 0.40516430139541626, + "learning_rate": 1.0004691946385663e-05, + "loss": 1.0515, + "step": 8967 + }, + { + "epoch": 0.8013045323564233, + "grad_norm": 0.4301491975784302, + "learning_rate": 9.996009381082717e-06, + "loss": 0.9238, + "step": 8968 + }, + { + "epoch": 0.8013938838876852, + "grad_norm": 0.5374286770820618, + "learning_rate": 9.987330166540398e-06, + "loss": 0.9712, + "step": 8969 + }, + { + "epoch": 0.801483235418947, + "grad_norm": 0.453789085149765, + "learning_rate": 9.97865430348568e-06, + "loss": 0.9511, + "step": 8970 + }, + { + "epoch": 0.8015725869502088, + "grad_norm": 0.5180380344390869, + "learning_rate": 9.969981792645273e-06, + "loss": 0.8728, + "step": 8971 + }, + { + "epoch": 0.8016619384814707, + "grad_norm": 0.46428194642066956, + "learning_rate": 9.961312634745567e-06, + "loss": 0.95, + "step": 8972 + }, + { + "epoch": 0.8017512900127326, + "grad_norm": 0.5181485414505005, + "learning_rate": 9.952646830512712e-06, + "loss": 0.9228, + "step": 8973 + }, + { + "epoch": 0.8018406415439945, + "grad_norm": 0.44874072074890137, + "learning_rate": 9.94398438067256e-06, + "loss": 0.9679, + "step": 8974 + }, + { + "epoch": 0.8019299930752564, + "grad_norm": 0.47210460901260376, + "learning_rate": 9.93532528595068e-06, + "loss": 0.9655, + "step": 8975 + }, + { + "epoch": 0.8020193446065182, + "grad_norm": 0.5403384566307068, + "learning_rate": 9.926669547072364e-06, + "loss": 0.9232, + "step": 8976 + }, + { + "epoch": 0.80210869613778, + "grad_norm": 0.4292760193347931, + "learning_rate": 9.918017164762627e-06, + "loss": 0.9782, + "step": 8977 + }, + { + "epoch": 0.8021980476690419, + "grad_norm": 0.4274854362010956, + "learning_rate": 9.909368139746194e-06, + "loss": 0.9455, + "step": 8978 + }, + { + "epoch": 0.8022873992003038, + "grad_norm": 0.4722135663032532, + "learning_rate": 9.90072247274752e-06, + "loss": 1.0058, + "step": 8979 + }, + { + "epoch": 0.8023767507315657, + "grad_norm": 0.4381501376628876, + "learning_rate": 9.892080164490774e-06, + "loss": 0.9518, + "step": 8980 + }, + { + "epoch": 0.8024661022628276, + "grad_norm": 0.488213449716568, + "learning_rate": 9.883441215699823e-06, + "loss": 0.9261, + "step": 8981 + }, + { + "epoch": 0.8025554537940894, + "grad_norm": 0.5511339902877808, + "learning_rate": 9.874805627098282e-06, + "loss": 0.9793, + "step": 8982 + }, + { + "epoch": 0.8026448053253513, + "grad_norm": 0.5285826921463013, + "learning_rate": 9.866173399409474e-06, + "loss": 0.9814, + "step": 8983 + }, + { + "epoch": 0.8027341568566131, + "grad_norm": 0.5300377607345581, + "learning_rate": 9.857544533356456e-06, + "loss": 1.0165, + "step": 8984 + }, + { + "epoch": 0.802823508387875, + "grad_norm": 0.4703865051269531, + "learning_rate": 9.848919029661952e-06, + "loss": 0.8869, + "step": 8985 + }, + { + "epoch": 0.8029128599191369, + "grad_norm": 0.5108640193939209, + "learning_rate": 9.840296889048462e-06, + "loss": 0.9011, + "step": 8986 + }, + { + "epoch": 0.8030022114503987, + "grad_norm": 0.49105626344680786, + "learning_rate": 9.831678112238173e-06, + "loss": 0.8887, + "step": 8987 + }, + { + "epoch": 0.8030915629816606, + "grad_norm": 0.5277919769287109, + "learning_rate": 9.823062699953012e-06, + "loss": 0.944, + "step": 8988 + }, + { + "epoch": 0.8031809145129225, + "grad_norm": 0.4541589021682739, + "learning_rate": 9.81445065291461e-06, + "loss": 0.9229, + "step": 8989 + }, + { + "epoch": 0.8032702660441843, + "grad_norm": 0.4479905366897583, + "learning_rate": 9.805841971844305e-06, + "loss": 0.9027, + "step": 8990 + }, + { + "epoch": 0.8033596175754462, + "grad_norm": 0.4373217225074768, + "learning_rate": 9.797236657463188e-06, + "loss": 0.9373, + "step": 8991 + }, + { + "epoch": 0.803448969106708, + "grad_norm": 0.407236248254776, + "learning_rate": 9.788634710492017e-06, + "loss": 0.9314, + "step": 8992 + }, + { + "epoch": 0.8035383206379699, + "grad_norm": 0.5327048301696777, + "learning_rate": 9.780036131651315e-06, + "loss": 0.9155, + "step": 8993 + }, + { + "epoch": 0.8036276721692318, + "grad_norm": 0.5354421138763428, + "learning_rate": 9.7714409216613e-06, + "loss": 0.8684, + "step": 8994 + }, + { + "epoch": 0.8037170237004937, + "grad_norm": 0.5037555694580078, + "learning_rate": 9.762849081241915e-06, + "loss": 0.9443, + "step": 8995 + }, + { + "epoch": 0.8038063752317556, + "grad_norm": 0.5367538928985596, + "learning_rate": 9.754260611112814e-06, + "loss": 0.9278, + "step": 8996 + }, + { + "epoch": 0.8038957267630173, + "grad_norm": 0.4558742046356201, + "learning_rate": 9.74567551199338e-06, + "loss": 0.848, + "step": 8997 + }, + { + "epoch": 0.8039850782942792, + "grad_norm": 0.45363011956214905, + "learning_rate": 9.737093784602697e-06, + "loss": 0.9489, + "step": 8998 + }, + { + "epoch": 0.8040744298255411, + "grad_norm": 0.4492855668067932, + "learning_rate": 9.728515429659585e-06, + "loss": 0.9211, + "step": 8999 + }, + { + "epoch": 0.804163781356803, + "grad_norm": 0.44268351793289185, + "learning_rate": 9.719940447882563e-06, + "loss": 0.9763, + "step": 9000 + }, + { + "epoch": 0.8042531328880649, + "grad_norm": 0.4664279520511627, + "learning_rate": 9.711368839989905e-06, + "loss": 0.9177, + "step": 9001 + }, + { + "epoch": 0.8043424844193268, + "grad_norm": 0.4780477285385132, + "learning_rate": 9.70280060669953e-06, + "loss": 0.8817, + "step": 9002 + }, + { + "epoch": 0.8044318359505886, + "grad_norm": 0.5579407811164856, + "learning_rate": 9.694235748729146e-06, + "loss": 0.8823, + "step": 9003 + }, + { + "epoch": 0.8045211874818504, + "grad_norm": 0.47675004601478577, + "learning_rate": 9.685674266796146e-06, + "loss": 0.9631, + "step": 9004 + }, + { + "epoch": 0.8046105390131123, + "grad_norm": 0.3818524479866028, + "learning_rate": 9.67711616161765e-06, + "loss": 0.937, + "step": 9005 + }, + { + "epoch": 0.8046998905443742, + "grad_norm": 0.4102923572063446, + "learning_rate": 9.668561433910484e-06, + "loss": 0.9733, + "step": 9006 + }, + { + "epoch": 0.8047892420756361, + "grad_norm": 0.41497570276260376, + "learning_rate": 9.660010084391197e-06, + "loss": 1.0069, + "step": 9007 + }, + { + "epoch": 0.804878593606898, + "grad_norm": 0.6005346775054932, + "learning_rate": 9.65146211377606e-06, + "loss": 0.925, + "step": 9008 + }, + { + "epoch": 0.8049679451381598, + "grad_norm": 0.5480002164840698, + "learning_rate": 9.642917522781058e-06, + "loss": 0.9914, + "step": 9009 + }, + { + "epoch": 0.8050572966694217, + "grad_norm": 0.5394856333732605, + "learning_rate": 9.63437631212189e-06, + "loss": 0.8734, + "step": 9010 + }, + { + "epoch": 0.8051466482006835, + "grad_norm": 0.48310962319374084, + "learning_rate": 9.625838482513983e-06, + "loss": 1.0201, + "step": 9011 + }, + { + "epoch": 0.8052359997319454, + "grad_norm": 0.485075443983078, + "learning_rate": 9.617304034672448e-06, + "loss": 0.916, + "step": 9012 + }, + { + "epoch": 0.8053253512632073, + "grad_norm": 0.5205706357955933, + "learning_rate": 9.608772969312147e-06, + "loss": 0.9511, + "step": 9013 + }, + { + "epoch": 0.8054147027944691, + "grad_norm": 0.48902761936187744, + "learning_rate": 9.600245287147652e-06, + "loss": 0.896, + "step": 9014 + }, + { + "epoch": 0.805504054325731, + "grad_norm": 0.5032730102539062, + "learning_rate": 9.591720988893244e-06, + "loss": 0.9751, + "step": 9015 + }, + { + "epoch": 0.8055934058569929, + "grad_norm": 0.537691593170166, + "learning_rate": 9.583200075262921e-06, + "loss": 0.983, + "step": 9016 + }, + { + "epoch": 0.8056827573882548, + "grad_norm": 0.5454861521720886, + "learning_rate": 9.57468254697042e-06, + "loss": 0.9127, + "step": 9017 + }, + { + "epoch": 0.8057721089195166, + "grad_norm": 0.4477491080760956, + "learning_rate": 9.566168404729148e-06, + "loss": 0.995, + "step": 9018 + }, + { + "epoch": 0.8058614604507784, + "grad_norm": 0.5274454951286316, + "learning_rate": 9.55765764925226e-06, + "loss": 0.8299, + "step": 9019 + }, + { + "epoch": 0.8059508119820403, + "grad_norm": 0.5348902940750122, + "learning_rate": 9.549150281252633e-06, + "loss": 0.9079, + "step": 9020 + }, + { + "epoch": 0.8060401635133022, + "grad_norm": 0.5387945771217346, + "learning_rate": 9.54064630144284e-06, + "loss": 0.9053, + "step": 9021 + }, + { + "epoch": 0.8061295150445641, + "grad_norm": 0.5068916082382202, + "learning_rate": 9.532145710535207e-06, + "loss": 0.926, + "step": 9022 + }, + { + "epoch": 0.806218866575826, + "grad_norm": 0.39943739771842957, + "learning_rate": 9.523648509241706e-06, + "loss": 0.9773, + "step": 9023 + }, + { + "epoch": 0.8063082181070879, + "grad_norm": 0.676214873790741, + "learning_rate": 9.515154698274093e-06, + "loss": 0.8005, + "step": 9024 + }, + { + "epoch": 0.8063975696383496, + "grad_norm": 0.45418787002563477, + "learning_rate": 9.506664278343808e-06, + "loss": 0.9662, + "step": 9025 + }, + { + "epoch": 0.8064869211696115, + "grad_norm": 0.5260841846466064, + "learning_rate": 9.498177250162022e-06, + "loss": 1.0117, + "step": 9026 + }, + { + "epoch": 0.8065762727008734, + "grad_norm": 0.502400815486908, + "learning_rate": 9.489693614439605e-06, + "loss": 0.8963, + "step": 9027 + }, + { + "epoch": 0.8066656242321353, + "grad_norm": 0.5157961249351501, + "learning_rate": 9.48121337188716e-06, + "loss": 0.9942, + "step": 9028 + }, + { + "epoch": 0.8067549757633972, + "grad_norm": 0.6219414472579956, + "learning_rate": 9.472736523214993e-06, + "loss": 0.9617, + "step": 9029 + }, + { + "epoch": 0.806844327294659, + "grad_norm": 0.41551846265792847, + "learning_rate": 9.464263069133134e-06, + "loss": 0.9622, + "step": 9030 + }, + { + "epoch": 0.8069336788259209, + "grad_norm": 0.454323947429657, + "learning_rate": 9.455793010351321e-06, + "loss": 0.9332, + "step": 9031 + }, + { + "epoch": 0.8070230303571827, + "grad_norm": 0.4970146119594574, + "learning_rate": 9.447326347579028e-06, + "loss": 0.9591, + "step": 9032 + }, + { + "epoch": 0.8071123818884446, + "grad_norm": 0.44244319200515747, + "learning_rate": 9.438863081525396e-06, + "loss": 0.9451, + "step": 9033 + }, + { + "epoch": 0.8072017334197065, + "grad_norm": 0.47928622364997864, + "learning_rate": 9.430403212899336e-06, + "loss": 0.8969, + "step": 9034 + }, + { + "epoch": 0.8072910849509684, + "grad_norm": 0.45787954330444336, + "learning_rate": 9.421946742409448e-06, + "loss": 0.9851, + "step": 9035 + }, + { + "epoch": 0.8073804364822302, + "grad_norm": 0.603981077671051, + "learning_rate": 9.41349367076405e-06, + "loss": 0.9052, + "step": 9036 + }, + { + "epoch": 0.8074697880134921, + "grad_norm": 0.47599250078201294, + "learning_rate": 9.405043998671176e-06, + "loss": 0.9525, + "step": 9037 + }, + { + "epoch": 0.807559139544754, + "grad_norm": 0.4036829471588135, + "learning_rate": 9.39659772683858e-06, + "loss": 0.9176, + "step": 9038 + }, + { + "epoch": 0.8076484910760158, + "grad_norm": 0.49439674615859985, + "learning_rate": 9.38815485597373e-06, + "loss": 0.9366, + "step": 9039 + }, + { + "epoch": 0.8077378426072777, + "grad_norm": 0.5236937403678894, + "learning_rate": 9.379715386783794e-06, + "loss": 0.9086, + "step": 9040 + }, + { + "epoch": 0.8078271941385395, + "grad_norm": 0.4477667808532715, + "learning_rate": 9.371279319975678e-06, + "loss": 0.9332, + "step": 9041 + }, + { + "epoch": 0.8079165456698014, + "grad_norm": 0.509158730506897, + "learning_rate": 9.362846656256008e-06, + "loss": 0.9012, + "step": 9042 + }, + { + "epoch": 0.8080058972010633, + "grad_norm": 0.43936219811439514, + "learning_rate": 9.354417396331073e-06, + "loss": 0.9957, + "step": 9043 + }, + { + "epoch": 0.8080952487323252, + "grad_norm": 0.5267393589019775, + "learning_rate": 9.345991540906934e-06, + "loss": 0.8568, + "step": 9044 + }, + { + "epoch": 0.8081846002635871, + "grad_norm": 0.5685756802558899, + "learning_rate": 9.337569090689346e-06, + "loss": 0.9036, + "step": 9045 + }, + { + "epoch": 0.8082739517948488, + "grad_norm": 0.5792932510375977, + "learning_rate": 9.329150046383772e-06, + "loss": 0.8976, + "step": 9046 + }, + { + "epoch": 0.8083633033261107, + "grad_norm": 0.4724472761154175, + "learning_rate": 9.320734408695403e-06, + "loss": 0.8995, + "step": 9047 + }, + { + "epoch": 0.8084526548573726, + "grad_norm": 0.4375053346157074, + "learning_rate": 9.312322178329142e-06, + "loss": 1.06, + "step": 9048 + }, + { + "epoch": 0.8085420063886345, + "grad_norm": 0.6214897036552429, + "learning_rate": 9.303913355989596e-06, + "loss": 0.9019, + "step": 9049 + }, + { + "epoch": 0.8086313579198964, + "grad_norm": 0.49101778864860535, + "learning_rate": 9.295507942381103e-06, + "loss": 0.9121, + "step": 9050 + }, + { + "epoch": 0.8087207094511583, + "grad_norm": 0.4864928424358368, + "learning_rate": 9.28710593820769e-06, + "loss": 0.8843, + "step": 9051 + }, + { + "epoch": 0.80881006098242, + "grad_norm": 0.4251103401184082, + "learning_rate": 9.278707344173126e-06, + "loss": 0.9909, + "step": 9052 + }, + { + "epoch": 0.8088994125136819, + "grad_norm": 0.5644087195396423, + "learning_rate": 9.270312160980876e-06, + "loss": 0.9755, + "step": 9053 + }, + { + "epoch": 0.8089887640449438, + "grad_norm": 0.42497533559799194, + "learning_rate": 9.261920389334139e-06, + "loss": 0.9219, + "step": 9054 + }, + { + "epoch": 0.8090781155762057, + "grad_norm": 0.37208980321884155, + "learning_rate": 9.253532029935796e-06, + "loss": 0.9487, + "step": 9055 + }, + { + "epoch": 0.8091674671074676, + "grad_norm": 0.5022962689399719, + "learning_rate": 9.245147083488476e-06, + "loss": 0.9539, + "step": 9056 + }, + { + "epoch": 0.8092568186387294, + "grad_norm": 0.4653746485710144, + "learning_rate": 9.236765550694498e-06, + "loss": 1.0132, + "step": 9057 + }, + { + "epoch": 0.8093461701699913, + "grad_norm": 0.4346064329147339, + "learning_rate": 9.228387432255915e-06, + "loss": 0.9706, + "step": 9058 + }, + { + "epoch": 0.8094355217012531, + "grad_norm": 0.6596688032150269, + "learning_rate": 9.220012728874473e-06, + "loss": 0.8905, + "step": 9059 + }, + { + "epoch": 0.809524873232515, + "grad_norm": 0.5736172795295715, + "learning_rate": 9.21164144125165e-06, + "loss": 0.9162, + "step": 9060 + }, + { + "epoch": 0.8096142247637769, + "grad_norm": 0.43389225006103516, + "learning_rate": 9.203273570088634e-06, + "loss": 0.9322, + "step": 9061 + }, + { + "epoch": 0.8097035762950388, + "grad_norm": 0.516688346862793, + "learning_rate": 9.194909116086325e-06, + "loss": 0.9036, + "step": 9062 + }, + { + "epoch": 0.8097929278263006, + "grad_norm": 0.4250909984111786, + "learning_rate": 9.186548079945318e-06, + "loss": 0.9199, + "step": 9063 + }, + { + "epoch": 0.8098822793575625, + "grad_norm": 0.5514670014381409, + "learning_rate": 9.178190462365949e-06, + "loss": 0.9008, + "step": 9064 + }, + { + "epoch": 0.8099716308888244, + "grad_norm": 0.529180109500885, + "learning_rate": 9.169836264048259e-06, + "loss": 0.8523, + "step": 9065 + }, + { + "epoch": 0.8100609824200862, + "grad_norm": 0.4616588056087494, + "learning_rate": 9.161485485692001e-06, + "loss": 0.9791, + "step": 9066 + }, + { + "epoch": 0.8101503339513481, + "grad_norm": 0.5188908576965332, + "learning_rate": 9.15313812799664e-06, + "loss": 0.9484, + "step": 9067 + }, + { + "epoch": 0.8102396854826099, + "grad_norm": 0.43526217341423035, + "learning_rate": 9.144794191661355e-06, + "loss": 0.9121, + "step": 9068 + }, + { + "epoch": 0.8103290370138718, + "grad_norm": 0.44872915744781494, + "learning_rate": 9.136453677385043e-06, + "loss": 1.0135, + "step": 9069 + }, + { + "epoch": 0.8104183885451337, + "grad_norm": 0.4532172679901123, + "learning_rate": 9.128116585866308e-06, + "loss": 0.9565, + "step": 9070 + }, + { + "epoch": 0.8105077400763956, + "grad_norm": 0.5452584028244019, + "learning_rate": 9.119782917803476e-06, + "loss": 0.9311, + "step": 9071 + }, + { + "epoch": 0.8105970916076575, + "grad_norm": 0.5876386165618896, + "learning_rate": 9.111452673894588e-06, + "loss": 0.8881, + "step": 9072 + }, + { + "epoch": 0.8106864431389192, + "grad_norm": 0.4356098175048828, + "learning_rate": 9.103125854837362e-06, + "loss": 0.9527, + "step": 9073 + }, + { + "epoch": 0.8107757946701811, + "grad_norm": 0.47350767254829407, + "learning_rate": 9.094802461329277e-06, + "loss": 0.9068, + "step": 9074 + }, + { + "epoch": 0.810865146201443, + "grad_norm": 0.4826947748661041, + "learning_rate": 9.086482494067506e-06, + "loss": 0.8969, + "step": 9075 + }, + { + "epoch": 0.8109544977327049, + "grad_norm": 0.5265054702758789, + "learning_rate": 9.078165953748936e-06, + "loss": 0.9365, + "step": 9076 + }, + { + "epoch": 0.8110438492639668, + "grad_norm": 0.41897475719451904, + "learning_rate": 9.069852841070159e-06, + "loss": 0.9803, + "step": 9077 + }, + { + "epoch": 0.8111332007952287, + "grad_norm": 0.4707808196544647, + "learning_rate": 9.061543156727486e-06, + "loss": 0.8801, + "step": 9078 + }, + { + "epoch": 0.8112225523264905, + "grad_norm": 0.5158072710037231, + "learning_rate": 9.05323690141695e-06, + "loss": 0.9694, + "step": 9079 + }, + { + "epoch": 0.8113119038577523, + "grad_norm": 0.47056683897972107, + "learning_rate": 9.044934075834289e-06, + "loss": 0.9452, + "step": 9080 + }, + { + "epoch": 0.8114012553890142, + "grad_norm": 0.5674601793289185, + "learning_rate": 9.036634680674943e-06, + "loss": 0.8907, + "step": 9081 + }, + { + "epoch": 0.8114906069202761, + "grad_norm": 0.44375982880592346, + "learning_rate": 9.028338716634094e-06, + "loss": 1.0265, + "step": 9082 + }, + { + "epoch": 0.811579958451538, + "grad_norm": 0.4866836965084076, + "learning_rate": 9.020046184406593e-06, + "loss": 0.9754, + "step": 9083 + }, + { + "epoch": 0.8116693099827998, + "grad_norm": 0.42839664220809937, + "learning_rate": 9.01175708468704e-06, + "loss": 0.9104, + "step": 9084 + }, + { + "epoch": 0.8117586615140617, + "grad_norm": 0.4285668432712555, + "learning_rate": 9.003471418169734e-06, + "loss": 0.9743, + "step": 9085 + }, + { + "epoch": 0.8118480130453236, + "grad_norm": 0.6068586111068726, + "learning_rate": 8.995189185548686e-06, + "loss": 0.978, + "step": 9086 + }, + { + "epoch": 0.8119373645765854, + "grad_norm": 0.4698917865753174, + "learning_rate": 8.986910387517639e-06, + "loss": 0.9434, + "step": 9087 + }, + { + "epoch": 0.8120267161078473, + "grad_norm": 0.4826454222202301, + "learning_rate": 8.97863502477e-06, + "loss": 0.9581, + "step": 9088 + }, + { + "epoch": 0.8121160676391092, + "grad_norm": 0.5964739322662354, + "learning_rate": 8.970363097998936e-06, + "loss": 0.8737, + "step": 9089 + }, + { + "epoch": 0.812205419170371, + "grad_norm": 0.48209187388420105, + "learning_rate": 8.962094607897303e-06, + "loss": 0.8962, + "step": 9090 + }, + { + "epoch": 0.8122947707016329, + "grad_norm": 0.5253683924674988, + "learning_rate": 8.953829555157684e-06, + "loss": 0.874, + "step": 9091 + }, + { + "epoch": 0.8123841222328948, + "grad_norm": 0.3659655451774597, + "learning_rate": 8.945567940472371e-06, + "loss": 1.0097, + "step": 9092 + }, + { + "epoch": 0.8124734737641567, + "grad_norm": 0.5746763944625854, + "learning_rate": 8.937309764533335e-06, + "loss": 0.8948, + "step": 9093 + }, + { + "epoch": 0.8125628252954185, + "grad_norm": 0.5120271444320679, + "learning_rate": 8.929055028032312e-06, + "loss": 0.9083, + "step": 9094 + }, + { + "epoch": 0.8126521768266803, + "grad_norm": 0.4482232928276062, + "learning_rate": 8.92080373166071e-06, + "loss": 0.8775, + "step": 9095 + }, + { + "epoch": 0.8127415283579422, + "grad_norm": 0.7028616070747375, + "learning_rate": 8.912555876109668e-06, + "loss": 0.8673, + "step": 9096 + }, + { + "epoch": 0.8128308798892041, + "grad_norm": 0.5362510681152344, + "learning_rate": 8.904311462070031e-06, + "loss": 0.8818, + "step": 9097 + }, + { + "epoch": 0.812920231420466, + "grad_norm": 0.42716673016548157, + "learning_rate": 8.89607049023236e-06, + "loss": 0.9894, + "step": 9098 + }, + { + "epoch": 0.8130095829517279, + "grad_norm": 0.6123215556144714, + "learning_rate": 8.887832961286923e-06, + "loss": 0.9649, + "step": 9099 + }, + { + "epoch": 0.8130989344829898, + "grad_norm": 0.4452812671661377, + "learning_rate": 8.8795988759237e-06, + "loss": 0.9172, + "step": 9100 + }, + { + "epoch": 0.8131882860142515, + "grad_norm": 0.49280375242233276, + "learning_rate": 8.871368234832378e-06, + "loss": 0.9344, + "step": 9101 + }, + { + "epoch": 0.8132776375455134, + "grad_norm": 0.4815100431442261, + "learning_rate": 8.863141038702382e-06, + "loss": 0.8467, + "step": 9102 + }, + { + "epoch": 0.8133669890767753, + "grad_norm": 0.48964032530784607, + "learning_rate": 8.854917288222803e-06, + "loss": 0.9917, + "step": 9103 + }, + { + "epoch": 0.8134563406080372, + "grad_norm": 0.45770344138145447, + "learning_rate": 8.84669698408247e-06, + "loss": 0.9124, + "step": 9104 + }, + { + "epoch": 0.8135456921392991, + "grad_norm": 0.49135828018188477, + "learning_rate": 8.838480126969928e-06, + "loss": 0.8979, + "step": 9105 + }, + { + "epoch": 0.8136350436705609, + "grad_norm": 0.44308167695999146, + "learning_rate": 8.830266717573427e-06, + "loss": 1.0013, + "step": 9106 + }, + { + "epoch": 0.8137243952018228, + "grad_norm": 0.4538305699825287, + "learning_rate": 8.822056756580926e-06, + "loss": 0.9082, + "step": 9107 + }, + { + "epoch": 0.8138137467330846, + "grad_norm": 0.5253844857215881, + "learning_rate": 8.8138502446801e-06, + "loss": 0.8479, + "step": 9108 + }, + { + "epoch": 0.8139030982643465, + "grad_norm": 0.47331321239471436, + "learning_rate": 8.805647182558324e-06, + "loss": 0.8898, + "step": 9109 + }, + { + "epoch": 0.8139924497956084, + "grad_norm": 0.5414149761199951, + "learning_rate": 8.797447570902695e-06, + "loss": 0.9196, + "step": 9110 + }, + { + "epoch": 0.8140818013268702, + "grad_norm": 0.45004722476005554, + "learning_rate": 8.789251410400023e-06, + "loss": 0.94, + "step": 9111 + }, + { + "epoch": 0.8141711528581321, + "grad_norm": 0.4916594624519348, + "learning_rate": 8.781058701736822e-06, + "loss": 0.9332, + "step": 9112 + }, + { + "epoch": 0.814260504389394, + "grad_norm": 0.4902633726596832, + "learning_rate": 8.77286944559933e-06, + "loss": 0.8747, + "step": 9113 + }, + { + "epoch": 0.8143498559206558, + "grad_norm": 0.5728904008865356, + "learning_rate": 8.764683642673455e-06, + "loss": 0.8908, + "step": 9114 + }, + { + "epoch": 0.8144392074519177, + "grad_norm": 0.5360121726989746, + "learning_rate": 8.756501293644865e-06, + "loss": 0.8705, + "step": 9115 + }, + { + "epoch": 0.8145285589831796, + "grad_norm": 0.5026247501373291, + "learning_rate": 8.748322399198916e-06, + "loss": 1.0092, + "step": 9116 + }, + { + "epoch": 0.8146179105144414, + "grad_norm": 0.46945619583129883, + "learning_rate": 8.740146960020673e-06, + "loss": 0.9574, + "step": 9117 + }, + { + "epoch": 0.8147072620457033, + "grad_norm": 0.4218684434890747, + "learning_rate": 8.731974976794926e-06, + "loss": 0.9714, + "step": 9118 + }, + { + "epoch": 0.8147966135769652, + "grad_norm": 0.5485289692878723, + "learning_rate": 8.723806450206157e-06, + "loss": 0.9748, + "step": 9119 + }, + { + "epoch": 0.8148859651082271, + "grad_norm": 0.5682856440544128, + "learning_rate": 8.715641380938583e-06, + "loss": 0.9386, + "step": 9120 + }, + { + "epoch": 0.8149753166394889, + "grad_norm": 0.49733036756515503, + "learning_rate": 8.707479769676091e-06, + "loss": 0.9663, + "step": 9121 + }, + { + "epoch": 0.8150646681707507, + "grad_norm": 0.42391541600227356, + "learning_rate": 8.699321617102312e-06, + "loss": 0.92, + "step": 9122 + }, + { + "epoch": 0.8151540197020126, + "grad_norm": 0.48684948682785034, + "learning_rate": 8.691166923900585e-06, + "loss": 0.96, + "step": 9123 + }, + { + "epoch": 0.8152433712332745, + "grad_norm": 0.4087164103984833, + "learning_rate": 8.68301569075396e-06, + "loss": 0.9743, + "step": 9124 + }, + { + "epoch": 0.8153327227645364, + "grad_norm": 0.5374601483345032, + "learning_rate": 8.67486791834517e-06, + "loss": 0.9377, + "step": 9125 + }, + { + "epoch": 0.8154220742957983, + "grad_norm": 0.4423445463180542, + "learning_rate": 8.66672360735668e-06, + "loss": 0.9455, + "step": 9126 + }, + { + "epoch": 0.8155114258270602, + "grad_norm": 0.47707852721214294, + "learning_rate": 8.658582758470673e-06, + "loss": 0.926, + "step": 9127 + }, + { + "epoch": 0.8156007773583219, + "grad_norm": 0.4866050183773041, + "learning_rate": 8.650445372369025e-06, + "loss": 0.9738, + "step": 9128 + }, + { + "epoch": 0.8156901288895838, + "grad_norm": 0.5553078651428223, + "learning_rate": 8.642311449733331e-06, + "loss": 0.8595, + "step": 9129 + }, + { + "epoch": 0.8157794804208457, + "grad_norm": 0.4209044277667999, + "learning_rate": 8.634180991244894e-06, + "loss": 0.9365, + "step": 9130 + }, + { + "epoch": 0.8158688319521076, + "grad_norm": 0.4722152352333069, + "learning_rate": 8.626053997584732e-06, + "loss": 0.9841, + "step": 9131 + }, + { + "epoch": 0.8159581834833695, + "grad_norm": 0.5327540040016174, + "learning_rate": 8.61793046943356e-06, + "loss": 0.9321, + "step": 9132 + }, + { + "epoch": 0.8160475350146313, + "grad_norm": 0.4482969045639038, + "learning_rate": 8.609810407471824e-06, + "loss": 0.9341, + "step": 9133 + }, + { + "epoch": 0.8161368865458932, + "grad_norm": 0.46904319524765015, + "learning_rate": 8.60169381237964e-06, + "loss": 1.0379, + "step": 9134 + }, + { + "epoch": 0.816226238077155, + "grad_norm": 0.5137619972229004, + "learning_rate": 8.593580684836877e-06, + "loss": 0.8599, + "step": 9135 + }, + { + "epoch": 0.8163155896084169, + "grad_norm": 0.5669674873352051, + "learning_rate": 8.58547102552309e-06, + "loss": 0.857, + "step": 9136 + }, + { + "epoch": 0.8164049411396788, + "grad_norm": 0.45517855882644653, + "learning_rate": 8.577364835117552e-06, + "loss": 1.0162, + "step": 9137 + }, + { + "epoch": 0.8164942926709406, + "grad_norm": 0.5764082074165344, + "learning_rate": 8.569262114299243e-06, + "loss": 0.9356, + "step": 9138 + }, + { + "epoch": 0.8165836442022025, + "grad_norm": 0.69561767578125, + "learning_rate": 8.561162863746847e-06, + "loss": 0.8668, + "step": 9139 + }, + { + "epoch": 0.8166729957334644, + "grad_norm": 0.6161789298057556, + "learning_rate": 8.553067084138772e-06, + "loss": 0.905, + "step": 9140 + }, + { + "epoch": 0.8167623472647263, + "grad_norm": 0.5391383171081543, + "learning_rate": 8.544974776153124e-06, + "loss": 0.9331, + "step": 9141 + }, + { + "epoch": 0.8168516987959881, + "grad_norm": 0.504325270652771, + "learning_rate": 8.536885940467715e-06, + "loss": 0.9957, + "step": 9142 + }, + { + "epoch": 0.81694105032725, + "grad_norm": 0.4798524081707001, + "learning_rate": 8.528800577760083e-06, + "loss": 0.9162, + "step": 9143 + }, + { + "epoch": 0.8170304018585118, + "grad_norm": 0.48512718081474304, + "learning_rate": 8.520718688707447e-06, + "loss": 0.9431, + "step": 9144 + }, + { + "epoch": 0.8171197533897737, + "grad_norm": 0.5134716033935547, + "learning_rate": 8.512640273986756e-06, + "loss": 0.9639, + "step": 9145 + }, + { + "epoch": 0.8172091049210356, + "grad_norm": 0.3981763422489166, + "learning_rate": 8.504565334274662e-06, + "loss": 0.9315, + "step": 9146 + }, + { + "epoch": 0.8172984564522975, + "grad_norm": 0.43053102493286133, + "learning_rate": 8.496493870247535e-06, + "loss": 0.9206, + "step": 9147 + }, + { + "epoch": 0.8173878079835594, + "grad_norm": 0.5616499185562134, + "learning_rate": 8.488425882581436e-06, + "loss": 0.9045, + "step": 9148 + }, + { + "epoch": 0.8174771595148211, + "grad_norm": 0.4507736265659332, + "learning_rate": 8.480361371952156e-06, + "loss": 0.9098, + "step": 9149 + }, + { + "epoch": 0.817566511046083, + "grad_norm": 0.4418569505214691, + "learning_rate": 8.472300339035178e-06, + "loss": 0.9385, + "step": 9150 + }, + { + "epoch": 0.8176558625773449, + "grad_norm": 0.39274096488952637, + "learning_rate": 8.4642427845057e-06, + "loss": 0.9688, + "step": 9151 + }, + { + "epoch": 0.8177452141086068, + "grad_norm": 0.48565006256103516, + "learning_rate": 8.456188709038632e-06, + "loss": 0.9807, + "step": 9152 + }, + { + "epoch": 0.8178345656398687, + "grad_norm": 0.5166711211204529, + "learning_rate": 8.44813811330859e-06, + "loss": 1.0159, + "step": 9153 + }, + { + "epoch": 0.8179239171711306, + "grad_norm": 0.5695293545722961, + "learning_rate": 8.440090997989885e-06, + "loss": 0.936, + "step": 9154 + }, + { + "epoch": 0.8180132687023924, + "grad_norm": 0.4986543655395508, + "learning_rate": 8.432047363756551e-06, + "loss": 0.9075, + "step": 9155 + }, + { + "epoch": 0.8181026202336542, + "grad_norm": 0.5056511163711548, + "learning_rate": 8.424007211282337e-06, + "loss": 0.9455, + "step": 9156 + }, + { + "epoch": 0.8181919717649161, + "grad_norm": 0.5469470620155334, + "learning_rate": 8.415970541240697e-06, + "loss": 0.8948, + "step": 9157 + }, + { + "epoch": 0.818281323296178, + "grad_norm": 0.5168966054916382, + "learning_rate": 8.407937354304769e-06, + "loss": 0.8955, + "step": 9158 + }, + { + "epoch": 0.8183706748274399, + "grad_norm": 0.4793681800365448, + "learning_rate": 8.39990765114742e-06, + "loss": 0.9188, + "step": 9159 + }, + { + "epoch": 0.8184600263587017, + "grad_norm": 0.5143187046051025, + "learning_rate": 8.39188143244124e-06, + "loss": 0.9672, + "step": 9160 + }, + { + "epoch": 0.8185493778899636, + "grad_norm": 0.5519688725471497, + "learning_rate": 8.383858698858494e-06, + "loss": 0.9495, + "step": 9161 + }, + { + "epoch": 0.8186387294212255, + "grad_norm": 0.45014795660972595, + "learning_rate": 8.375839451071183e-06, + "loss": 0.8814, + "step": 9162 + }, + { + "epoch": 0.8187280809524873, + "grad_norm": 0.4443376064300537, + "learning_rate": 8.367823689751009e-06, + "loss": 0.9321, + "step": 9163 + }, + { + "epoch": 0.8188174324837492, + "grad_norm": 0.5371510982513428, + "learning_rate": 8.359811415569352e-06, + "loss": 0.9393, + "step": 9164 + }, + { + "epoch": 0.818906784015011, + "grad_norm": 0.5210739374160767, + "learning_rate": 8.351802629197348e-06, + "loss": 0.949, + "step": 9165 + }, + { + "epoch": 0.8189961355462729, + "grad_norm": 0.4387337565422058, + "learning_rate": 8.343797331305809e-06, + "loss": 0.9241, + "step": 9166 + }, + { + "epoch": 0.8190854870775348, + "grad_norm": 0.4759118854999542, + "learning_rate": 8.335795522565264e-06, + "loss": 0.9132, + "step": 9167 + }, + { + "epoch": 0.8191748386087967, + "grad_norm": 0.5519284605979919, + "learning_rate": 8.327797203645954e-06, + "loss": 0.9745, + "step": 9168 + }, + { + "epoch": 0.8192641901400586, + "grad_norm": 0.4415756165981293, + "learning_rate": 8.319802375217821e-06, + "loss": 1.0214, + "step": 9169 + }, + { + "epoch": 0.8193535416713204, + "grad_norm": 0.5409457683563232, + "learning_rate": 8.311811037950522e-06, + "loss": 0.979, + "step": 9170 + }, + { + "epoch": 0.8194428932025822, + "grad_norm": 0.49359166622161865, + "learning_rate": 8.30382319251341e-06, + "loss": 0.8628, + "step": 9171 + }, + { + "epoch": 0.8195322447338441, + "grad_norm": 0.45984289050102234, + "learning_rate": 8.295838839575559e-06, + "loss": 0.9556, + "step": 9172 + }, + { + "epoch": 0.819621596265106, + "grad_norm": 0.520918607711792, + "learning_rate": 8.287857979805746e-06, + "loss": 0.9423, + "step": 9173 + }, + { + "epoch": 0.8197109477963679, + "grad_norm": 0.4687787890434265, + "learning_rate": 8.279880613872438e-06, + "loss": 0.9551, + "step": 9174 + }, + { + "epoch": 0.8198002993276298, + "grad_norm": 0.4171432852745056, + "learning_rate": 8.271906742443836e-06, + "loss": 0.9329, + "step": 9175 + }, + { + "epoch": 0.8198896508588915, + "grad_norm": 0.5635870099067688, + "learning_rate": 8.263936366187824e-06, + "loss": 0.8315, + "step": 9176 + }, + { + "epoch": 0.8199790023901534, + "grad_norm": 0.5225967764854431, + "learning_rate": 8.255969485772025e-06, + "loss": 0.9817, + "step": 9177 + }, + { + "epoch": 0.8200683539214153, + "grad_norm": 0.47580423951148987, + "learning_rate": 8.248006101863736e-06, + "loss": 0.9601, + "step": 9178 + }, + { + "epoch": 0.8201577054526772, + "grad_norm": 0.5381955504417419, + "learning_rate": 8.240046215129982e-06, + "loss": 0.9547, + "step": 9179 + }, + { + "epoch": 0.8202470569839391, + "grad_norm": 0.5118776559829712, + "learning_rate": 8.232089826237487e-06, + "loss": 0.9135, + "step": 9180 + }, + { + "epoch": 0.820336408515201, + "grad_norm": 0.6581920981407166, + "learning_rate": 8.224136935852683e-06, + "loss": 0.8308, + "step": 9181 + }, + { + "epoch": 0.8204257600464628, + "grad_norm": 0.45607322454452515, + "learning_rate": 8.216187544641706e-06, + "loss": 0.881, + "step": 9182 + }, + { + "epoch": 0.8205151115777246, + "grad_norm": 0.4387778639793396, + "learning_rate": 8.208241653270421e-06, + "loss": 0.9486, + "step": 9183 + }, + { + "epoch": 0.8206044631089865, + "grad_norm": 0.5159300565719604, + "learning_rate": 8.200299262404353e-06, + "loss": 1.005, + "step": 9184 + }, + { + "epoch": 0.8206938146402484, + "grad_norm": 0.6031007766723633, + "learning_rate": 8.19236037270878e-06, + "loss": 0.8316, + "step": 9185 + }, + { + "epoch": 0.8207831661715103, + "grad_norm": 0.4924890995025635, + "learning_rate": 8.184424984848655e-06, + "loss": 0.9564, + "step": 9186 + }, + { + "epoch": 0.8208725177027721, + "grad_norm": 0.4664970934391022, + "learning_rate": 8.176493099488663e-06, + "loss": 0.9416, + "step": 9187 + }, + { + "epoch": 0.820961869234034, + "grad_norm": 0.46760138869285583, + "learning_rate": 8.168564717293182e-06, + "loss": 0.9566, + "step": 9188 + }, + { + "epoch": 0.8210512207652959, + "grad_norm": 0.5146795511245728, + "learning_rate": 8.160639838926293e-06, + "loss": 0.8951, + "step": 9189 + }, + { + "epoch": 0.8211405722965577, + "grad_norm": 0.4736398458480835, + "learning_rate": 8.152718465051806e-06, + "loss": 1.0171, + "step": 9190 + }, + { + "epoch": 0.8212299238278196, + "grad_norm": 0.6582403182983398, + "learning_rate": 8.144800596333196e-06, + "loss": 0.9182, + "step": 9191 + }, + { + "epoch": 0.8213192753590814, + "grad_norm": 0.49453866481781006, + "learning_rate": 8.136886233433683e-06, + "loss": 0.8465, + "step": 9192 + }, + { + "epoch": 0.8214086268903433, + "grad_norm": 0.4479573965072632, + "learning_rate": 8.128975377016174e-06, + "loss": 0.8996, + "step": 9193 + }, + { + "epoch": 0.8214979784216052, + "grad_norm": 0.5152574777603149, + "learning_rate": 8.1210680277433e-06, + "loss": 0.9579, + "step": 9194 + }, + { + "epoch": 0.8215873299528671, + "grad_norm": 0.5516067743301392, + "learning_rate": 8.113164186277367e-06, + "loss": 0.9078, + "step": 9195 + }, + { + "epoch": 0.821676681484129, + "grad_norm": 0.4801537096500397, + "learning_rate": 8.105263853280416e-06, + "loss": 1.0086, + "step": 9196 + }, + { + "epoch": 0.8217660330153908, + "grad_norm": 0.4999988377094269, + "learning_rate": 8.097367029414182e-06, + "loss": 0.8929, + "step": 9197 + }, + { + "epoch": 0.8218553845466526, + "grad_norm": 0.45241791009902954, + "learning_rate": 8.089473715340107e-06, + "loss": 0.9495, + "step": 9198 + }, + { + "epoch": 0.8219447360779145, + "grad_norm": 0.4731530249118805, + "learning_rate": 8.081583911719343e-06, + "loss": 0.9343, + "step": 9199 + }, + { + "epoch": 0.8220340876091764, + "grad_norm": 0.5134366154670715, + "learning_rate": 8.073697619212745e-06, + "loss": 0.9878, + "step": 9200 + }, + { + "epoch": 0.8221234391404383, + "grad_norm": 0.4594072103500366, + "learning_rate": 8.065814838480879e-06, + "loss": 0.9403, + "step": 9201 + }, + { + "epoch": 0.8222127906717002, + "grad_norm": 0.49172157049179077, + "learning_rate": 8.057935570184e-06, + "loss": 0.9553, + "step": 9202 + }, + { + "epoch": 0.822302142202962, + "grad_norm": 0.44806215167045593, + "learning_rate": 8.050059814982092e-06, + "loss": 0.9714, + "step": 9203 + }, + { + "epoch": 0.8223914937342238, + "grad_norm": 0.4985756576061249, + "learning_rate": 8.042187573534836e-06, + "loss": 0.9834, + "step": 9204 + }, + { + "epoch": 0.8224808452654857, + "grad_norm": 0.5466139912605286, + "learning_rate": 8.034318846501598e-06, + "loss": 0.9727, + "step": 9205 + }, + { + "epoch": 0.8225701967967476, + "grad_norm": 0.4964464008808136, + "learning_rate": 8.026453634541481e-06, + "loss": 0.9444, + "step": 9206 + }, + { + "epoch": 0.8226595483280095, + "grad_norm": 0.49595358967781067, + "learning_rate": 8.018591938313275e-06, + "loss": 0.973, + "step": 9207 + }, + { + "epoch": 0.8227488998592714, + "grad_norm": 0.5283821821212769, + "learning_rate": 8.010733758475486e-06, + "loss": 0.956, + "step": 9208 + }, + { + "epoch": 0.8228382513905332, + "grad_norm": 0.4949239492416382, + "learning_rate": 8.002879095686317e-06, + "loss": 0.9192, + "step": 9209 + }, + { + "epoch": 0.8229276029217951, + "grad_norm": 0.48823899030685425, + "learning_rate": 7.995027950603683e-06, + "loss": 0.8878, + "step": 9210 + }, + { + "epoch": 0.8230169544530569, + "grad_norm": 0.483600378036499, + "learning_rate": 7.987180323885196e-06, + "loss": 0.9239, + "step": 9211 + }, + { + "epoch": 0.8231063059843188, + "grad_norm": 0.4983593225479126, + "learning_rate": 7.979336216188182e-06, + "loss": 0.9776, + "step": 9212 + }, + { + "epoch": 0.8231956575155807, + "grad_norm": 0.4743238389492035, + "learning_rate": 7.971495628169667e-06, + "loss": 0.9436, + "step": 9213 + }, + { + "epoch": 0.8232850090468425, + "grad_norm": 0.4517209529876709, + "learning_rate": 7.963658560486399e-06, + "loss": 0.9313, + "step": 9214 + }, + { + "epoch": 0.8233743605781044, + "grad_norm": 0.5047935247421265, + "learning_rate": 7.955825013794793e-06, + "loss": 0.9093, + "step": 9215 + }, + { + "epoch": 0.8234637121093663, + "grad_norm": 0.603001594543457, + "learning_rate": 7.947994988750995e-06, + "loss": 0.9476, + "step": 9216 + }, + { + "epoch": 0.8235530636406282, + "grad_norm": 0.4813896119594574, + "learning_rate": 7.940168486010862e-06, + "loss": 0.8947, + "step": 9217 + }, + { + "epoch": 0.82364241517189, + "grad_norm": 0.5081338882446289, + "learning_rate": 7.932345506229944e-06, + "loss": 0.8794, + "step": 9218 + }, + { + "epoch": 0.8237317667031518, + "grad_norm": 0.4924575984477997, + "learning_rate": 7.924526050063497e-06, + "loss": 0.9315, + "step": 9219 + }, + { + "epoch": 0.8238211182344137, + "grad_norm": 0.4447862207889557, + "learning_rate": 7.916710118166482e-06, + "loss": 0.9681, + "step": 9220 + }, + { + "epoch": 0.8239104697656756, + "grad_norm": 0.5768123269081116, + "learning_rate": 7.908897711193575e-06, + "loss": 0.9272, + "step": 9221 + }, + { + "epoch": 0.8239998212969375, + "grad_norm": 0.6067156791687012, + "learning_rate": 7.901088829799142e-06, + "loss": 0.9376, + "step": 9222 + }, + { + "epoch": 0.8240891728281994, + "grad_norm": 0.6196531057357788, + "learning_rate": 7.893283474637275e-06, + "loss": 0.8734, + "step": 9223 + }, + { + "epoch": 0.8241785243594613, + "grad_norm": 0.49695178866386414, + "learning_rate": 7.88548164636173e-06, + "loss": 0.9231, + "step": 9224 + }, + { + "epoch": 0.824267875890723, + "grad_norm": 0.4435543417930603, + "learning_rate": 7.877683345626008e-06, + "loss": 0.9304, + "step": 9225 + }, + { + "epoch": 0.8243572274219849, + "grad_norm": 0.4490625560283661, + "learning_rate": 7.869888573083294e-06, + "loss": 0.9596, + "step": 9226 + }, + { + "epoch": 0.8244465789532468, + "grad_norm": 0.4157819151878357, + "learning_rate": 7.862097329386497e-06, + "loss": 0.9515, + "step": 9227 + }, + { + "epoch": 0.8245359304845087, + "grad_norm": 0.4776933491230011, + "learning_rate": 7.854309615188198e-06, + "loss": 0.9247, + "step": 9228 + }, + { + "epoch": 0.8246252820157706, + "grad_norm": 0.4814663529396057, + "learning_rate": 7.84652543114071e-06, + "loss": 0.9378, + "step": 9229 + }, + { + "epoch": 0.8247146335470324, + "grad_norm": 0.45337748527526855, + "learning_rate": 7.83874477789604e-06, + "loss": 0.9357, + "step": 9230 + }, + { + "epoch": 0.8248039850782943, + "grad_norm": 0.44966641068458557, + "learning_rate": 7.830967656105903e-06, + "loss": 0.9343, + "step": 9231 + }, + { + "epoch": 0.8248933366095561, + "grad_norm": 0.4764994978904724, + "learning_rate": 7.823194066421707e-06, + "loss": 0.8798, + "step": 9232 + }, + { + "epoch": 0.824982688140818, + "grad_norm": 0.44345101714134216, + "learning_rate": 7.815424009494588e-06, + "loss": 0.9255, + "step": 9233 + }, + { + "epoch": 0.8250720396720799, + "grad_norm": 0.44141438603401184, + "learning_rate": 7.807657485975372e-06, + "loss": 0.9607, + "step": 9234 + }, + { + "epoch": 0.8251613912033418, + "grad_norm": 0.4870518147945404, + "learning_rate": 7.79989449651457e-06, + "loss": 0.8497, + "step": 9235 + }, + { + "epoch": 0.8252507427346036, + "grad_norm": 0.6152021288871765, + "learning_rate": 7.792135041762421e-06, + "loss": 0.8489, + "step": 9236 + }, + { + "epoch": 0.8253400942658655, + "grad_norm": 0.6551914811134338, + "learning_rate": 7.784379122368873e-06, + "loss": 0.9771, + "step": 9237 + }, + { + "epoch": 0.8254294457971274, + "grad_norm": 0.4421270787715912, + "learning_rate": 7.776626738983556e-06, + "loss": 0.9964, + "step": 9238 + }, + { + "epoch": 0.8255187973283892, + "grad_norm": 0.55922931432724, + "learning_rate": 7.768877892255816e-06, + "loss": 0.968, + "step": 9239 + }, + { + "epoch": 0.8256081488596511, + "grad_norm": 0.4829465448856354, + "learning_rate": 7.761132582834708e-06, + "loss": 0.967, + "step": 9240 + }, + { + "epoch": 0.8256975003909129, + "grad_norm": 0.5545680522918701, + "learning_rate": 7.753390811368971e-06, + "loss": 0.9614, + "step": 9241 + }, + { + "epoch": 0.8257868519221748, + "grad_norm": 0.5379648804664612, + "learning_rate": 7.745652578507079e-06, + "loss": 0.846, + "step": 9242 + }, + { + "epoch": 0.8258762034534367, + "grad_norm": 0.6437764167785645, + "learning_rate": 7.737917884897177e-06, + "loss": 0.9404, + "step": 9243 + }, + { + "epoch": 0.8259655549846986, + "grad_norm": 0.4301976263523102, + "learning_rate": 7.73018673118715e-06, + "loss": 0.9681, + "step": 9244 + }, + { + "epoch": 0.8260549065159604, + "grad_norm": 0.46342891454696655, + "learning_rate": 7.722459118024538e-06, + "loss": 0.9234, + "step": 9245 + }, + { + "epoch": 0.8261442580472222, + "grad_norm": 0.45039889216423035, + "learning_rate": 7.714735046056615e-06, + "loss": 0.9904, + "step": 9246 + }, + { + "epoch": 0.8262336095784841, + "grad_norm": 0.5035346746444702, + "learning_rate": 7.707014515930366e-06, + "loss": 0.891, + "step": 9247 + }, + { + "epoch": 0.826322961109746, + "grad_norm": 0.41054171323776245, + "learning_rate": 7.69929752829246e-06, + "loss": 0.943, + "step": 9248 + }, + { + "epoch": 0.8264123126410079, + "grad_norm": 0.4819393754005432, + "learning_rate": 7.691584083789277e-06, + "loss": 0.9219, + "step": 9249 + }, + { + "epoch": 0.8265016641722698, + "grad_norm": 0.4592518210411072, + "learning_rate": 7.683874183066903e-06, + "loss": 0.9504, + "step": 9250 + }, + { + "epoch": 0.8265910157035317, + "grad_norm": 0.5692583322525024, + "learning_rate": 7.676167826771124e-06, + "loss": 0.8824, + "step": 9251 + }, + { + "epoch": 0.8266803672347934, + "grad_norm": 0.48362407088279724, + "learning_rate": 7.668465015547427e-06, + "loss": 0.8867, + "step": 9252 + }, + { + "epoch": 0.8267697187660553, + "grad_norm": 0.5431807041168213, + "learning_rate": 7.66076575004101e-06, + "loss": 0.8867, + "step": 9253 + }, + { + "epoch": 0.8268590702973172, + "grad_norm": 0.6217208504676819, + "learning_rate": 7.653070030896774e-06, + "loss": 0.9971, + "step": 9254 + }, + { + "epoch": 0.8269484218285791, + "grad_norm": 0.4189354181289673, + "learning_rate": 7.6453778587593e-06, + "loss": 0.9719, + "step": 9255 + }, + { + "epoch": 0.827037773359841, + "grad_norm": 0.50577312707901, + "learning_rate": 7.637689234272899e-06, + "loss": 0.9394, + "step": 9256 + }, + { + "epoch": 0.8271271248911028, + "grad_norm": 0.5943676829338074, + "learning_rate": 7.630004158081572e-06, + "loss": 0.8626, + "step": 9257 + }, + { + "epoch": 0.8272164764223647, + "grad_norm": 0.42551112174987793, + "learning_rate": 7.622322630829032e-06, + "loss": 0.9436, + "step": 9258 + }, + { + "epoch": 0.8273058279536265, + "grad_norm": 0.6228631138801575, + "learning_rate": 7.6146446531586815e-06, + "loss": 0.9074, + "step": 9259 + }, + { + "epoch": 0.8273951794848884, + "grad_norm": 0.4399883449077606, + "learning_rate": 7.60697022571365e-06, + "loss": 0.9957, + "step": 9260 + }, + { + "epoch": 0.8274845310161503, + "grad_norm": 0.5171003341674805, + "learning_rate": 7.599299349136723e-06, + "loss": 0.9123, + "step": 9261 + }, + { + "epoch": 0.8275738825474122, + "grad_norm": 0.4070376753807068, + "learning_rate": 7.591632024070439e-06, + "loss": 0.9231, + "step": 9262 + }, + { + "epoch": 0.827663234078674, + "grad_norm": 0.47427892684936523, + "learning_rate": 7.583968251157014e-06, + "loss": 0.9726, + "step": 9263 + }, + { + "epoch": 0.8277525856099359, + "grad_norm": 0.5160714983940125, + "learning_rate": 7.576308031038381e-06, + "loss": 0.9672, + "step": 9264 + }, + { + "epoch": 0.8278419371411978, + "grad_norm": 0.5352666974067688, + "learning_rate": 7.568651364356144e-06, + "loss": 0.8548, + "step": 9265 + }, + { + "epoch": 0.8279312886724596, + "grad_norm": 0.4556531310081482, + "learning_rate": 7.560998251751639e-06, + "loss": 0.9801, + "step": 9266 + }, + { + "epoch": 0.8280206402037215, + "grad_norm": 0.46089810132980347, + "learning_rate": 7.553348693865897e-06, + "loss": 0.9177, + "step": 9267 + }, + { + "epoch": 0.8281099917349833, + "grad_norm": 0.4947889447212219, + "learning_rate": 7.545702691339657e-06, + "loss": 0.9936, + "step": 9268 + }, + { + "epoch": 0.8281993432662452, + "grad_norm": 0.46438440680503845, + "learning_rate": 7.538060244813339e-06, + "loss": 0.9395, + "step": 9269 + }, + { + "epoch": 0.8282886947975071, + "grad_norm": 0.46110087633132935, + "learning_rate": 7.530421354927092e-06, + "loss": 0.9576, + "step": 9270 + }, + { + "epoch": 0.828378046328769, + "grad_norm": 0.4900936484336853, + "learning_rate": 7.522786022320744e-06, + "loss": 0.9473, + "step": 9271 + }, + { + "epoch": 0.8284673978600309, + "grad_norm": 0.48400017619132996, + "learning_rate": 7.5151542476338485e-06, + "loss": 0.9329, + "step": 9272 + }, + { + "epoch": 0.8285567493912926, + "grad_norm": 0.49266934394836426, + "learning_rate": 7.507526031505635e-06, + "loss": 0.8979, + "step": 9273 + }, + { + "epoch": 0.8286461009225545, + "grad_norm": 0.5182574987411499, + "learning_rate": 7.499901374575069e-06, + "loss": 0.8894, + "step": 9274 + }, + { + "epoch": 0.8287354524538164, + "grad_norm": 0.5964707732200623, + "learning_rate": 7.492280277480768e-06, + "loss": 0.9323, + "step": 9275 + }, + { + "epoch": 0.8288248039850783, + "grad_norm": 0.45841360092163086, + "learning_rate": 7.4846627408610925e-06, + "loss": 0.9406, + "step": 9276 + }, + { + "epoch": 0.8289141555163402, + "grad_norm": 0.4415697753429413, + "learning_rate": 7.477048765354094e-06, + "loss": 0.9698, + "step": 9277 + }, + { + "epoch": 0.8290035070476021, + "grad_norm": 0.49116966128349304, + "learning_rate": 7.469438351597524e-06, + "loss": 0.9453, + "step": 9278 + }, + { + "epoch": 0.829092858578864, + "grad_norm": 0.5854166746139526, + "learning_rate": 7.4618315002288384e-06, + "loss": 0.9059, + "step": 9279 + }, + { + "epoch": 0.8291822101101257, + "grad_norm": 0.46509405970573425, + "learning_rate": 7.454228211885184e-06, + "loss": 0.9222, + "step": 9280 + }, + { + "epoch": 0.8292715616413876, + "grad_norm": 0.442025363445282, + "learning_rate": 7.446628487203422e-06, + "loss": 0.9237, + "step": 9281 + }, + { + "epoch": 0.8293609131726495, + "grad_norm": 0.6018460392951965, + "learning_rate": 7.439032326820117e-06, + "loss": 0.9529, + "step": 9282 + }, + { + "epoch": 0.8294502647039114, + "grad_norm": 0.48638513684272766, + "learning_rate": 7.4314397313715145e-06, + "loss": 0.8751, + "step": 9283 + }, + { + "epoch": 0.8295396162351732, + "grad_norm": 0.4461117684841156, + "learning_rate": 7.423850701493585e-06, + "loss": 0.9923, + "step": 9284 + }, + { + "epoch": 0.8296289677664351, + "grad_norm": 0.5634244084358215, + "learning_rate": 7.416265237822001e-06, + "loss": 0.9615, + "step": 9285 + }, + { + "epoch": 0.829718319297697, + "grad_norm": 0.5193483233451843, + "learning_rate": 7.408683340992101e-06, + "loss": 0.8622, + "step": 9286 + }, + { + "epoch": 0.8298076708289588, + "grad_norm": 0.5628817081451416, + "learning_rate": 7.401105011638965e-06, + "loss": 0.8551, + "step": 9287 + }, + { + "epoch": 0.8298970223602207, + "grad_norm": 0.5427822470664978, + "learning_rate": 7.3935302503973515e-06, + "loss": 0.8488, + "step": 9288 + }, + { + "epoch": 0.8299863738914826, + "grad_norm": 0.4602343440055847, + "learning_rate": 7.385959057901737e-06, + "loss": 1.054, + "step": 9289 + }, + { + "epoch": 0.8300757254227444, + "grad_norm": 0.5840815305709839, + "learning_rate": 7.378391434786281e-06, + "loss": 0.911, + "step": 9290 + }, + { + "epoch": 0.8301650769540063, + "grad_norm": 0.514077365398407, + "learning_rate": 7.370827381684853e-06, + "loss": 0.923, + "step": 9291 + }, + { + "epoch": 0.8302544284852682, + "grad_norm": 0.5028949975967407, + "learning_rate": 7.3632668992310305e-06, + "loss": 0.8987, + "step": 9292 + }, + { + "epoch": 0.8303437800165301, + "grad_norm": 0.3772270083427429, + "learning_rate": 7.35570998805809e-06, + "loss": 0.9484, + "step": 9293 + }, + { + "epoch": 0.8304331315477919, + "grad_norm": 0.5475994944572449, + "learning_rate": 7.348156648798981e-06, + "loss": 0.9564, + "step": 9294 + }, + { + "epoch": 0.8305224830790537, + "grad_norm": 0.5647354125976562, + "learning_rate": 7.340606882086393e-06, + "loss": 0.894, + "step": 9295 + }, + { + "epoch": 0.8306118346103156, + "grad_norm": 0.5140451788902283, + "learning_rate": 7.33306068855269e-06, + "loss": 0.9726, + "step": 9296 + }, + { + "epoch": 0.8307011861415775, + "grad_norm": 0.45875853300094604, + "learning_rate": 7.325518068829967e-06, + "loss": 0.9158, + "step": 9297 + }, + { + "epoch": 0.8307905376728394, + "grad_norm": 0.38240689039230347, + "learning_rate": 7.31797902354997e-06, + "loss": 0.9517, + "step": 9298 + }, + { + "epoch": 0.8308798892041013, + "grad_norm": 0.5880802869796753, + "learning_rate": 7.310443553344182e-06, + "loss": 0.9576, + "step": 9299 + }, + { + "epoch": 0.8309692407353632, + "grad_norm": 0.5717623829841614, + "learning_rate": 7.302911658843792e-06, + "loss": 0.9193, + "step": 9300 + }, + { + "epoch": 0.8310585922666249, + "grad_norm": 0.4772949814796448, + "learning_rate": 7.2953833406796675e-06, + "loss": 0.8271, + "step": 9301 + }, + { + "epoch": 0.8311479437978868, + "grad_norm": 0.43257272243499756, + "learning_rate": 7.287858599482383e-06, + "loss": 0.946, + "step": 9302 + }, + { + "epoch": 0.8312372953291487, + "grad_norm": 0.5265113115310669, + "learning_rate": 7.280337435882223e-06, + "loss": 0.8861, + "step": 9303 + }, + { + "epoch": 0.8313266468604106, + "grad_norm": 0.4444237947463989, + "learning_rate": 7.272819850509161e-06, + "loss": 0.9427, + "step": 9304 + }, + { + "epoch": 0.8314159983916725, + "grad_norm": 0.53780198097229, + "learning_rate": 7.265305843992881e-06, + "loss": 0.8518, + "step": 9305 + }, + { + "epoch": 0.8315053499229343, + "grad_norm": 0.5039278268814087, + "learning_rate": 7.257795416962753e-06, + "loss": 0.9144, + "step": 9306 + }, + { + "epoch": 0.8315947014541961, + "grad_norm": 0.5485106110572815, + "learning_rate": 7.250288570047853e-06, + "loss": 0.9448, + "step": 9307 + }, + { + "epoch": 0.831684052985458, + "grad_norm": 0.592823326587677, + "learning_rate": 7.242785303876965e-06, + "loss": 0.9934, + "step": 9308 + }, + { + "epoch": 0.8317734045167199, + "grad_norm": 0.42520976066589355, + "learning_rate": 7.235285619078569e-06, + "loss": 0.9649, + "step": 9309 + }, + { + "epoch": 0.8318627560479818, + "grad_norm": 0.4947132170200348, + "learning_rate": 7.2277895162808416e-06, + "loss": 0.9205, + "step": 9310 + }, + { + "epoch": 0.8319521075792436, + "grad_norm": 0.42473235726356506, + "learning_rate": 7.220296996111658e-06, + "loss": 0.9293, + "step": 9311 + }, + { + "epoch": 0.8320414591105055, + "grad_norm": 0.5703517198562622, + "learning_rate": 7.2128080591986e-06, + "loss": 0.8687, + "step": 9312 + }, + { + "epoch": 0.8321308106417674, + "grad_norm": 0.5139065980911255, + "learning_rate": 7.205322706168949e-06, + "loss": 0.8515, + "step": 9313 + }, + { + "epoch": 0.8322201621730292, + "grad_norm": 0.49325382709503174, + "learning_rate": 7.19784093764968e-06, + "loss": 0.9548, + "step": 9314 + }, + { + "epoch": 0.8323095137042911, + "grad_norm": 0.4829562306404114, + "learning_rate": 7.190362754267477e-06, + "loss": 0.923, + "step": 9315 + }, + { + "epoch": 0.832398865235553, + "grad_norm": 0.4533390402793884, + "learning_rate": 7.182888156648704e-06, + "loss": 0.9178, + "step": 9316 + }, + { + "epoch": 0.8324882167668148, + "grad_norm": 0.42005079984664917, + "learning_rate": 7.175417145419444e-06, + "loss": 0.912, + "step": 9317 + }, + { + "epoch": 0.8325775682980767, + "grad_norm": 0.5088595747947693, + "learning_rate": 7.16794972120548e-06, + "loss": 0.9998, + "step": 9318 + }, + { + "epoch": 0.8326669198293386, + "grad_norm": 0.4803122580051422, + "learning_rate": 7.160485884632278e-06, + "loss": 0.8923, + "step": 9319 + }, + { + "epoch": 0.8327562713606005, + "grad_norm": 0.439439058303833, + "learning_rate": 7.153025636325023e-06, + "loss": 0.9342, + "step": 9320 + }, + { + "epoch": 0.8328456228918623, + "grad_norm": 0.4299473464488983, + "learning_rate": 7.145568976908584e-06, + "loss": 0.9986, + "step": 9321 + }, + { + "epoch": 0.8329349744231241, + "grad_norm": 0.46970871090888977, + "learning_rate": 7.138115907007537e-06, + "loss": 0.9392, + "step": 9322 + }, + { + "epoch": 0.833024325954386, + "grad_norm": 0.500106692314148, + "learning_rate": 7.1306664272461635e-06, + "loss": 0.9441, + "step": 9323 + }, + { + "epoch": 0.8331136774856479, + "grad_norm": 0.5206024050712585, + "learning_rate": 7.123220538248426e-06, + "loss": 0.8643, + "step": 9324 + }, + { + "epoch": 0.8332030290169098, + "grad_norm": 0.4776027798652649, + "learning_rate": 7.115778240638016e-06, + "loss": 0.9586, + "step": 9325 + }, + { + "epoch": 0.8332923805481717, + "grad_norm": 0.45335862040519714, + "learning_rate": 7.1083395350382776e-06, + "loss": 0.941, + "step": 9326 + }, + { + "epoch": 0.8333817320794336, + "grad_norm": 0.47026875615119934, + "learning_rate": 7.100904422072296e-06, + "loss": 0.9022, + "step": 9327 + }, + { + "epoch": 0.8334710836106953, + "grad_norm": 0.4703599810600281, + "learning_rate": 7.093472902362841e-06, + "loss": 1.0376, + "step": 9328 + }, + { + "epoch": 0.8335604351419572, + "grad_norm": 0.48730385303497314, + "learning_rate": 7.086044976532386e-06, + "loss": 0.983, + "step": 9329 + }, + { + "epoch": 0.8336497866732191, + "grad_norm": 0.44514140486717224, + "learning_rate": 7.078620645203099e-06, + "loss": 0.8899, + "step": 9330 + }, + { + "epoch": 0.833739138204481, + "grad_norm": 0.4243552088737488, + "learning_rate": 7.0711999089968335e-06, + "loss": 0.9852, + "step": 9331 + }, + { + "epoch": 0.8338284897357429, + "grad_norm": 0.49389246106147766, + "learning_rate": 7.06378276853516e-06, + "loss": 0.8915, + "step": 9332 + }, + { + "epoch": 0.8339178412670047, + "grad_norm": 0.4797744154930115, + "learning_rate": 7.056369224439352e-06, + "loss": 0.9607, + "step": 9333 + }, + { + "epoch": 0.8340071927982666, + "grad_norm": 0.5037931203842163, + "learning_rate": 7.048959277330363e-06, + "loss": 0.9384, + "step": 9334 + }, + { + "epoch": 0.8340965443295284, + "grad_norm": 0.474246084690094, + "learning_rate": 7.0415529278288765e-06, + "loss": 1.0259, + "step": 9335 + }, + { + "epoch": 0.8341858958607903, + "grad_norm": 0.46430274844169617, + "learning_rate": 7.034150176555221e-06, + "loss": 0.9752, + "step": 9336 + }, + { + "epoch": 0.8342752473920522, + "grad_norm": 0.5320345759391785, + "learning_rate": 7.0267510241294745e-06, + "loss": 0.917, + "step": 9337 + }, + { + "epoch": 0.834364598923314, + "grad_norm": 0.4332819879055023, + "learning_rate": 7.01935547117139e-06, + "loss": 0.9781, + "step": 9338 + }, + { + "epoch": 0.8344539504545759, + "grad_norm": 0.4649805724620819, + "learning_rate": 7.011963518300424e-06, + "loss": 0.9405, + "step": 9339 + }, + { + "epoch": 0.8345433019858378, + "grad_norm": 0.4542677700519562, + "learning_rate": 7.004575166135735e-06, + "loss": 0.9736, + "step": 9340 + }, + { + "epoch": 0.8346326535170997, + "grad_norm": 0.4982972741127014, + "learning_rate": 6.997190415296173e-06, + "loss": 0.8426, + "step": 9341 + }, + { + "epoch": 0.8347220050483615, + "grad_norm": 0.49553072452545166, + "learning_rate": 6.989809266400288e-06, + "loss": 0.92, + "step": 9342 + }, + { + "epoch": 0.8348113565796234, + "grad_norm": 0.4943731427192688, + "learning_rate": 6.982431720066335e-06, + "loss": 0.9163, + "step": 9343 + }, + { + "epoch": 0.8349007081108852, + "grad_norm": 0.4164564311504364, + "learning_rate": 6.975057776912258e-06, + "loss": 0.9058, + "step": 9344 + }, + { + "epoch": 0.8349900596421471, + "grad_norm": 0.4946102797985077, + "learning_rate": 6.96768743755572e-06, + "loss": 0.9323, + "step": 9345 + }, + { + "epoch": 0.835079411173409, + "grad_norm": 0.5377085208892822, + "learning_rate": 6.960320702614037e-06, + "loss": 0.8814, + "step": 9346 + }, + { + "epoch": 0.8351687627046709, + "grad_norm": 0.5740790367126465, + "learning_rate": 6.952957572704266e-06, + "loss": 0.9291, + "step": 9347 + }, + { + "epoch": 0.8352581142359328, + "grad_norm": 0.5461665987968445, + "learning_rate": 6.945598048443147e-06, + "loss": 0.8508, + "step": 9348 + }, + { + "epoch": 0.8353474657671945, + "grad_norm": 0.48720571398735046, + "learning_rate": 6.93824213044712e-06, + "loss": 0.9276, + "step": 9349 + }, + { + "epoch": 0.8354368172984564, + "grad_norm": 0.4371801018714905, + "learning_rate": 6.93088981933232e-06, + "loss": 0.9428, + "step": 9350 + }, + { + "epoch": 0.8355261688297183, + "grad_norm": 0.661524772644043, + "learning_rate": 6.9235411157145765e-06, + "loss": 0.8726, + "step": 9351 + }, + { + "epoch": 0.8356155203609802, + "grad_norm": 0.4543387293815613, + "learning_rate": 6.916196020209431e-06, + "loss": 0.9342, + "step": 9352 + }, + { + "epoch": 0.8357048718922421, + "grad_norm": 0.40771013498306274, + "learning_rate": 6.908854533432107e-06, + "loss": 0.9474, + "step": 9353 + }, + { + "epoch": 0.835794223423504, + "grad_norm": 0.4102661609649658, + "learning_rate": 6.901516655997536e-06, + "loss": 0.9604, + "step": 9354 + }, + { + "epoch": 0.8358835749547658, + "grad_norm": 0.47201770544052124, + "learning_rate": 6.8941823885203485e-06, + "loss": 0.9408, + "step": 9355 + }, + { + "epoch": 0.8359729264860276, + "grad_norm": 0.508849561214447, + "learning_rate": 6.886851731614857e-06, + "loss": 0.8948, + "step": 9356 + }, + { + "epoch": 0.8360622780172895, + "grad_norm": 0.48625683784484863, + "learning_rate": 6.879524685895084e-06, + "loss": 0.9328, + "step": 9357 + }, + { + "epoch": 0.8361516295485514, + "grad_norm": 0.4217790961265564, + "learning_rate": 6.872201251974747e-06, + "loss": 1.0213, + "step": 9358 + }, + { + "epoch": 0.8362409810798133, + "grad_norm": 0.4759739637374878, + "learning_rate": 6.864881430467262e-06, + "loss": 0.9459, + "step": 9359 + }, + { + "epoch": 0.8363303326110751, + "grad_norm": 0.4266846776008606, + "learning_rate": 6.857565221985751e-06, + "loss": 0.9863, + "step": 9360 + }, + { + "epoch": 0.836419684142337, + "grad_norm": 0.5146978497505188, + "learning_rate": 6.850252627143017e-06, + "loss": 0.963, + "step": 9361 + }, + { + "epoch": 0.8365090356735989, + "grad_norm": 0.4575260877609253, + "learning_rate": 6.842943646551564e-06, + "loss": 0.9757, + "step": 9362 + }, + { + "epoch": 0.8365983872048607, + "grad_norm": 0.4543364346027374, + "learning_rate": 6.835638280823614e-06, + "loss": 0.9393, + "step": 9363 + }, + { + "epoch": 0.8366877387361226, + "grad_norm": 0.47015100717544556, + "learning_rate": 6.82833653057105e-06, + "loss": 1.009, + "step": 9364 + }, + { + "epoch": 0.8367770902673844, + "grad_norm": 0.4650666415691376, + "learning_rate": 6.821038396405477e-06, + "loss": 0.9546, + "step": 9365 + }, + { + "epoch": 0.8368664417986463, + "grad_norm": 0.537955105304718, + "learning_rate": 6.813743878938189e-06, + "loss": 0.9242, + "step": 9366 + }, + { + "epoch": 0.8369557933299082, + "grad_norm": 0.5456058979034424, + "learning_rate": 6.806452978780198e-06, + "loss": 0.8273, + "step": 9367 + }, + { + "epoch": 0.8370451448611701, + "grad_norm": 0.48449915647506714, + "learning_rate": 6.79916569654217e-06, + "loss": 0.9456, + "step": 9368 + }, + { + "epoch": 0.8371344963924319, + "grad_norm": 0.5084838271141052, + "learning_rate": 6.791882032834501e-06, + "loss": 0.9431, + "step": 9369 + }, + { + "epoch": 0.8372238479236938, + "grad_norm": 0.5198284387588501, + "learning_rate": 6.784601988267281e-06, + "loss": 0.9717, + "step": 9370 + }, + { + "epoch": 0.8373131994549556, + "grad_norm": 0.5668700933456421, + "learning_rate": 6.777325563450282e-06, + "loss": 0.9418, + "step": 9371 + }, + { + "epoch": 0.8374025509862175, + "grad_norm": 0.6485138535499573, + "learning_rate": 6.77005275899299e-06, + "loss": 0.966, + "step": 9372 + }, + { + "epoch": 0.8374919025174794, + "grad_norm": 0.39778807759284973, + "learning_rate": 6.762783575504578e-06, + "loss": 0.9455, + "step": 9373 + }, + { + "epoch": 0.8375812540487413, + "grad_norm": 0.48835721611976624, + "learning_rate": 6.755518013593914e-06, + "loss": 0.978, + "step": 9374 + }, + { + "epoch": 0.8376706055800032, + "grad_norm": 0.6808341145515442, + "learning_rate": 6.748256073869575e-06, + "loss": 0.8574, + "step": 9375 + }, + { + "epoch": 0.8377599571112649, + "grad_norm": 0.43278858065605164, + "learning_rate": 6.7409977569398265e-06, + "loss": 0.964, + "step": 9376 + }, + { + "epoch": 0.8378493086425268, + "grad_norm": 0.4334334433078766, + "learning_rate": 6.733743063412612e-06, + "loss": 0.9898, + "step": 9377 + }, + { + "epoch": 0.8379386601737887, + "grad_norm": 0.4900834262371063, + "learning_rate": 6.726491993895606e-06, + "loss": 0.8638, + "step": 9378 + }, + { + "epoch": 0.8380280117050506, + "grad_norm": 0.6439689993858337, + "learning_rate": 6.719244548996157e-06, + "loss": 0.8321, + "step": 9379 + }, + { + "epoch": 0.8381173632363125, + "grad_norm": 0.4418420195579529, + "learning_rate": 6.712000729321311e-06, + "loss": 0.9594, + "step": 9380 + }, + { + "epoch": 0.8382067147675744, + "grad_norm": 0.5338155031204224, + "learning_rate": 6.704760535477828e-06, + "loss": 0.891, + "step": 9381 + }, + { + "epoch": 0.8382960662988362, + "grad_norm": 0.48710137605667114, + "learning_rate": 6.697523968072139e-06, + "loss": 0.9445, + "step": 9382 + }, + { + "epoch": 0.838385417830098, + "grad_norm": 0.42618831992149353, + "learning_rate": 6.690291027710394e-06, + "loss": 0.9553, + "step": 9383 + }, + { + "epoch": 0.8384747693613599, + "grad_norm": 0.4654203951358795, + "learning_rate": 6.6830617149984175e-06, + "loss": 0.9249, + "step": 9384 + }, + { + "epoch": 0.8385641208926218, + "grad_norm": 0.5858107805252075, + "learning_rate": 6.675836030541755e-06, + "loss": 0.8551, + "step": 9385 + }, + { + "epoch": 0.8386534724238837, + "grad_norm": 0.5533519387245178, + "learning_rate": 6.668613974945631e-06, + "loss": 0.9609, + "step": 9386 + }, + { + "epoch": 0.8387428239551455, + "grad_norm": 0.4850926399230957, + "learning_rate": 6.661395548814958e-06, + "loss": 0.8791, + "step": 9387 + }, + { + "epoch": 0.8388321754864074, + "grad_norm": 0.3984906077384949, + "learning_rate": 6.6541807527543635e-06, + "loss": 0.9768, + "step": 9388 + }, + { + "epoch": 0.8389215270176693, + "grad_norm": 0.5376686453819275, + "learning_rate": 6.646969587368168e-06, + "loss": 0.957, + "step": 9389 + }, + { + "epoch": 0.8390108785489311, + "grad_norm": 0.5016261339187622, + "learning_rate": 6.6397620532603735e-06, + "loss": 1.0018, + "step": 9390 + }, + { + "epoch": 0.839100230080193, + "grad_norm": 0.4670695960521698, + "learning_rate": 6.632558151034701e-06, + "loss": 0.9015, + "step": 9391 + }, + { + "epoch": 0.8391895816114548, + "grad_norm": 0.653988778591156, + "learning_rate": 6.6253578812945414e-06, + "loss": 0.8509, + "step": 9392 + }, + { + "epoch": 0.8392789331427167, + "grad_norm": 0.5167982578277588, + "learning_rate": 6.6181612446430005e-06, + "loss": 0.8648, + "step": 9393 + }, + { + "epoch": 0.8393682846739786, + "grad_norm": 0.5537070631980896, + "learning_rate": 6.610968241682875e-06, + "loss": 0.8879, + "step": 9394 + }, + { + "epoch": 0.8394576362052405, + "grad_norm": 0.489687979221344, + "learning_rate": 6.603778873016653e-06, + "loss": 0.92, + "step": 9395 + }, + { + "epoch": 0.8395469877365024, + "grad_norm": 0.4659658670425415, + "learning_rate": 6.596593139246532e-06, + "loss": 0.9537, + "step": 9396 + }, + { + "epoch": 0.8396363392677642, + "grad_norm": 0.45842671394348145, + "learning_rate": 6.589411040974369e-06, + "loss": 0.9171, + "step": 9397 + }, + { + "epoch": 0.839725690799026, + "grad_norm": 0.5154210329055786, + "learning_rate": 6.582232578801756e-06, + "loss": 0.8698, + "step": 9398 + }, + { + "epoch": 0.8398150423302879, + "grad_norm": 0.5210223197937012, + "learning_rate": 6.575057753329966e-06, + "loss": 0.8417, + "step": 9399 + }, + { + "epoch": 0.8399043938615498, + "grad_norm": 0.605532169342041, + "learning_rate": 6.567886565159975e-06, + "loss": 0.8945, + "step": 9400 + }, + { + "epoch": 0.8399937453928117, + "grad_norm": 0.4984580874443054, + "learning_rate": 6.560719014892425e-06, + "loss": 0.9281, + "step": 9401 + }, + { + "epoch": 0.8400830969240736, + "grad_norm": 0.4397487938404083, + "learning_rate": 6.55355510312769e-06, + "loss": 0.9569, + "step": 9402 + }, + { + "epoch": 0.8401724484553355, + "grad_norm": 0.5144175291061401, + "learning_rate": 6.546394830465819e-06, + "loss": 0.9275, + "step": 9403 + }, + { + "epoch": 0.8402617999865972, + "grad_norm": 0.4532061219215393, + "learning_rate": 6.539238197506564e-06, + "loss": 0.9112, + "step": 9404 + }, + { + "epoch": 0.8403511515178591, + "grad_norm": 0.5863621234893799, + "learning_rate": 6.53208520484937e-06, + "loss": 0.9107, + "step": 9405 + }, + { + "epoch": 0.840440503049121, + "grad_norm": 0.5393437147140503, + "learning_rate": 6.524935853093383e-06, + "loss": 0.8843, + "step": 9406 + }, + { + "epoch": 0.8405298545803829, + "grad_norm": 0.4814126193523407, + "learning_rate": 6.5177901428374154e-06, + "loss": 0.9806, + "step": 9407 + }, + { + "epoch": 0.8406192061116448, + "grad_norm": 0.38951998949050903, + "learning_rate": 6.510648074680015e-06, + "loss": 0.9301, + "step": 9408 + }, + { + "epoch": 0.8407085576429066, + "grad_norm": 0.487699955701828, + "learning_rate": 6.503509649219403e-06, + "loss": 0.9929, + "step": 9409 + }, + { + "epoch": 0.8407979091741685, + "grad_norm": 0.6564955711364746, + "learning_rate": 6.496374867053495e-06, + "loss": 0.8443, + "step": 9410 + }, + { + "epoch": 0.8408872607054303, + "grad_norm": 0.4607921838760376, + "learning_rate": 6.489243728779904e-06, + "loss": 0.9666, + "step": 9411 + }, + { + "epoch": 0.8409766122366922, + "grad_norm": 0.6501184105873108, + "learning_rate": 6.482116234995944e-06, + "loss": 0.8588, + "step": 9412 + }, + { + "epoch": 0.8410659637679541, + "grad_norm": 0.48878639936447144, + "learning_rate": 6.474992386298617e-06, + "loss": 0.964, + "step": 9413 + }, + { + "epoch": 0.8411553152992159, + "grad_norm": 0.5011094808578491, + "learning_rate": 6.467872183284623e-06, + "loss": 0.8937, + "step": 9414 + }, + { + "epoch": 0.8412446668304778, + "grad_norm": 0.429160475730896, + "learning_rate": 6.460755626550352e-06, + "loss": 0.9129, + "step": 9415 + }, + { + "epoch": 0.8413340183617397, + "grad_norm": 0.4783921539783478, + "learning_rate": 6.453642716691905e-06, + "loss": 0.9367, + "step": 9416 + }, + { + "epoch": 0.8414233698930016, + "grad_norm": 0.4726746380329132, + "learning_rate": 6.446533454305037e-06, + "loss": 0.8902, + "step": 9417 + }, + { + "epoch": 0.8415127214242634, + "grad_norm": 0.4634154438972473, + "learning_rate": 6.43942783998524e-06, + "loss": 0.9265, + "step": 9418 + }, + { + "epoch": 0.8416020729555252, + "grad_norm": 0.44272157549858093, + "learning_rate": 6.432325874327683e-06, + "loss": 0.9241, + "step": 9419 + }, + { + "epoch": 0.8416914244867871, + "grad_norm": 0.49922677874565125, + "learning_rate": 6.425227557927232e-06, + "loss": 0.9276, + "step": 9420 + }, + { + "epoch": 0.841780776018049, + "grad_norm": 0.4457327723503113, + "learning_rate": 6.418132891378448e-06, + "loss": 0.9738, + "step": 9421 + }, + { + "epoch": 0.8418701275493109, + "grad_norm": 0.48788073658943176, + "learning_rate": 6.411041875275581e-06, + "loss": 0.9399, + "step": 9422 + }, + { + "epoch": 0.8419594790805728, + "grad_norm": 0.4666908383369446, + "learning_rate": 6.4039545102125845e-06, + "loss": 0.8912, + "step": 9423 + }, + { + "epoch": 0.8420488306118347, + "grad_norm": 0.5896921753883362, + "learning_rate": 6.396870796783095e-06, + "loss": 0.8858, + "step": 9424 + }, + { + "epoch": 0.8421381821430964, + "grad_norm": 0.4257669150829315, + "learning_rate": 6.389790735580458e-06, + "loss": 0.9384, + "step": 9425 + }, + { + "epoch": 0.8422275336743583, + "grad_norm": 0.43601882457733154, + "learning_rate": 6.382714327197703e-06, + "loss": 1.0046, + "step": 9426 + }, + { + "epoch": 0.8423168852056202, + "grad_norm": 0.5371537208557129, + "learning_rate": 6.375641572227542e-06, + "loss": 0.9728, + "step": 9427 + }, + { + "epoch": 0.8424062367368821, + "grad_norm": 0.5477351546287537, + "learning_rate": 6.368572471262402e-06, + "loss": 0.9175, + "step": 9428 + }, + { + "epoch": 0.842495588268144, + "grad_norm": 0.4891539216041565, + "learning_rate": 6.361507024894392e-06, + "loss": 0.9221, + "step": 9429 + }, + { + "epoch": 0.8425849397994059, + "grad_norm": 0.5484804511070251, + "learning_rate": 6.354445233715328e-06, + "loss": 0.8749, + "step": 9430 + }, + { + "epoch": 0.8426742913306676, + "grad_norm": 0.5287595391273499, + "learning_rate": 6.347387098316698e-06, + "loss": 0.9356, + "step": 9431 + }, + { + "epoch": 0.8427636428619295, + "grad_norm": 0.4888148605823517, + "learning_rate": 6.340332619289701e-06, + "loss": 0.9794, + "step": 9432 + }, + { + "epoch": 0.8428529943931914, + "grad_norm": 0.5393836498260498, + "learning_rate": 6.333281797225244e-06, + "loss": 0.8226, + "step": 9433 + }, + { + "epoch": 0.8429423459244533, + "grad_norm": 0.5648978352546692, + "learning_rate": 6.326234632713873e-06, + "loss": 0.902, + "step": 9434 + }, + { + "epoch": 0.8430316974557152, + "grad_norm": 0.4472680687904358, + "learning_rate": 6.319191126345881e-06, + "loss": 0.9544, + "step": 9435 + }, + { + "epoch": 0.843121048986977, + "grad_norm": 0.4734359681606293, + "learning_rate": 6.312151278711237e-06, + "loss": 0.8552, + "step": 9436 + }, + { + "epoch": 0.8432104005182389, + "grad_norm": 0.492206335067749, + "learning_rate": 6.305115090399616e-06, + "loss": 0.9213, + "step": 9437 + }, + { + "epoch": 0.8432997520495007, + "grad_norm": 0.4859955608844757, + "learning_rate": 6.2980825620003455e-06, + "loss": 0.8452, + "step": 9438 + }, + { + "epoch": 0.8433891035807626, + "grad_norm": 0.4734523296356201, + "learning_rate": 6.291053694102489e-06, + "loss": 0.9237, + "step": 9439 + }, + { + "epoch": 0.8434784551120245, + "grad_norm": 0.5449217557907104, + "learning_rate": 6.284028487294796e-06, + "loss": 0.8981, + "step": 9440 + }, + { + "epoch": 0.8435678066432863, + "grad_norm": 0.4198647439479828, + "learning_rate": 6.277006942165692e-06, + "loss": 0.9808, + "step": 9441 + }, + { + "epoch": 0.8436571581745482, + "grad_norm": 0.4316485524177551, + "learning_rate": 6.269989059303305e-06, + "loss": 0.9737, + "step": 9442 + }, + { + "epoch": 0.8437465097058101, + "grad_norm": 0.46406179666519165, + "learning_rate": 6.262974839295471e-06, + "loss": 0.9257, + "step": 9443 + }, + { + "epoch": 0.843835861237072, + "grad_norm": 0.4460984170436859, + "learning_rate": 6.255964282729692e-06, + "loss": 0.9946, + "step": 9444 + }, + { + "epoch": 0.8439252127683338, + "grad_norm": 0.511184573173523, + "learning_rate": 6.248957390193183e-06, + "loss": 0.9457, + "step": 9445 + }, + { + "epoch": 0.8440145642995956, + "grad_norm": 0.5544686913490295, + "learning_rate": 6.2419541622728565e-06, + "loss": 0.9525, + "step": 9446 + }, + { + "epoch": 0.8441039158308575, + "grad_norm": 0.44529908895492554, + "learning_rate": 6.2349545995552875e-06, + "loss": 0.9527, + "step": 9447 + }, + { + "epoch": 0.8441932673621194, + "grad_norm": 0.5516801476478577, + "learning_rate": 6.227958702626769e-06, + "loss": 0.8854, + "step": 9448 + }, + { + "epoch": 0.8442826188933813, + "grad_norm": 0.4624561369419098, + "learning_rate": 6.220966472073287e-06, + "loss": 0.9874, + "step": 9449 + }, + { + "epoch": 0.8443719704246432, + "grad_norm": 0.4866270422935486, + "learning_rate": 6.213977908480518e-06, + "loss": 0.9601, + "step": 9450 + }, + { + "epoch": 0.8444613219559051, + "grad_norm": 0.4873912036418915, + "learning_rate": 6.206993012433815e-06, + "loss": 0.9031, + "step": 9451 + }, + { + "epoch": 0.8445506734871668, + "grad_norm": 0.4816325306892395, + "learning_rate": 6.200011784518256e-06, + "loss": 0.9123, + "step": 9452 + }, + { + "epoch": 0.8446400250184287, + "grad_norm": 0.5002326965332031, + "learning_rate": 6.193034225318578e-06, + "loss": 0.8816, + "step": 9453 + }, + { + "epoch": 0.8447293765496906, + "grad_norm": 0.4894864559173584, + "learning_rate": 6.186060335419236e-06, + "loss": 0.9751, + "step": 9454 + }, + { + "epoch": 0.8448187280809525, + "grad_norm": 0.5670939683914185, + "learning_rate": 6.179090115404368e-06, + "loss": 0.8294, + "step": 9455 + }, + { + "epoch": 0.8449080796122144, + "grad_norm": 0.553982138633728, + "learning_rate": 6.172123565857796e-06, + "loss": 0.8847, + "step": 9456 + }, + { + "epoch": 0.8449974311434763, + "grad_norm": 0.48611873388290405, + "learning_rate": 6.165160687363053e-06, + "loss": 0.9528, + "step": 9457 + }, + { + "epoch": 0.8450867826747381, + "grad_norm": 0.4134733974933624, + "learning_rate": 6.158201480503345e-06, + "loss": 1.0282, + "step": 9458 + }, + { + "epoch": 0.8451761342059999, + "grad_norm": 0.6506361961364746, + "learning_rate": 6.151245945861578e-06, + "loss": 0.8545, + "step": 9459 + }, + { + "epoch": 0.8452654857372618, + "grad_norm": 0.49033278226852417, + "learning_rate": 6.1442940840203575e-06, + "loss": 0.857, + "step": 9460 + }, + { + "epoch": 0.8453548372685237, + "grad_norm": 0.5886410474777222, + "learning_rate": 6.13734589556198e-06, + "loss": 0.8042, + "step": 9461 + }, + { + "epoch": 0.8454441887997856, + "grad_norm": 0.4791713058948517, + "learning_rate": 6.130401381068424e-06, + "loss": 0.9883, + "step": 9462 + }, + { + "epoch": 0.8455335403310474, + "grad_norm": 0.5414884686470032, + "learning_rate": 6.123460541121368e-06, + "loss": 0.9804, + "step": 9463 + }, + { + "epoch": 0.8456228918623093, + "grad_norm": 0.5500231981277466, + "learning_rate": 6.11652337630218e-06, + "loss": 0.9065, + "step": 9464 + }, + { + "epoch": 0.8457122433935712, + "grad_norm": 0.4416286051273346, + "learning_rate": 6.109589887191924e-06, + "loss": 0.9156, + "step": 9465 + }, + { + "epoch": 0.845801594924833, + "grad_norm": 0.4755689799785614, + "learning_rate": 6.102660074371369e-06, + "loss": 0.9018, + "step": 9466 + }, + { + "epoch": 0.8458909464560949, + "grad_norm": 0.5172240138053894, + "learning_rate": 6.095733938420928e-06, + "loss": 0.9731, + "step": 9467 + }, + { + "epoch": 0.8459802979873567, + "grad_norm": 0.47983941435813904, + "learning_rate": 6.0888114799207586e-06, + "loss": 0.99, + "step": 9468 + }, + { + "epoch": 0.8460696495186186, + "grad_norm": 0.4617604911327362, + "learning_rate": 6.081892699450687e-06, + "loss": 0.9207, + "step": 9469 + }, + { + "epoch": 0.8461590010498805, + "grad_norm": 0.43974384665489197, + "learning_rate": 6.0749775975902424e-06, + "loss": 0.919, + "step": 9470 + }, + { + "epoch": 0.8462483525811424, + "grad_norm": 0.5544477701187134, + "learning_rate": 6.068066174918624e-06, + "loss": 0.9164, + "step": 9471 + }, + { + "epoch": 0.8463377041124043, + "grad_norm": 0.5196595191955566, + "learning_rate": 6.0611584320147465e-06, + "loss": 0.9605, + "step": 9472 + }, + { + "epoch": 0.846427055643666, + "grad_norm": 0.47909513115882874, + "learning_rate": 6.054254369457202e-06, + "loss": 0.8867, + "step": 9473 + }, + { + "epoch": 0.8465164071749279, + "grad_norm": 0.5090166926383972, + "learning_rate": 6.047353987824278e-06, + "loss": 0.8925, + "step": 9474 + }, + { + "epoch": 0.8466057587061898, + "grad_norm": 0.4992344081401825, + "learning_rate": 6.040457287693963e-06, + "loss": 0.9032, + "step": 9475 + }, + { + "epoch": 0.8466951102374517, + "grad_norm": 0.5999143123626709, + "learning_rate": 6.033564269643926e-06, + "loss": 0.9008, + "step": 9476 + }, + { + "epoch": 0.8467844617687136, + "grad_norm": 0.5238880515098572, + "learning_rate": 6.026674934251542e-06, + "loss": 0.9067, + "step": 9477 + }, + { + "epoch": 0.8468738132999755, + "grad_norm": 0.5634439587593079, + "learning_rate": 6.0197892820938405e-06, + "loss": 0.8967, + "step": 9478 + }, + { + "epoch": 0.8469631648312373, + "grad_norm": 0.4486282467842102, + "learning_rate": 6.012907313747585e-06, + "loss": 0.9542, + "step": 9479 + }, + { + "epoch": 0.8470525163624991, + "grad_norm": 0.49240148067474365, + "learning_rate": 6.006029029789206e-06, + "loss": 0.9843, + "step": 9480 + }, + { + "epoch": 0.847141867893761, + "grad_norm": 0.5785838961601257, + "learning_rate": 5.99915443079484e-06, + "loss": 0.9319, + "step": 9481 + }, + { + "epoch": 0.8472312194250229, + "grad_norm": 0.6052419543266296, + "learning_rate": 5.992283517340308e-06, + "loss": 0.7929, + "step": 9482 + }, + { + "epoch": 0.8473205709562848, + "grad_norm": 0.452104777097702, + "learning_rate": 5.985416290001117e-06, + "loss": 0.9212, + "step": 9483 + }, + { + "epoch": 0.8474099224875467, + "grad_norm": 0.49323323369026184, + "learning_rate": 5.978552749352473e-06, + "loss": 0.8627, + "step": 9484 + }, + { + "epoch": 0.8474992740188085, + "grad_norm": 0.5414223670959473, + "learning_rate": 5.971692895969272e-06, + "loss": 0.9574, + "step": 9485 + }, + { + "epoch": 0.8475886255500704, + "grad_norm": 0.6366201043128967, + "learning_rate": 5.964836730426099e-06, + "loss": 0.9616, + "step": 9486 + }, + { + "epoch": 0.8476779770813322, + "grad_norm": 0.6016992926597595, + "learning_rate": 5.957984253297244e-06, + "loss": 0.9459, + "step": 9487 + }, + { + "epoch": 0.8477673286125941, + "grad_norm": 0.48284274339675903, + "learning_rate": 5.9511354651566485e-06, + "loss": 0.9174, + "step": 9488 + }, + { + "epoch": 0.847856680143856, + "grad_norm": 0.42610475420951843, + "learning_rate": 5.944290366577987e-06, + "loss": 0.965, + "step": 9489 + }, + { + "epoch": 0.8479460316751178, + "grad_norm": 0.4493615925312042, + "learning_rate": 5.937448958134606e-06, + "loss": 0.9472, + "step": 9490 + }, + { + "epoch": 0.8480353832063797, + "grad_norm": 0.46299251914024353, + "learning_rate": 5.930611240399548e-06, + "loss": 0.948, + "step": 9491 + }, + { + "epoch": 0.8481247347376416, + "grad_norm": 0.482902854681015, + "learning_rate": 5.923777213945541e-06, + "loss": 0.905, + "step": 9492 + }, + { + "epoch": 0.8482140862689035, + "grad_norm": 0.5270372033119202, + "learning_rate": 5.916946879345015e-06, + "loss": 0.9044, + "step": 9493 + }, + { + "epoch": 0.8483034378001653, + "grad_norm": 0.47987642884254456, + "learning_rate": 5.91012023717008e-06, + "loss": 0.9566, + "step": 9494 + }, + { + "epoch": 0.8483927893314271, + "grad_norm": 0.4810824990272522, + "learning_rate": 5.903297287992538e-06, + "loss": 0.8675, + "step": 9495 + }, + { + "epoch": 0.848482140862689, + "grad_norm": 0.5205212831497192, + "learning_rate": 5.89647803238389e-06, + "loss": 0.9361, + "step": 9496 + }, + { + "epoch": 0.8485714923939509, + "grad_norm": 0.45572689175605774, + "learning_rate": 5.889662470915319e-06, + "loss": 0.9834, + "step": 9497 + }, + { + "epoch": 0.8486608439252128, + "grad_norm": 0.4855954945087433, + "learning_rate": 5.882850604157691e-06, + "loss": 0.8503, + "step": 9498 + }, + { + "epoch": 0.8487501954564747, + "grad_norm": 0.4647740423679352, + "learning_rate": 5.876042432681584e-06, + "loss": 0.943, + "step": 9499 + }, + { + "epoch": 0.8488395469877364, + "grad_norm": 0.5627195239067078, + "learning_rate": 5.869237957057244e-06, + "loss": 0.8846, + "step": 9500 + }, + { + "epoch": 0.8489288985189983, + "grad_norm": 0.4476568102836609, + "learning_rate": 5.86243717785463e-06, + "loss": 0.9631, + "step": 9501 + }, + { + "epoch": 0.8490182500502602, + "grad_norm": 0.42511650919914246, + "learning_rate": 5.855640095643372e-06, + "loss": 0.8941, + "step": 9502 + }, + { + "epoch": 0.8491076015815221, + "grad_norm": 0.6032181978225708, + "learning_rate": 5.848846710992817e-06, + "loss": 0.9229, + "step": 9503 + }, + { + "epoch": 0.849196953112784, + "grad_norm": 0.49917519092559814, + "learning_rate": 5.842057024471953e-06, + "loss": 0.9726, + "step": 9504 + }, + { + "epoch": 0.8492863046440459, + "grad_norm": 0.4302211105823517, + "learning_rate": 5.835271036649503e-06, + "loss": 0.9793, + "step": 9505 + }, + { + "epoch": 0.8493756561753077, + "grad_norm": 0.6006591320037842, + "learning_rate": 5.8284887480938636e-06, + "loss": 0.9136, + "step": 9506 + }, + { + "epoch": 0.8494650077065695, + "grad_norm": 0.5111293792724609, + "learning_rate": 5.821710159373128e-06, + "loss": 0.8993, + "step": 9507 + }, + { + "epoch": 0.8495543592378314, + "grad_norm": 0.5666483640670776, + "learning_rate": 5.814935271055083e-06, + "loss": 0.921, + "step": 9508 + }, + { + "epoch": 0.8496437107690933, + "grad_norm": 0.522527813911438, + "learning_rate": 5.808164083707179e-06, + "loss": 0.9923, + "step": 9509 + }, + { + "epoch": 0.8497330623003552, + "grad_norm": 0.5020400285720825, + "learning_rate": 5.8013965978965855e-06, + "loss": 0.9426, + "step": 9510 + }, + { + "epoch": 0.849822413831617, + "grad_norm": 0.4437236487865448, + "learning_rate": 5.794632814190148e-06, + "loss": 0.9501, + "step": 9511 + }, + { + "epoch": 0.8499117653628789, + "grad_norm": 0.49814578890800476, + "learning_rate": 5.787872733154409e-06, + "loss": 0.936, + "step": 9512 + }, + { + "epoch": 0.8500011168941408, + "grad_norm": 0.550423264503479, + "learning_rate": 5.781116355355593e-06, + "loss": 1.0181, + "step": 9513 + }, + { + "epoch": 0.8500904684254026, + "grad_norm": 0.7392222285270691, + "learning_rate": 5.774363681359624e-06, + "loss": 0.9255, + "step": 9514 + }, + { + "epoch": 0.8501798199566645, + "grad_norm": 0.5093478560447693, + "learning_rate": 5.767614711732111e-06, + "loss": 0.8951, + "step": 9515 + }, + { + "epoch": 0.8502691714879264, + "grad_norm": 0.5823287963867188, + "learning_rate": 5.760869447038348e-06, + "loss": 0.861, + "step": 9516 + }, + { + "epoch": 0.8503585230191882, + "grad_norm": 0.42861655354499817, + "learning_rate": 5.75412788784333e-06, + "loss": 0.9809, + "step": 9517 + }, + { + "epoch": 0.8504478745504501, + "grad_norm": 0.44293922185897827, + "learning_rate": 5.74739003471172e-06, + "loss": 0.9667, + "step": 9518 + }, + { + "epoch": 0.850537226081712, + "grad_norm": 0.46408700942993164, + "learning_rate": 5.740655888207897e-06, + "loss": 0.9613, + "step": 9519 + }, + { + "epoch": 0.8506265776129739, + "grad_norm": 0.42871806025505066, + "learning_rate": 5.73392544889591e-06, + "loss": 0.987, + "step": 9520 + }, + { + "epoch": 0.8507159291442357, + "grad_norm": 0.5720949172973633, + "learning_rate": 5.727198717339511e-06, + "loss": 0.8512, + "step": 9521 + }, + { + "epoch": 0.8508052806754975, + "grad_norm": 0.46692514419555664, + "learning_rate": 5.720475694102128e-06, + "loss": 0.9157, + "step": 9522 + }, + { + "epoch": 0.8508946322067594, + "grad_norm": 0.47131991386413574, + "learning_rate": 5.713756379746898e-06, + "loss": 0.9097, + "step": 9523 + }, + { + "epoch": 0.8509839837380213, + "grad_norm": 0.5264059901237488, + "learning_rate": 5.707040774836625e-06, + "loss": 0.8181, + "step": 9524 + }, + { + "epoch": 0.8510733352692832, + "grad_norm": 0.5986820459365845, + "learning_rate": 5.70032887993382e-06, + "loss": 0.9024, + "step": 9525 + }, + { + "epoch": 0.8511626868005451, + "grad_norm": 0.5732765197753906, + "learning_rate": 5.693620695600671e-06, + "loss": 0.8888, + "step": 9526 + }, + { + "epoch": 0.851252038331807, + "grad_norm": 0.5805981755256653, + "learning_rate": 5.686916222399069e-06, + "loss": 0.8615, + "step": 9527 + }, + { + "epoch": 0.8513413898630687, + "grad_norm": 0.4108264744281769, + "learning_rate": 5.68021546089057e-06, + "loss": 1.0108, + "step": 9528 + }, + { + "epoch": 0.8514307413943306, + "grad_norm": 0.48710349202156067, + "learning_rate": 5.673518411636436e-06, + "loss": 0.9861, + "step": 9529 + }, + { + "epoch": 0.8515200929255925, + "grad_norm": 0.5818296670913696, + "learning_rate": 5.666825075197624e-06, + "loss": 0.882, + "step": 9530 + }, + { + "epoch": 0.8516094444568544, + "grad_norm": 0.433623343706131, + "learning_rate": 5.6601354521347685e-06, + "loss": 0.9407, + "step": 9531 + }, + { + "epoch": 0.8516987959881163, + "grad_norm": 0.5480315685272217, + "learning_rate": 5.653449543008199e-06, + "loss": 0.8762, + "step": 9532 + }, + { + "epoch": 0.8517881475193781, + "grad_norm": 0.45491036772727966, + "learning_rate": 5.646767348377935e-06, + "loss": 0.9376, + "step": 9533 + }, + { + "epoch": 0.85187749905064, + "grad_norm": 0.43504709005355835, + "learning_rate": 5.640088868803673e-06, + "loss": 0.9852, + "step": 9534 + }, + { + "epoch": 0.8519668505819018, + "grad_norm": 0.5115997791290283, + "learning_rate": 5.633414104844808e-06, + "loss": 0.9546, + "step": 9535 + }, + { + "epoch": 0.8520562021131637, + "grad_norm": 0.3994527757167816, + "learning_rate": 5.6267430570604405e-06, + "loss": 0.9405, + "step": 9536 + }, + { + "epoch": 0.8521455536444256, + "grad_norm": 0.5334661602973938, + "learning_rate": 5.620075726009311e-06, + "loss": 0.9164, + "step": 9537 + }, + { + "epoch": 0.8522349051756875, + "grad_norm": 0.41900575160980225, + "learning_rate": 5.6134121122498995e-06, + "loss": 0.9545, + "step": 9538 + }, + { + "epoch": 0.8523242567069493, + "grad_norm": 0.583827555179596, + "learning_rate": 5.606752216340349e-06, + "loss": 0.8339, + "step": 9539 + }, + { + "epoch": 0.8524136082382112, + "grad_norm": 0.6421988606452942, + "learning_rate": 5.6000960388385095e-06, + "loss": 0.8723, + "step": 9540 + }, + { + "epoch": 0.8525029597694731, + "grad_norm": 0.4667465090751648, + "learning_rate": 5.593443580301888e-06, + "loss": 0.934, + "step": 9541 + }, + { + "epoch": 0.8525923113007349, + "grad_norm": 0.48862892389297485, + "learning_rate": 5.586794841287701e-06, + "loss": 0.9056, + "step": 9542 + }, + { + "epoch": 0.8526816628319968, + "grad_norm": 0.4161957800388336, + "learning_rate": 5.5801498223528644e-06, + "loss": 0.9632, + "step": 9543 + }, + { + "epoch": 0.8527710143632586, + "grad_norm": 0.6144065260887146, + "learning_rate": 5.573508524053955e-06, + "loss": 0.8685, + "step": 9544 + }, + { + "epoch": 0.8528603658945205, + "grad_norm": 0.44452551007270813, + "learning_rate": 5.566870946947261e-06, + "loss": 0.9958, + "step": 9545 + }, + { + "epoch": 0.8529497174257824, + "grad_norm": 0.6683482527732849, + "learning_rate": 5.5602370915887494e-06, + "loss": 0.8042, + "step": 9546 + }, + { + "epoch": 0.8530390689570443, + "grad_norm": 0.4790489971637726, + "learning_rate": 5.553606958534074e-06, + "loss": 0.9908, + "step": 9547 + }, + { + "epoch": 0.8531284204883062, + "grad_norm": 0.4621983766555786, + "learning_rate": 5.546980548338593e-06, + "loss": 0.9601, + "step": 9548 + }, + { + "epoch": 0.8532177720195679, + "grad_norm": 0.42011430859565735, + "learning_rate": 5.54035786155731e-06, + "loss": 0.965, + "step": 9549 + }, + { + "epoch": 0.8533071235508298, + "grad_norm": 0.4877914488315582, + "learning_rate": 5.533738898744967e-06, + "loss": 0.8975, + "step": 9550 + }, + { + "epoch": 0.8533964750820917, + "grad_norm": 0.49201685190200806, + "learning_rate": 5.527123660455969e-06, + "loss": 0.9032, + "step": 9551 + }, + { + "epoch": 0.8534858266133536, + "grad_norm": 0.4355340898036957, + "learning_rate": 5.5205121472444055e-06, + "loss": 0.9941, + "step": 9552 + }, + { + "epoch": 0.8535751781446155, + "grad_norm": 0.43532299995422363, + "learning_rate": 5.513904359664074e-06, + "loss": 0.9603, + "step": 9553 + }, + { + "epoch": 0.8536645296758774, + "grad_norm": 0.5881833434104919, + "learning_rate": 5.507300298268436e-06, + "loss": 0.9591, + "step": 9554 + }, + { + "epoch": 0.8537538812071392, + "grad_norm": 0.4763388931751251, + "learning_rate": 5.500699963610656e-06, + "loss": 0.9825, + "step": 9555 + }, + { + "epoch": 0.853843232738401, + "grad_norm": 0.45608317852020264, + "learning_rate": 5.49410335624358e-06, + "loss": 0.9578, + "step": 9556 + }, + { + "epoch": 0.8539325842696629, + "grad_norm": 0.5364043712615967, + "learning_rate": 5.487510476719748e-06, + "loss": 1.0419, + "step": 9557 + }, + { + "epoch": 0.8540219358009248, + "grad_norm": 0.41415050625801086, + "learning_rate": 5.480921325591398e-06, + "loss": 0.9417, + "step": 9558 + }, + { + "epoch": 0.8541112873321867, + "grad_norm": 0.42311692237854004, + "learning_rate": 5.47433590341041e-06, + "loss": 0.9467, + "step": 9559 + }, + { + "epoch": 0.8542006388634485, + "grad_norm": 0.44652259349823, + "learning_rate": 5.467754210728404e-06, + "loss": 1.0331, + "step": 9560 + }, + { + "epoch": 0.8542899903947104, + "grad_norm": 0.4289079010486603, + "learning_rate": 5.4611762480966555e-06, + "loss": 0.9254, + "step": 9561 + }, + { + "epoch": 0.8543793419259722, + "grad_norm": 0.4332316219806671, + "learning_rate": 5.4546020160661515e-06, + "loss": 1.0019, + "step": 9562 + }, + { + "epoch": 0.8544686934572341, + "grad_norm": 0.5433120727539062, + "learning_rate": 5.448031515187552e-06, + "loss": 0.8531, + "step": 9563 + }, + { + "epoch": 0.854558044988496, + "grad_norm": 0.4383524954319, + "learning_rate": 5.441464746011194e-06, + "loss": 0.9369, + "step": 9564 + }, + { + "epoch": 0.8546473965197579, + "grad_norm": 0.4835048019886017, + "learning_rate": 5.434901709087131e-06, + "loss": 0.9205, + "step": 9565 + }, + { + "epoch": 0.8547367480510197, + "grad_norm": 0.44369155168533325, + "learning_rate": 5.428342404965076e-06, + "loss": 0.9472, + "step": 9566 + }, + { + "epoch": 0.8548260995822816, + "grad_norm": 0.4760458469390869, + "learning_rate": 5.421786834194442e-06, + "loss": 0.8901, + "step": 9567 + }, + { + "epoch": 0.8549154511135435, + "grad_norm": 0.4910159707069397, + "learning_rate": 5.415234997324348e-06, + "loss": 1.0034, + "step": 9568 + }, + { + "epoch": 0.8550048026448053, + "grad_norm": 0.600982129573822, + "learning_rate": 5.4086868949035475e-06, + "loss": 0.838, + "step": 9569 + }, + { + "epoch": 0.8550941541760672, + "grad_norm": 0.6115323901176453, + "learning_rate": 5.4021425274805245e-06, + "loss": 0.8826, + "step": 9570 + }, + { + "epoch": 0.855183505707329, + "grad_norm": 0.41822290420532227, + "learning_rate": 5.395601895603453e-06, + "loss": 0.9599, + "step": 9571 + }, + { + "epoch": 0.8552728572385909, + "grad_norm": 0.5147873163223267, + "learning_rate": 5.389064999820165e-06, + "loss": 0.9285, + "step": 9572 + }, + { + "epoch": 0.8553622087698528, + "grad_norm": 0.5221825838088989, + "learning_rate": 5.3825318406782125e-06, + "loss": 0.9447, + "step": 9573 + }, + { + "epoch": 0.8554515603011147, + "grad_norm": 0.4737570285797119, + "learning_rate": 5.376002418724796e-06, + "loss": 0.9483, + "step": 9574 + }, + { + "epoch": 0.8555409118323766, + "grad_norm": 0.5008856058120728, + "learning_rate": 5.3694767345068345e-06, + "loss": 0.8795, + "step": 9575 + }, + { + "epoch": 0.8556302633636383, + "grad_norm": 0.4690033197402954, + "learning_rate": 5.362954788570928e-06, + "loss": 0.9913, + "step": 9576 + }, + { + "epoch": 0.8557196148949002, + "grad_norm": 0.5061206221580505, + "learning_rate": 5.356436581463353e-06, + "loss": 0.8482, + "step": 9577 + }, + { + "epoch": 0.8558089664261621, + "grad_norm": 0.5794250965118408, + "learning_rate": 5.3499221137300885e-06, + "loss": 0.8922, + "step": 9578 + }, + { + "epoch": 0.855898317957424, + "grad_norm": 0.5473932027816772, + "learning_rate": 5.343411385916769e-06, + "loss": 0.9497, + "step": 9579 + }, + { + "epoch": 0.8559876694886859, + "grad_norm": 0.42786940932273865, + "learning_rate": 5.3369043985687595e-06, + "loss": 0.9562, + "step": 9580 + }, + { + "epoch": 0.8560770210199478, + "grad_norm": 0.5070851445198059, + "learning_rate": 5.330401152231074e-06, + "loss": 0.9229, + "step": 9581 + }, + { + "epoch": 0.8561663725512096, + "grad_norm": 0.45324572920799255, + "learning_rate": 5.323901647448437e-06, + "loss": 1.023, + "step": 9582 + }, + { + "epoch": 0.8562557240824714, + "grad_norm": 0.6096236109733582, + "learning_rate": 5.317405884765253e-06, + "loss": 0.9328, + "step": 9583 + }, + { + "epoch": 0.8563450756137333, + "grad_norm": 0.5863158106803894, + "learning_rate": 5.310913864725609e-06, + "loss": 0.915, + "step": 9584 + }, + { + "epoch": 0.8564344271449952, + "grad_norm": 0.4709130525588989, + "learning_rate": 5.304425587873274e-06, + "loss": 0.8696, + "step": 9585 + }, + { + "epoch": 0.8565237786762571, + "grad_norm": 0.607227623462677, + "learning_rate": 5.29794105475172e-06, + "loss": 0.924, + "step": 9586 + }, + { + "epoch": 0.856613130207519, + "grad_norm": 0.4545291066169739, + "learning_rate": 5.291460265904097e-06, + "loss": 0.9702, + "step": 9587 + }, + { + "epoch": 0.8567024817387808, + "grad_norm": 0.44768497347831726, + "learning_rate": 5.284983221873241e-06, + "loss": 1.0336, + "step": 9588 + }, + { + "epoch": 0.8567918332700427, + "grad_norm": 0.666070282459259, + "learning_rate": 5.278509923201657e-06, + "loss": 0.8109, + "step": 9589 + }, + { + "epoch": 0.8568811848013045, + "grad_norm": 0.5126855969429016, + "learning_rate": 5.2720403704315635e-06, + "loss": 0.904, + "step": 9590 + }, + { + "epoch": 0.8569705363325664, + "grad_norm": 0.47559547424316406, + "learning_rate": 5.265574564104858e-06, + "loss": 0.9819, + "step": 9591 + }, + { + "epoch": 0.8570598878638283, + "grad_norm": 0.43424925208091736, + "learning_rate": 5.259112504763114e-06, + "loss": 0.9388, + "step": 9592 + }, + { + "epoch": 0.8571492393950901, + "grad_norm": 0.3889505863189697, + "learning_rate": 5.252654192947603e-06, + "loss": 1.003, + "step": 9593 + }, + { + "epoch": 0.857238590926352, + "grad_norm": 0.47286322712898254, + "learning_rate": 5.246199629199277e-06, + "loss": 0.8823, + "step": 9594 + }, + { + "epoch": 0.8573279424576139, + "grad_norm": 0.46248650550842285, + "learning_rate": 5.23974881405877e-06, + "loss": 0.9612, + "step": 9595 + }, + { + "epoch": 0.8574172939888758, + "grad_norm": 0.5204552412033081, + "learning_rate": 5.233301748066416e-06, + "loss": 0.9113, + "step": 9596 + }, + { + "epoch": 0.8575066455201376, + "grad_norm": 0.42434239387512207, + "learning_rate": 5.226858431762216e-06, + "loss": 0.9449, + "step": 9597 + }, + { + "epoch": 0.8575959970513994, + "grad_norm": 0.6636669039726257, + "learning_rate": 5.2204188656858775e-06, + "loss": 0.9023, + "step": 9598 + }, + { + "epoch": 0.8576853485826613, + "grad_norm": 0.49965164065361023, + "learning_rate": 5.213983050376764e-06, + "loss": 0.9547, + "step": 9599 + }, + { + "epoch": 0.8577747001139232, + "grad_norm": 0.43195703625679016, + "learning_rate": 5.207550986373961e-06, + "loss": 0.9084, + "step": 9600 + }, + { + "epoch": 0.8578640516451851, + "grad_norm": 0.4506111145019531, + "learning_rate": 5.201122674216208e-06, + "loss": 0.9734, + "step": 9601 + }, + { + "epoch": 0.857953403176447, + "grad_norm": 0.4218018054962158, + "learning_rate": 5.194698114441959e-06, + "loss": 0.9299, + "step": 9602 + }, + { + "epoch": 0.8580427547077089, + "grad_norm": 0.5177663564682007, + "learning_rate": 5.188277307589334e-06, + "loss": 0.8914, + "step": 9603 + }, + { + "epoch": 0.8581321062389706, + "grad_norm": 0.5173326134681702, + "learning_rate": 5.181860254196136e-06, + "loss": 0.9674, + "step": 9604 + }, + { + "epoch": 0.8582214577702325, + "grad_norm": 0.49760016798973083, + "learning_rate": 5.175446954799873e-06, + "loss": 0.8798, + "step": 9605 + }, + { + "epoch": 0.8583108093014944, + "grad_norm": 0.48603054881095886, + "learning_rate": 5.169037409937733e-06, + "loss": 0.8737, + "step": 9606 + }, + { + "epoch": 0.8584001608327563, + "grad_norm": 0.4998380243778229, + "learning_rate": 5.162631620146563e-06, + "loss": 0.9471, + "step": 9607 + }, + { + "epoch": 0.8584895123640182, + "grad_norm": 0.5357629656791687, + "learning_rate": 5.15622958596293e-06, + "loss": 0.9189, + "step": 9608 + }, + { + "epoch": 0.85857886389528, + "grad_norm": 0.5003303289413452, + "learning_rate": 5.149831307923064e-06, + "loss": 0.9853, + "step": 9609 + }, + { + "epoch": 0.8586682154265419, + "grad_norm": 0.4467265009880066, + "learning_rate": 5.1434367865629e-06, + "loss": 0.9858, + "step": 9610 + }, + { + "epoch": 0.8587575669578037, + "grad_norm": 0.6121494770050049, + "learning_rate": 5.137046022418046e-06, + "loss": 0.859, + "step": 9611 + }, + { + "epoch": 0.8588469184890656, + "grad_norm": 0.5334439873695374, + "learning_rate": 5.130659016023787e-06, + "loss": 0.884, + "step": 9612 + }, + { + "epoch": 0.8589362700203275, + "grad_norm": 0.4914329946041107, + "learning_rate": 5.12427576791511e-06, + "loss": 0.9521, + "step": 9613 + }, + { + "epoch": 0.8590256215515893, + "grad_norm": 0.5462926626205444, + "learning_rate": 5.117896278626671e-06, + "loss": 0.9285, + "step": 9614 + }, + { + "epoch": 0.8591149730828512, + "grad_norm": 0.4273386001586914, + "learning_rate": 5.111520548692833e-06, + "loss": 0.9606, + "step": 9615 + }, + { + "epoch": 0.8592043246141131, + "grad_norm": 0.6681180596351624, + "learning_rate": 5.105148578647623e-06, + "loss": 0.773, + "step": 9616 + }, + { + "epoch": 0.859293676145375, + "grad_norm": 0.4087236225605011, + "learning_rate": 5.0987803690247635e-06, + "loss": 0.9703, + "step": 9617 + }, + { + "epoch": 0.8593830276766368, + "grad_norm": 0.499735027551651, + "learning_rate": 5.092415920357674e-06, + "loss": 0.9526, + "step": 9618 + }, + { + "epoch": 0.8594723792078987, + "grad_norm": 0.482085257768631, + "learning_rate": 5.086055233179421e-06, + "loss": 0.8669, + "step": 9619 + }, + { + "epoch": 0.8595617307391605, + "grad_norm": 0.448794960975647, + "learning_rate": 5.07969830802279e-06, + "loss": 0.9589, + "step": 9620 + }, + { + "epoch": 0.8596510822704224, + "grad_norm": 0.464096337556839, + "learning_rate": 5.073345145420238e-06, + "loss": 0.9648, + "step": 9621 + }, + { + "epoch": 0.8597404338016843, + "grad_norm": 0.5646736025810242, + "learning_rate": 5.066995745903919e-06, + "loss": 0.96, + "step": 9622 + }, + { + "epoch": 0.8598297853329462, + "grad_norm": 0.5770590901374817, + "learning_rate": 5.060650110005655e-06, + "loss": 0.9433, + "step": 9623 + }, + { + "epoch": 0.859919136864208, + "grad_norm": 0.5076577067375183, + "learning_rate": 5.0543082382569666e-06, + "loss": 0.8988, + "step": 9624 + }, + { + "epoch": 0.8600084883954698, + "grad_norm": 0.40074893832206726, + "learning_rate": 5.047970131189045e-06, + "loss": 0.9946, + "step": 9625 + }, + { + "epoch": 0.8600978399267317, + "grad_norm": 0.530560314655304, + "learning_rate": 5.0416357893327826e-06, + "loss": 1.0576, + "step": 9626 + }, + { + "epoch": 0.8601871914579936, + "grad_norm": 0.5239288806915283, + "learning_rate": 5.035305213218744e-06, + "loss": 0.9083, + "step": 9627 + }, + { + "epoch": 0.8602765429892555, + "grad_norm": 0.6042668223381042, + "learning_rate": 5.028978403377182e-06, + "loss": 0.8771, + "step": 9628 + }, + { + "epoch": 0.8603658945205174, + "grad_norm": 0.5551727414131165, + "learning_rate": 5.022655360338047e-06, + "loss": 0.9087, + "step": 9629 + }, + { + "epoch": 0.8604552460517793, + "grad_norm": 0.5318360328674316, + "learning_rate": 5.016336084630935e-06, + "loss": 0.8723, + "step": 9630 + }, + { + "epoch": 0.860544597583041, + "grad_norm": 0.42179903388023376, + "learning_rate": 5.010020576785174e-06, + "loss": 0.9642, + "step": 9631 + }, + { + "epoch": 0.8606339491143029, + "grad_norm": 0.5042697787284851, + "learning_rate": 5.0037088373297455e-06, + "loss": 0.8744, + "step": 9632 + }, + { + "epoch": 0.8607233006455648, + "grad_norm": 0.539291501045227, + "learning_rate": 4.997400866793328e-06, + "loss": 0.9179, + "step": 9633 + }, + { + "epoch": 0.8608126521768267, + "grad_norm": 0.5395140647888184, + "learning_rate": 4.991096665704281e-06, + "loss": 0.9229, + "step": 9634 + }, + { + "epoch": 0.8609020037080886, + "grad_norm": 0.46536731719970703, + "learning_rate": 4.984796234590644e-06, + "loss": 0.9679, + "step": 9635 + }, + { + "epoch": 0.8609913552393504, + "grad_norm": 0.4683972895145416, + "learning_rate": 4.978499573980155e-06, + "loss": 0.9381, + "step": 9636 + }, + { + "epoch": 0.8610807067706123, + "grad_norm": 0.49697425961494446, + "learning_rate": 4.972206684400221e-06, + "loss": 0.9342, + "step": 9637 + }, + { + "epoch": 0.8611700583018741, + "grad_norm": 0.5050958395004272, + "learning_rate": 4.9659175663779365e-06, + "loss": 0.9076, + "step": 9638 + }, + { + "epoch": 0.861259409833136, + "grad_norm": 0.5287109613418579, + "learning_rate": 4.959632220440097e-06, + "loss": 0.8461, + "step": 9639 + }, + { + "epoch": 0.8613487613643979, + "grad_norm": 0.7550068497657776, + "learning_rate": 4.953350647113148e-06, + "loss": 0.8207, + "step": 9640 + }, + { + "epoch": 0.8614381128956597, + "grad_norm": 0.4816301763057709, + "learning_rate": 4.94707284692324e-06, + "loss": 0.911, + "step": 9641 + }, + { + "epoch": 0.8615274644269216, + "grad_norm": 0.4800666570663452, + "learning_rate": 4.940798820396214e-06, + "loss": 0.8894, + "step": 9642 + }, + { + "epoch": 0.8616168159581835, + "grad_norm": 0.4487608075141907, + "learning_rate": 4.934528568057589e-06, + "loss": 0.9399, + "step": 9643 + }, + { + "epoch": 0.8617061674894454, + "grad_norm": 0.5139451026916504, + "learning_rate": 4.928262090432556e-06, + "loss": 0.9371, + "step": 9644 + }, + { + "epoch": 0.8617955190207072, + "grad_norm": 0.44007474184036255, + "learning_rate": 4.921999388045995e-06, + "loss": 0.9442, + "step": 9645 + }, + { + "epoch": 0.861884870551969, + "grad_norm": 0.4355262219905853, + "learning_rate": 4.915740461422491e-06, + "loss": 0.9202, + "step": 9646 + }, + { + "epoch": 0.8619742220832309, + "grad_norm": 0.425823450088501, + "learning_rate": 4.909485311086281e-06, + "loss": 1.0389, + "step": 9647 + }, + { + "epoch": 0.8620635736144928, + "grad_norm": 0.5076099038124084, + "learning_rate": 4.903233937561308e-06, + "loss": 0.9965, + "step": 9648 + }, + { + "epoch": 0.8621529251457547, + "grad_norm": 0.5130895972251892, + "learning_rate": 4.896986341371201e-06, + "loss": 0.8541, + "step": 9649 + }, + { + "epoch": 0.8622422766770166, + "grad_norm": 0.4927546977996826, + "learning_rate": 4.890742523039238e-06, + "loss": 0.8913, + "step": 9650 + }, + { + "epoch": 0.8623316282082785, + "grad_norm": 0.6410254240036011, + "learning_rate": 4.884502483088421e-06, + "loss": 0.8739, + "step": 9651 + }, + { + "epoch": 0.8624209797395402, + "grad_norm": 0.4251442849636078, + "learning_rate": 4.8782662220414225e-06, + "loss": 0.9444, + "step": 9652 + }, + { + "epoch": 0.8625103312708021, + "grad_norm": 0.46380364894866943, + "learning_rate": 4.872033740420584e-06, + "loss": 0.9366, + "step": 9653 + }, + { + "epoch": 0.862599682802064, + "grad_norm": 0.4519484043121338, + "learning_rate": 4.86580503874795e-06, + "loss": 0.8937, + "step": 9654 + }, + { + "epoch": 0.8626890343333259, + "grad_norm": 0.6642407178878784, + "learning_rate": 4.8595801175452425e-06, + "loss": 0.8715, + "step": 9655 + }, + { + "epoch": 0.8627783858645878, + "grad_norm": 0.6670805215835571, + "learning_rate": 4.853358977333866e-06, + "loss": 0.8974, + "step": 9656 + }, + { + "epoch": 0.8628677373958497, + "grad_norm": 0.4275446832180023, + "learning_rate": 4.847141618634898e-06, + "loss": 0.9749, + "step": 9657 + }, + { + "epoch": 0.8629570889271115, + "grad_norm": 0.4982154369354248, + "learning_rate": 4.8409280419691176e-06, + "loss": 0.9995, + "step": 9658 + }, + { + "epoch": 0.8630464404583733, + "grad_norm": 0.47910749912261963, + "learning_rate": 4.834718247856978e-06, + "loss": 0.944, + "step": 9659 + }, + { + "epoch": 0.8631357919896352, + "grad_norm": 0.633050799369812, + "learning_rate": 4.828512236818611e-06, + "loss": 0.863, + "step": 9660 + }, + { + "epoch": 0.8632251435208971, + "grad_norm": 0.6047154068946838, + "learning_rate": 4.822310009373832e-06, + "loss": 0.91, + "step": 9661 + }, + { + "epoch": 0.863314495052159, + "grad_norm": 0.49586907029151917, + "learning_rate": 4.816111566042153e-06, + "loss": 0.9588, + "step": 9662 + }, + { + "epoch": 0.8634038465834208, + "grad_norm": 0.5318160057067871, + "learning_rate": 4.809916907342754e-06, + "loss": 0.8139, + "step": 9663 + }, + { + "epoch": 0.8634931981146827, + "grad_norm": 0.5686520338058472, + "learning_rate": 4.803726033794509e-06, + "loss": 0.9418, + "step": 9664 + }, + { + "epoch": 0.8635825496459446, + "grad_norm": 0.5216206908226013, + "learning_rate": 4.797538945915958e-06, + "loss": 0.9086, + "step": 9665 + }, + { + "epoch": 0.8636719011772064, + "grad_norm": 0.4904700815677643, + "learning_rate": 4.791355644225354e-06, + "loss": 0.93, + "step": 9666 + }, + { + "epoch": 0.8637612527084683, + "grad_norm": 0.4535919725894928, + "learning_rate": 4.7851761292405985e-06, + "loss": 0.924, + "step": 9667 + }, + { + "epoch": 0.8638506042397301, + "grad_norm": 0.5180773138999939, + "learning_rate": 4.7790004014793e-06, + "loss": 0.9589, + "step": 9668 + }, + { + "epoch": 0.863939955770992, + "grad_norm": 0.4436119496822357, + "learning_rate": 4.772828461458745e-06, + "loss": 0.9582, + "step": 9669 + }, + { + "epoch": 0.8640293073022539, + "grad_norm": 0.41180846095085144, + "learning_rate": 4.766660309695886e-06, + "loss": 0.9333, + "step": 9670 + }, + { + "epoch": 0.8641186588335158, + "grad_norm": 0.4437258243560791, + "learning_rate": 4.7604959467073774e-06, + "loss": 0.9341, + "step": 9671 + }, + { + "epoch": 0.8642080103647777, + "grad_norm": 0.5144766569137573, + "learning_rate": 4.754335373009555e-06, + "loss": 0.9834, + "step": 9672 + }, + { + "epoch": 0.8642973618960395, + "grad_norm": 0.43276622891426086, + "learning_rate": 4.7481785891184295e-06, + "loss": 0.9718, + "step": 9673 + }, + { + "epoch": 0.8643867134273013, + "grad_norm": 0.5187088847160339, + "learning_rate": 4.7420255955496925e-06, + "loss": 0.9335, + "step": 9674 + }, + { + "epoch": 0.8644760649585632, + "grad_norm": 0.5164356827735901, + "learning_rate": 4.735876392818727e-06, + "loss": 0.9012, + "step": 9675 + }, + { + "epoch": 0.8645654164898251, + "grad_norm": 0.49884214997291565, + "learning_rate": 4.729730981440611e-06, + "loss": 0.9389, + "step": 9676 + }, + { + "epoch": 0.864654768021087, + "grad_norm": 0.5525695085525513, + "learning_rate": 4.723589361930053e-06, + "loss": 0.9312, + "step": 9677 + }, + { + "epoch": 0.8647441195523489, + "grad_norm": 0.4606095850467682, + "learning_rate": 4.717451534801504e-06, + "loss": 0.9609, + "step": 9678 + }, + { + "epoch": 0.8648334710836107, + "grad_norm": 0.513602077960968, + "learning_rate": 4.711317500569068e-06, + "loss": 0.9551, + "step": 9679 + }, + { + "epoch": 0.8649228226148725, + "grad_norm": 0.4917820990085602, + "learning_rate": 4.70518725974653e-06, + "loss": 0.878, + "step": 9680 + }, + { + "epoch": 0.8650121741461344, + "grad_norm": 0.49893245100975037, + "learning_rate": 4.699060812847378e-06, + "loss": 0.9257, + "step": 9681 + }, + { + "epoch": 0.8651015256773963, + "grad_norm": 0.5214920043945312, + "learning_rate": 4.692938160384747e-06, + "loss": 0.8962, + "step": 9682 + }, + { + "epoch": 0.8651908772086582, + "grad_norm": 0.5985978841781616, + "learning_rate": 4.686819302871481e-06, + "loss": 0.8617, + "step": 9683 + }, + { + "epoch": 0.86528022873992, + "grad_norm": 0.47699883580207825, + "learning_rate": 4.6807042408201e-06, + "loss": 0.9344, + "step": 9684 + }, + { + "epoch": 0.8653695802711819, + "grad_norm": 0.4804041087627411, + "learning_rate": 4.674592974742814e-06, + "loss": 0.9556, + "step": 9685 + }, + { + "epoch": 0.8654589318024437, + "grad_norm": 0.5328719019889832, + "learning_rate": 4.668485505151498e-06, + "loss": 0.9159, + "step": 9686 + }, + { + "epoch": 0.8655482833337056, + "grad_norm": 0.5171124935150146, + "learning_rate": 4.662381832557722e-06, + "loss": 0.9119, + "step": 9687 + }, + { + "epoch": 0.8656376348649675, + "grad_norm": 0.44296491146087646, + "learning_rate": 4.65628195747273e-06, + "loss": 0.9932, + "step": 9688 + }, + { + "epoch": 0.8657269863962294, + "grad_norm": 0.5319826602935791, + "learning_rate": 4.65018588040746e-06, + "loss": 0.9008, + "step": 9689 + }, + { + "epoch": 0.8658163379274912, + "grad_norm": 0.5790243744850159, + "learning_rate": 4.644093601872513e-06, + "loss": 0.8628, + "step": 9690 + }, + { + "epoch": 0.8659056894587531, + "grad_norm": 0.46617162227630615, + "learning_rate": 4.638005122378181e-06, + "loss": 0.9073, + "step": 9691 + }, + { + "epoch": 0.865995040990015, + "grad_norm": 0.4653133451938629, + "learning_rate": 4.631920442434446e-06, + "loss": 0.9665, + "step": 9692 + }, + { + "epoch": 0.8660843925212768, + "grad_norm": 0.4677208662033081, + "learning_rate": 4.625839562550965e-06, + "loss": 0.8954, + "step": 9693 + }, + { + "epoch": 0.8661737440525387, + "grad_norm": 0.5210303664207458, + "learning_rate": 4.619762483237072e-06, + "loss": 0.9758, + "step": 9694 + }, + { + "epoch": 0.8662630955838005, + "grad_norm": 0.5137758255004883, + "learning_rate": 4.613689205001792e-06, + "loss": 0.8698, + "step": 9695 + }, + { + "epoch": 0.8663524471150624, + "grad_norm": 0.45305004715919495, + "learning_rate": 4.607619728353818e-06, + "loss": 0.9656, + "step": 9696 + }, + { + "epoch": 0.8664417986463243, + "grad_norm": 0.5791176557540894, + "learning_rate": 4.601554053801549e-06, + "loss": 0.8502, + "step": 9697 + }, + { + "epoch": 0.8665311501775862, + "grad_norm": 0.40173932909965515, + "learning_rate": 4.595492181853034e-06, + "loss": 0.9574, + "step": 9698 + }, + { + "epoch": 0.8666205017088481, + "grad_norm": 0.4410465955734253, + "learning_rate": 4.589434113016039e-06, + "loss": 0.9465, + "step": 9699 + }, + { + "epoch": 0.8667098532401099, + "grad_norm": 0.49588507413864136, + "learning_rate": 4.583379847797964e-06, + "loss": 0.9517, + "step": 9700 + }, + { + "epoch": 0.8667992047713717, + "grad_norm": 0.46362167596817017, + "learning_rate": 4.577329386705942e-06, + "loss": 0.9281, + "step": 9701 + }, + { + "epoch": 0.8668885563026336, + "grad_norm": 0.5212817192077637, + "learning_rate": 4.571282730246745e-06, + "loss": 0.9243, + "step": 9702 + }, + { + "epoch": 0.8669779078338955, + "grad_norm": 0.45512816309928894, + "learning_rate": 4.565239878926863e-06, + "loss": 0.9509, + "step": 9703 + }, + { + "epoch": 0.8670672593651574, + "grad_norm": 0.5159767866134644, + "learning_rate": 4.5592008332524364e-06, + "loss": 0.8746, + "step": 9704 + }, + { + "epoch": 0.8671566108964193, + "grad_norm": 0.4418531656265259, + "learning_rate": 4.553165593729303e-06, + "loss": 1.0525, + "step": 9705 + }, + { + "epoch": 0.8672459624276811, + "grad_norm": 0.5434200763702393, + "learning_rate": 4.547134160862981e-06, + "loss": 0.8959, + "step": 9706 + }, + { + "epoch": 0.8673353139589429, + "grad_norm": 0.4653039276599884, + "learning_rate": 4.541106535158668e-06, + "loss": 0.9446, + "step": 9707 + }, + { + "epoch": 0.8674246654902048, + "grad_norm": 0.46755141019821167, + "learning_rate": 4.5350827171212366e-06, + "loss": 0.8746, + "step": 9708 + }, + { + "epoch": 0.8675140170214667, + "grad_norm": 0.5309289693832397, + "learning_rate": 4.529062707255261e-06, + "loss": 0.9246, + "step": 9709 + }, + { + "epoch": 0.8676033685527286, + "grad_norm": 0.5540649890899658, + "learning_rate": 4.5230465060649595e-06, + "loss": 0.8565, + "step": 9710 + }, + { + "epoch": 0.8676927200839905, + "grad_norm": 0.5240215063095093, + "learning_rate": 4.517034114054258e-06, + "loss": 0.9513, + "step": 9711 + }, + { + "epoch": 0.8677820716152523, + "grad_norm": 0.4945807158946991, + "learning_rate": 4.5110255317267704e-06, + "loss": 1.0085, + "step": 9712 + }, + { + "epoch": 0.8678714231465142, + "grad_norm": 0.446004182100296, + "learning_rate": 4.505020759585765e-06, + "loss": 0.9658, + "step": 9713 + }, + { + "epoch": 0.867960774677776, + "grad_norm": 0.4994924068450928, + "learning_rate": 4.499019798134224e-06, + "loss": 0.9725, + "step": 9714 + }, + { + "epoch": 0.8680501262090379, + "grad_norm": 0.42142254114151, + "learning_rate": 4.493022647874773e-06, + "loss": 0.9469, + "step": 9715 + }, + { + "epoch": 0.8681394777402998, + "grad_norm": 0.49049112200737, + "learning_rate": 4.487029309309748e-06, + "loss": 0.9365, + "step": 9716 + }, + { + "epoch": 0.8682288292715616, + "grad_norm": 0.5305113792419434, + "learning_rate": 4.481039782941143e-06, + "loss": 0.9549, + "step": 9717 + }, + { + "epoch": 0.8683181808028235, + "grad_norm": 0.5698569416999817, + "learning_rate": 4.4750540692706625e-06, + "loss": 0.9227, + "step": 9718 + }, + { + "epoch": 0.8684075323340854, + "grad_norm": 0.5504754185676575, + "learning_rate": 4.469072168799659e-06, + "loss": 1.002, + "step": 9719 + }, + { + "epoch": 0.8684968838653473, + "grad_norm": 0.4171115756034851, + "learning_rate": 4.4630940820291955e-06, + "loss": 0.9375, + "step": 9720 + }, + { + "epoch": 0.8685862353966091, + "grad_norm": 0.5085521340370178, + "learning_rate": 4.4571198094599866e-06, + "loss": 0.8873, + "step": 9721 + }, + { + "epoch": 0.868675586927871, + "grad_norm": 0.3779529929161072, + "learning_rate": 4.451149351592437e-06, + "loss": 1.0097, + "step": 9722 + }, + { + "epoch": 0.8687649384591328, + "grad_norm": 0.4312131702899933, + "learning_rate": 4.445182708926654e-06, + "loss": 0.9988, + "step": 9723 + }, + { + "epoch": 0.8688542899903947, + "grad_norm": 0.578957200050354, + "learning_rate": 4.439219881962392e-06, + "loss": 0.8544, + "step": 9724 + }, + { + "epoch": 0.8689436415216566, + "grad_norm": 0.47734835743904114, + "learning_rate": 4.433260871199113e-06, + "loss": 0.9883, + "step": 9725 + }, + { + "epoch": 0.8690329930529185, + "grad_norm": 0.5002614259719849, + "learning_rate": 4.42730567713594e-06, + "loss": 0.9303, + "step": 9726 + }, + { + "epoch": 0.8691223445841804, + "grad_norm": 0.4868052303791046, + "learning_rate": 4.4213543002716904e-06, + "loss": 0.9285, + "step": 9727 + }, + { + "epoch": 0.8692116961154421, + "grad_norm": 0.4923020005226135, + "learning_rate": 4.415406741104844e-06, + "loss": 0.971, + "step": 9728 + }, + { + "epoch": 0.869301047646704, + "grad_norm": 0.4633644223213196, + "learning_rate": 4.409463000133584e-06, + "loss": 0.9052, + "step": 9729 + }, + { + "epoch": 0.8693903991779659, + "grad_norm": 0.5671141147613525, + "learning_rate": 4.40352307785577e-06, + "loss": 0.9258, + "step": 9730 + }, + { + "epoch": 0.8694797507092278, + "grad_norm": 0.40942826867103577, + "learning_rate": 4.397586974768908e-06, + "loss": 0.9291, + "step": 9731 + }, + { + "epoch": 0.8695691022404897, + "grad_norm": 0.38688722252845764, + "learning_rate": 4.391654691370229e-06, + "loss": 0.9367, + "step": 9732 + }, + { + "epoch": 0.8696584537717515, + "grad_norm": 0.43114298582077026, + "learning_rate": 4.385726228156617e-06, + "loss": 0.9637, + "step": 9733 + }, + { + "epoch": 0.8697478053030134, + "grad_norm": 0.4530818462371826, + "learning_rate": 4.379801585624643e-06, + "loss": 0.9887, + "step": 9734 + }, + { + "epoch": 0.8698371568342752, + "grad_norm": 0.582784116268158, + "learning_rate": 4.373880764270566e-06, + "loss": 0.9341, + "step": 9735 + }, + { + "epoch": 0.8699265083655371, + "grad_norm": 0.5109930634498596, + "learning_rate": 4.367963764590311e-06, + "loss": 0.9377, + "step": 9736 + }, + { + "epoch": 0.870015859896799, + "grad_norm": 0.5649825930595398, + "learning_rate": 4.362050587079497e-06, + "loss": 0.9036, + "step": 9737 + }, + { + "epoch": 0.8701052114280609, + "grad_norm": 0.6133013963699341, + "learning_rate": 4.356141232233413e-06, + "loss": 0.8088, + "step": 9738 + }, + { + "epoch": 0.8701945629593227, + "grad_norm": 0.6256003379821777, + "learning_rate": 4.350235700547028e-06, + "loss": 0.8784, + "step": 9739 + }, + { + "epoch": 0.8702839144905846, + "grad_norm": 0.5290203094482422, + "learning_rate": 4.344333992515004e-06, + "loss": 0.8624, + "step": 9740 + }, + { + "epoch": 0.8703732660218465, + "grad_norm": 0.44871601462364197, + "learning_rate": 4.338436108631649e-06, + "loss": 0.9352, + "step": 9741 + }, + { + "epoch": 0.8704626175531083, + "grad_norm": 0.6028452515602112, + "learning_rate": 4.332542049390992e-06, + "loss": 0.8648, + "step": 9742 + }, + { + "epoch": 0.8705519690843702, + "grad_norm": 0.4009053409099579, + "learning_rate": 4.3266518152867145e-06, + "loss": 0.9586, + "step": 9743 + }, + { + "epoch": 0.870641320615632, + "grad_norm": 0.5013288259506226, + "learning_rate": 4.320765406812194e-06, + "loss": 0.8975, + "step": 9744 + }, + { + "epoch": 0.8707306721468939, + "grad_norm": 0.5538919568061829, + "learning_rate": 4.314882824460475e-06, + "loss": 1.09, + "step": 9745 + }, + { + "epoch": 0.8708200236781558, + "grad_norm": 0.4379560947418213, + "learning_rate": 4.309004068724298e-06, + "loss": 0.9737, + "step": 9746 + }, + { + "epoch": 0.8709093752094177, + "grad_norm": 0.47535446286201477, + "learning_rate": 4.303129140096052e-06, + "loss": 0.9569, + "step": 9747 + }, + { + "epoch": 0.8709987267406795, + "grad_norm": 0.5819507837295532, + "learning_rate": 4.297258039067831e-06, + "loss": 0.8848, + "step": 9748 + }, + { + "epoch": 0.8710880782719413, + "grad_norm": 0.4755156934261322, + "learning_rate": 4.291390766131409e-06, + "loss": 0.8717, + "step": 9749 + }, + { + "epoch": 0.8711774298032032, + "grad_norm": 0.605339527130127, + "learning_rate": 4.285527321778232e-06, + "loss": 0.9916, + "step": 9750 + }, + { + "epoch": 0.8712667813344651, + "grad_norm": 0.6067926287651062, + "learning_rate": 4.279667706499424e-06, + "loss": 0.9132, + "step": 9751 + }, + { + "epoch": 0.871356132865727, + "grad_norm": 0.5548110604286194, + "learning_rate": 4.273811920785786e-06, + "loss": 0.8592, + "step": 9752 + }, + { + "epoch": 0.8714454843969889, + "grad_norm": 0.533915638923645, + "learning_rate": 4.2679599651278045e-06, + "loss": 0.9079, + "step": 9753 + }, + { + "epoch": 0.8715348359282508, + "grad_norm": 0.5274013876914978, + "learning_rate": 4.262111840015642e-06, + "loss": 0.9541, + "step": 9754 + }, + { + "epoch": 0.8716241874595125, + "grad_norm": 0.4624837636947632, + "learning_rate": 4.256267545939147e-06, + "loss": 0.8876, + "step": 9755 + }, + { + "epoch": 0.8717135389907744, + "grad_norm": 0.5232068300247192, + "learning_rate": 4.250427083387837e-06, + "loss": 0.9834, + "step": 9756 + }, + { + "epoch": 0.8718028905220363, + "grad_norm": 0.666278600692749, + "learning_rate": 4.244590452850916e-06, + "loss": 0.8801, + "step": 9757 + }, + { + "epoch": 0.8718922420532982, + "grad_norm": 0.49066150188446045, + "learning_rate": 4.2387576548172605e-06, + "loss": 0.907, + "step": 9758 + }, + { + "epoch": 0.8719815935845601, + "grad_norm": 0.599941074848175, + "learning_rate": 4.232928689775428e-06, + "loss": 0.9306, + "step": 9759 + }, + { + "epoch": 0.872070945115822, + "grad_norm": 0.5583867430686951, + "learning_rate": 4.227103558213674e-06, + "loss": 0.8552, + "step": 9760 + }, + { + "epoch": 0.8721602966470838, + "grad_norm": 0.5444775819778442, + "learning_rate": 4.221282260619891e-06, + "loss": 0.9385, + "step": 9761 + }, + { + "epoch": 0.8722496481783456, + "grad_norm": 0.5456721782684326, + "learning_rate": 4.215464797481683e-06, + "loss": 0.8826, + "step": 9762 + }, + { + "epoch": 0.8723389997096075, + "grad_norm": 0.5117126107215881, + "learning_rate": 4.2096511692863275e-06, + "loss": 0.9481, + "step": 9763 + }, + { + "epoch": 0.8724283512408694, + "grad_norm": 0.5069064497947693, + "learning_rate": 4.203841376520773e-06, + "loss": 0.9362, + "step": 9764 + }, + { + "epoch": 0.8725177027721313, + "grad_norm": 0.5353370904922485, + "learning_rate": 4.198035419671658e-06, + "loss": 0.8717, + "step": 9765 + }, + { + "epoch": 0.8726070543033931, + "grad_norm": 0.4016374945640564, + "learning_rate": 4.19223329922529e-06, + "loss": 0.8964, + "step": 9766 + }, + { + "epoch": 0.872696405834655, + "grad_norm": 0.4020116627216339, + "learning_rate": 4.186435015667661e-06, + "loss": 1.0015, + "step": 9767 + }, + { + "epoch": 0.8727857573659169, + "grad_norm": 0.6577222347259521, + "learning_rate": 4.180640569484434e-06, + "loss": 0.9153, + "step": 9768 + }, + { + "epoch": 0.8728751088971787, + "grad_norm": 0.5189597010612488, + "learning_rate": 4.17484996116096e-06, + "loss": 0.903, + "step": 9769 + }, + { + "epoch": 0.8729644604284406, + "grad_norm": 0.44712257385253906, + "learning_rate": 4.169063191182271e-06, + "loss": 0.9051, + "step": 9770 + }, + { + "epoch": 0.8730538119597024, + "grad_norm": 0.497765451669693, + "learning_rate": 4.163280260033053e-06, + "loss": 0.9542, + "step": 9771 + }, + { + "epoch": 0.8731431634909643, + "grad_norm": 0.4439071714878082, + "learning_rate": 4.157501168197703e-06, + "loss": 0.9827, + "step": 9772 + }, + { + "epoch": 0.8732325150222262, + "grad_norm": 0.4466901421546936, + "learning_rate": 4.1517259161602705e-06, + "loss": 0.939, + "step": 9773 + }, + { + "epoch": 0.8733218665534881, + "grad_norm": 0.5371690988540649, + "learning_rate": 4.145954504404498e-06, + "loss": 0.9615, + "step": 9774 + }, + { + "epoch": 0.87341121808475, + "grad_norm": 0.4514279067516327, + "learning_rate": 4.140186933413809e-06, + "loss": 0.9192, + "step": 9775 + }, + { + "epoch": 0.8735005696160117, + "grad_norm": 0.47442543506622314, + "learning_rate": 4.134423203671295e-06, + "loss": 0.9659, + "step": 9776 + }, + { + "epoch": 0.8735899211472736, + "grad_norm": 0.532922625541687, + "learning_rate": 4.128663315659725e-06, + "loss": 0.9492, + "step": 9777 + }, + { + "epoch": 0.8736792726785355, + "grad_norm": 0.6389763355255127, + "learning_rate": 4.122907269861559e-06, + "loss": 0.8919, + "step": 9778 + }, + { + "epoch": 0.8737686242097974, + "grad_norm": 0.4954306483268738, + "learning_rate": 4.117155066758938e-06, + "loss": 1.0198, + "step": 9779 + }, + { + "epoch": 0.8738579757410593, + "grad_norm": 0.46348780393600464, + "learning_rate": 4.111406706833637e-06, + "loss": 0.9218, + "step": 9780 + }, + { + "epoch": 0.8739473272723212, + "grad_norm": 0.47658228874206543, + "learning_rate": 4.105662190567166e-06, + "loss": 0.8703, + "step": 9781 + }, + { + "epoch": 0.874036678803583, + "grad_norm": 0.5489630103111267, + "learning_rate": 4.099921518440686e-06, + "loss": 0.9327, + "step": 9782 + }, + { + "epoch": 0.8741260303348448, + "grad_norm": 0.5429866909980774, + "learning_rate": 4.094184690935038e-06, + "loss": 0.8676, + "step": 9783 + }, + { + "epoch": 0.8742153818661067, + "grad_norm": 0.5859069228172302, + "learning_rate": 4.088451708530755e-06, + "loss": 0.8897, + "step": 9784 + }, + { + "epoch": 0.8743047333973686, + "grad_norm": 0.5639720559120178, + "learning_rate": 4.082722571708008e-06, + "loss": 0.9152, + "step": 9785 + }, + { + "epoch": 0.8743940849286305, + "grad_norm": 0.4945251941680908, + "learning_rate": 4.076997280946693e-06, + "loss": 0.9027, + "step": 9786 + }, + { + "epoch": 0.8744834364598923, + "grad_norm": 0.49459564685821533, + "learning_rate": 4.071275836726357e-06, + "loss": 0.8952, + "step": 9787 + }, + { + "epoch": 0.8745727879911542, + "grad_norm": 0.5829160213470459, + "learning_rate": 4.065558239526241e-06, + "loss": 0.9146, + "step": 9788 + }, + { + "epoch": 0.8746621395224161, + "grad_norm": 0.472232848405838, + "learning_rate": 4.059844489825243e-06, + "loss": 0.9677, + "step": 9789 + }, + { + "epoch": 0.8747514910536779, + "grad_norm": 0.4521077275276184, + "learning_rate": 4.054134588101965e-06, + "loss": 0.9456, + "step": 9790 + }, + { + "epoch": 0.8748408425849398, + "grad_norm": 0.5067082047462463, + "learning_rate": 4.048428534834653e-06, + "loss": 0.9167, + "step": 9791 + }, + { + "epoch": 0.8749301941162017, + "grad_norm": 0.6378511786460876, + "learning_rate": 4.042726330501262e-06, + "loss": 0.8104, + "step": 9792 + }, + { + "epoch": 0.8750195456474635, + "grad_norm": 0.483568400144577, + "learning_rate": 4.03702797557941e-06, + "loss": 0.9183, + "step": 9793 + }, + { + "epoch": 0.8751088971787254, + "grad_norm": 0.47959351539611816, + "learning_rate": 4.031333470546394e-06, + "loss": 0.9048, + "step": 9794 + }, + { + "epoch": 0.8751982487099873, + "grad_norm": 0.5103621482849121, + "learning_rate": 4.025642815879188e-06, + "loss": 0.8854, + "step": 9795 + }, + { + "epoch": 0.8752876002412492, + "grad_norm": 0.4524891674518585, + "learning_rate": 4.019956012054455e-06, + "loss": 1.0298, + "step": 9796 + }, + { + "epoch": 0.875376951772511, + "grad_norm": 0.4642522633075714, + "learning_rate": 4.014273059548512e-06, + "loss": 0.9861, + "step": 9797 + }, + { + "epoch": 0.8754663033037728, + "grad_norm": 0.456893652677536, + "learning_rate": 4.0085939588373754e-06, + "loss": 0.9152, + "step": 9798 + }, + { + "epoch": 0.8755556548350347, + "grad_norm": 0.491441547870636, + "learning_rate": 4.0029187103967245e-06, + "loss": 0.9677, + "step": 9799 + }, + { + "epoch": 0.8756450063662966, + "grad_norm": 0.4507903754711151, + "learning_rate": 3.997247314701935e-06, + "loss": 0.9474, + "step": 9800 + }, + { + "epoch": 0.8757343578975585, + "grad_norm": 0.6348841190338135, + "learning_rate": 3.991579772228032e-06, + "loss": 0.9522, + "step": 9801 + }, + { + "epoch": 0.8758237094288204, + "grad_norm": 0.40709996223449707, + "learning_rate": 3.985916083449737e-06, + "loss": 0.9453, + "step": 9802 + }, + { + "epoch": 0.8759130609600823, + "grad_norm": 0.46370193362236023, + "learning_rate": 3.980256248841441e-06, + "loss": 0.9726, + "step": 9803 + }, + { + "epoch": 0.876002412491344, + "grad_norm": 0.3987545073032379, + "learning_rate": 3.974600268877221e-06, + "loss": 0.9591, + "step": 9804 + }, + { + "epoch": 0.8760917640226059, + "grad_norm": 0.5190705060958862, + "learning_rate": 3.9689481440308265e-06, + "loss": 0.9205, + "step": 9805 + }, + { + "epoch": 0.8761811155538678, + "grad_norm": 0.4716487228870392, + "learning_rate": 3.963299874775678e-06, + "loss": 0.9981, + "step": 9806 + }, + { + "epoch": 0.8762704670851297, + "grad_norm": 0.4962429702281952, + "learning_rate": 3.957655461584881e-06, + "loss": 0.897, + "step": 9807 + }, + { + "epoch": 0.8763598186163916, + "grad_norm": 0.6615248322486877, + "learning_rate": 3.952014904931217e-06, + "loss": 0.8475, + "step": 9808 + }, + { + "epoch": 0.8764491701476534, + "grad_norm": 0.47163599729537964, + "learning_rate": 3.946378205287138e-06, + "loss": 0.9074, + "step": 9809 + }, + { + "epoch": 0.8765385216789153, + "grad_norm": 0.6437599658966064, + "learning_rate": 3.940745363124787e-06, + "loss": 0.8462, + "step": 9810 + }, + { + "epoch": 0.8766278732101771, + "grad_norm": 0.5201234221458435, + "learning_rate": 3.935116378915971e-06, + "loss": 0.9933, + "step": 9811 + }, + { + "epoch": 0.876717224741439, + "grad_norm": 0.47189584374427795, + "learning_rate": 3.929491253132167e-06, + "loss": 0.9501, + "step": 9812 + }, + { + "epoch": 0.8768065762727009, + "grad_norm": 0.41938087344169617, + "learning_rate": 3.923869986244549e-06, + "loss": 0.9382, + "step": 9813 + }, + { + "epoch": 0.8768959278039627, + "grad_norm": 0.42869406938552856, + "learning_rate": 3.918252578723952e-06, + "loss": 0.9126, + "step": 9814 + }, + { + "epoch": 0.8769852793352246, + "grad_norm": 0.4569510221481323, + "learning_rate": 3.912639031040899e-06, + "loss": 0.9354, + "step": 9815 + }, + { + "epoch": 0.8770746308664865, + "grad_norm": 0.46077507734298706, + "learning_rate": 3.907029343665586e-06, + "loss": 0.9599, + "step": 9816 + }, + { + "epoch": 0.8771639823977483, + "grad_norm": 0.5191541314125061, + "learning_rate": 3.901423517067887e-06, + "loss": 0.8914, + "step": 9817 + }, + { + "epoch": 0.8772533339290102, + "grad_norm": 0.48782411217689514, + "learning_rate": 3.895821551717338e-06, + "loss": 0.8774, + "step": 9818 + }, + { + "epoch": 0.877342685460272, + "grad_norm": 0.42482325434684753, + "learning_rate": 3.890223448083163e-06, + "loss": 0.9598, + "step": 9819 + }, + { + "epoch": 0.8774320369915339, + "grad_norm": 0.3945348858833313, + "learning_rate": 3.884629206634277e-06, + "loss": 0.9229, + "step": 9820 + }, + { + "epoch": 0.8775213885227958, + "grad_norm": 0.41309240460395813, + "learning_rate": 3.879038827839255e-06, + "loss": 0.975, + "step": 9821 + }, + { + "epoch": 0.8776107400540577, + "grad_norm": 0.5235944390296936, + "learning_rate": 3.873452312166337e-06, + "loss": 0.8413, + "step": 9822 + }, + { + "epoch": 0.8777000915853196, + "grad_norm": 0.4890744388103485, + "learning_rate": 3.867869660083456e-06, + "loss": 0.8615, + "step": 9823 + }, + { + "epoch": 0.8777894431165814, + "grad_norm": 0.46650564670562744, + "learning_rate": 3.862290872058233e-06, + "loss": 0.9691, + "step": 9824 + }, + { + "epoch": 0.8778787946478432, + "grad_norm": 0.46951133012771606, + "learning_rate": 3.856715948557938e-06, + "loss": 0.9661, + "step": 9825 + }, + { + "epoch": 0.8779681461791051, + "grad_norm": 0.5398757457733154, + "learning_rate": 3.851144890049535e-06, + "loss": 0.9167, + "step": 9826 + }, + { + "epoch": 0.878057497710367, + "grad_norm": 0.49195584654808044, + "learning_rate": 3.845577696999659e-06, + "loss": 0.9914, + "step": 9827 + }, + { + "epoch": 0.8781468492416289, + "grad_norm": 0.5577439665794373, + "learning_rate": 3.840014369874617e-06, + "loss": 0.856, + "step": 9828 + }, + { + "epoch": 0.8782362007728908, + "grad_norm": 0.5505506992340088, + "learning_rate": 3.834454909140406e-06, + "loss": 0.8946, + "step": 9829 + }, + { + "epoch": 0.8783255523041527, + "grad_norm": 0.4931388795375824, + "learning_rate": 3.828899315262685e-06, + "loss": 0.9288, + "step": 9830 + }, + { + "epoch": 0.8784149038354144, + "grad_norm": 0.6808313131332397, + "learning_rate": 3.823347588706805e-06, + "loss": 0.9366, + "step": 9831 + }, + { + "epoch": 0.8785042553666763, + "grad_norm": 0.6877908706665039, + "learning_rate": 3.81779972993776e-06, + "loss": 0.8903, + "step": 9832 + }, + { + "epoch": 0.8785936068979382, + "grad_norm": 0.5493754744529724, + "learning_rate": 3.812255739420256e-06, + "loss": 0.9696, + "step": 9833 + }, + { + "epoch": 0.8786829584292001, + "grad_norm": 0.48021358251571655, + "learning_rate": 3.8067156176186616e-06, + "loss": 0.9312, + "step": 9834 + }, + { + "epoch": 0.878772309960462, + "grad_norm": 0.46737149357795715, + "learning_rate": 3.8011793649970207e-06, + "loss": 0.9433, + "step": 9835 + }, + { + "epoch": 0.8788616614917238, + "grad_norm": 0.4425142705440521, + "learning_rate": 3.7956469820190465e-06, + "loss": 0.9032, + "step": 9836 + }, + { + "epoch": 0.8789510130229857, + "grad_norm": 0.49997785687446594, + "learning_rate": 3.790118469148146e-06, + "loss": 0.8658, + "step": 9837 + }, + { + "epoch": 0.8790403645542475, + "grad_norm": 0.47397980093955994, + "learning_rate": 3.7845938268473823e-06, + "loss": 0.9049, + "step": 9838 + }, + { + "epoch": 0.8791297160855094, + "grad_norm": 0.5154423713684082, + "learning_rate": 3.7790730555795075e-06, + "loss": 0.956, + "step": 9839 + }, + { + "epoch": 0.8792190676167713, + "grad_norm": 0.5313198566436768, + "learning_rate": 3.7735561558069455e-06, + "loss": 0.915, + "step": 9840 + }, + { + "epoch": 0.8793084191480331, + "grad_norm": 0.499161034822464, + "learning_rate": 3.7680431279917994e-06, + "loss": 0.8453, + "step": 9841 + }, + { + "epoch": 0.879397770679295, + "grad_norm": 0.4321475028991699, + "learning_rate": 3.762533972595833e-06, + "loss": 0.9776, + "step": 9842 + }, + { + "epoch": 0.8794871222105569, + "grad_norm": 0.52239590883255, + "learning_rate": 3.7570286900804998e-06, + "loss": 0.9106, + "step": 9843 + }, + { + "epoch": 0.8795764737418188, + "grad_norm": 0.5335208177566528, + "learning_rate": 3.7515272809069303e-06, + "loss": 0.9377, + "step": 9844 + }, + { + "epoch": 0.8796658252730806, + "grad_norm": 0.5441358089447021, + "learning_rate": 3.746029745535923e-06, + "loss": 0.8779, + "step": 9845 + }, + { + "epoch": 0.8797551768043425, + "grad_norm": 0.5699974894523621, + "learning_rate": 3.7405360844279537e-06, + "loss": 0.8933, + "step": 9846 + }, + { + "epoch": 0.8798445283356043, + "grad_norm": 0.42555707693099976, + "learning_rate": 3.735046298043182e-06, + "loss": 0.9373, + "step": 9847 + }, + { + "epoch": 0.8799338798668662, + "grad_norm": 0.5089072585105896, + "learning_rate": 3.7295603868414297e-06, + "loss": 0.863, + "step": 9848 + }, + { + "epoch": 0.8800232313981281, + "grad_norm": 0.45135679841041565, + "learning_rate": 3.724078351282212e-06, + "loss": 0.9436, + "step": 9849 + }, + { + "epoch": 0.88011258292939, + "grad_norm": 0.5915647745132446, + "learning_rate": 3.7186001918246893e-06, + "loss": 0.8365, + "step": 9850 + }, + { + "epoch": 0.8802019344606519, + "grad_norm": 0.49184906482696533, + "learning_rate": 3.713125908927728e-06, + "loss": 0.8851, + "step": 9851 + }, + { + "epoch": 0.8802912859919136, + "grad_norm": 0.4521387815475464, + "learning_rate": 3.7076555030498506e-06, + "loss": 0.8955, + "step": 9852 + }, + { + "epoch": 0.8803806375231755, + "grad_norm": 0.43427813053131104, + "learning_rate": 3.7021889746492676e-06, + "loss": 0.9594, + "step": 9853 + }, + { + "epoch": 0.8804699890544374, + "grad_norm": 0.4693211019039154, + "learning_rate": 3.6967263241838636e-06, + "loss": 0.8923, + "step": 9854 + }, + { + "epoch": 0.8805593405856993, + "grad_norm": 0.4870004653930664, + "learning_rate": 3.691267552111183e-06, + "loss": 0.9082, + "step": 9855 + }, + { + "epoch": 0.8806486921169612, + "grad_norm": 0.4808718264102936, + "learning_rate": 3.6858126588884544e-06, + "loss": 0.9936, + "step": 9856 + }, + { + "epoch": 0.8807380436482231, + "grad_norm": 0.602287232875824, + "learning_rate": 3.6803616449725964e-06, + "loss": 0.8709, + "step": 9857 + }, + { + "epoch": 0.8808273951794849, + "grad_norm": 0.549919843673706, + "learning_rate": 3.6749145108201766e-06, + "loss": 0.9254, + "step": 9858 + }, + { + "epoch": 0.8809167467107467, + "grad_norm": 0.6031754612922668, + "learning_rate": 3.6694712568874577e-06, + "loss": 0.8894, + "step": 9859 + }, + { + "epoch": 0.8810060982420086, + "grad_norm": 0.6014052033424377, + "learning_rate": 3.66403188363037e-06, + "loss": 0.9024, + "step": 9860 + }, + { + "epoch": 0.8810954497732705, + "grad_norm": 0.4804718792438507, + "learning_rate": 3.6585963915045264e-06, + "loss": 0.9533, + "step": 9861 + }, + { + "epoch": 0.8811848013045324, + "grad_norm": 0.5219516754150391, + "learning_rate": 3.6531647809651904e-06, + "loss": 0.9346, + "step": 9862 + }, + { + "epoch": 0.8812741528357942, + "grad_norm": 0.4467054307460785, + "learning_rate": 3.647737052467326e-06, + "loss": 0.969, + "step": 9863 + }, + { + "epoch": 0.8813635043670561, + "grad_norm": 0.4762718081474304, + "learning_rate": 3.642313206465564e-06, + "loss": 0.9917, + "step": 9864 + }, + { + "epoch": 0.881452855898318, + "grad_norm": 0.5307314395904541, + "learning_rate": 3.6368932434142076e-06, + "loss": 0.9542, + "step": 9865 + }, + { + "epoch": 0.8815422074295798, + "grad_norm": 0.38814452290534973, + "learning_rate": 3.631477163767233e-06, + "loss": 0.9693, + "step": 9866 + }, + { + "epoch": 0.8816315589608417, + "grad_norm": 0.4484859108924866, + "learning_rate": 3.6260649679783044e-06, + "loss": 0.9018, + "step": 9867 + }, + { + "epoch": 0.8817209104921035, + "grad_norm": 0.6517726182937622, + "learning_rate": 3.620656656500743e-06, + "loss": 0.7952, + "step": 9868 + }, + { + "epoch": 0.8818102620233654, + "grad_norm": 0.5478501915931702, + "learning_rate": 3.615252229787558e-06, + "loss": 0.8884, + "step": 9869 + }, + { + "epoch": 0.8818996135546273, + "grad_norm": 0.5623330473899841, + "learning_rate": 3.6098516882914213e-06, + "loss": 0.9665, + "step": 9870 + }, + { + "epoch": 0.8819889650858892, + "grad_norm": 0.5018959641456604, + "learning_rate": 3.6044550324646987e-06, + "loss": 0.9083, + "step": 9871 + }, + { + "epoch": 0.8820783166171511, + "grad_norm": 0.3743535280227661, + "learning_rate": 3.599062262759395e-06, + "loss": 0.9624, + "step": 9872 + }, + { + "epoch": 0.8821676681484129, + "grad_norm": 0.6572086215019226, + "learning_rate": 3.5936733796272327e-06, + "loss": 0.8587, + "step": 9873 + }, + { + "epoch": 0.8822570196796747, + "grad_norm": 0.49752864241600037, + "learning_rate": 3.588288383519578e-06, + "loss": 0.8681, + "step": 9874 + }, + { + "epoch": 0.8823463712109366, + "grad_norm": 0.5613961219787598, + "learning_rate": 3.5829072748874813e-06, + "loss": 0.9468, + "step": 9875 + }, + { + "epoch": 0.8824357227421985, + "grad_norm": 0.5418183207511902, + "learning_rate": 3.577530054181677e-06, + "loss": 0.8794, + "step": 9876 + }, + { + "epoch": 0.8825250742734604, + "grad_norm": 0.5422852039337158, + "learning_rate": 3.5721567218525542e-06, + "loss": 0.9458, + "step": 9877 + }, + { + "epoch": 0.8826144258047223, + "grad_norm": 0.42068326473236084, + "learning_rate": 3.566787278350192e-06, + "loss": 0.9237, + "step": 9878 + }, + { + "epoch": 0.882703777335984, + "grad_norm": 0.4543195068836212, + "learning_rate": 3.5614217241243363e-06, + "loss": 0.9494, + "step": 9879 + }, + { + "epoch": 0.8827931288672459, + "grad_norm": 0.5260372757911682, + "learning_rate": 3.556060059624411e-06, + "loss": 0.9434, + "step": 9880 + }, + { + "epoch": 0.8828824803985078, + "grad_norm": 0.5226284861564636, + "learning_rate": 3.550702285299523e-06, + "loss": 0.9078, + "step": 9881 + }, + { + "epoch": 0.8829718319297697, + "grad_norm": 0.4836858808994293, + "learning_rate": 3.5453484015984253e-06, + "loss": 0.9321, + "step": 9882 + }, + { + "epoch": 0.8830611834610316, + "grad_norm": 0.47180086374282837, + "learning_rate": 3.539998408969569e-06, + "loss": 0.885, + "step": 9883 + }, + { + "epoch": 0.8831505349922935, + "grad_norm": 0.473090261220932, + "learning_rate": 3.5346523078610748e-06, + "loss": 0.9625, + "step": 9884 + }, + { + "epoch": 0.8832398865235553, + "grad_norm": 0.4888576567173004, + "learning_rate": 3.529310098720734e-06, + "loss": 0.8954, + "step": 9885 + }, + { + "epoch": 0.8833292380548171, + "grad_norm": 0.46975868940353394, + "learning_rate": 3.5239717819960104e-06, + "loss": 1.0308, + "step": 9886 + }, + { + "epoch": 0.883418589586079, + "grad_norm": 0.4399878680706024, + "learning_rate": 3.5186373581340636e-06, + "loss": 0.9445, + "step": 9887 + }, + { + "epoch": 0.8835079411173409, + "grad_norm": 0.4216008484363556, + "learning_rate": 3.5133068275816806e-06, + "loss": 0.9615, + "step": 9888 + }, + { + "epoch": 0.8835972926486028, + "grad_norm": 0.6720105409622192, + "learning_rate": 3.5079801907853648e-06, + "loss": 0.8721, + "step": 9889 + }, + { + "epoch": 0.8836866441798646, + "grad_norm": 0.4864836037158966, + "learning_rate": 3.5026574481912767e-06, + "loss": 0.9017, + "step": 9890 + }, + { + "epoch": 0.8837759957111265, + "grad_norm": 0.44451501965522766, + "learning_rate": 3.4973386002452535e-06, + "loss": 0.9123, + "step": 9891 + }, + { + "epoch": 0.8838653472423884, + "grad_norm": 0.4861214756965637, + "learning_rate": 3.492023647392817e-06, + "loss": 0.9691, + "step": 9892 + }, + { + "epoch": 0.8839546987736502, + "grad_norm": 0.5341703295707703, + "learning_rate": 3.4867125900791274e-06, + "loss": 0.8504, + "step": 9893 + }, + { + "epoch": 0.8840440503049121, + "grad_norm": 0.4494452178478241, + "learning_rate": 3.481405428749057e-06, + "loss": 0.962, + "step": 9894 + }, + { + "epoch": 0.884133401836174, + "grad_norm": 0.5674958229064941, + "learning_rate": 3.4761021638471337e-06, + "loss": 0.9775, + "step": 9895 + }, + { + "epoch": 0.8842227533674358, + "grad_norm": 0.480878621339798, + "learning_rate": 3.4708027958175625e-06, + "loss": 0.8908, + "step": 9896 + }, + { + "epoch": 0.8843121048986977, + "grad_norm": 0.6041078567504883, + "learning_rate": 3.4655073251042226e-06, + "loss": 0.8541, + "step": 9897 + }, + { + "epoch": 0.8844014564299596, + "grad_norm": 0.47140049934387207, + "learning_rate": 3.4602157521506638e-06, + "loss": 0.9046, + "step": 9898 + }, + { + "epoch": 0.8844908079612215, + "grad_norm": 0.5176533460617065, + "learning_rate": 3.4549280774001158e-06, + "loss": 0.9528, + "step": 9899 + }, + { + "epoch": 0.8845801594924833, + "grad_norm": 0.48900794982910156, + "learning_rate": 3.4496443012954795e-06, + "loss": 0.904, + "step": 9900 + }, + { + "epoch": 0.8846695110237451, + "grad_norm": 0.4905250668525696, + "learning_rate": 3.444364424279323e-06, + "loss": 0.8826, + "step": 9901 + }, + { + "epoch": 0.884758862555007, + "grad_norm": 0.5473768711090088, + "learning_rate": 3.4390884467938978e-06, + "loss": 0.9235, + "step": 9902 + }, + { + "epoch": 0.8848482140862689, + "grad_norm": 0.4095969498157501, + "learning_rate": 3.433816369281112e-06, + "loss": 0.9529, + "step": 9903 + }, + { + "epoch": 0.8849375656175308, + "grad_norm": 0.5141059756278992, + "learning_rate": 3.428548192182568e-06, + "loss": 0.9323, + "step": 9904 + }, + { + "epoch": 0.8850269171487927, + "grad_norm": 0.5582877993583679, + "learning_rate": 3.423283915939529e-06, + "loss": 0.9181, + "step": 9905 + }, + { + "epoch": 0.8851162686800546, + "grad_norm": 0.4510602653026581, + "learning_rate": 3.418023540992932e-06, + "loss": 1.0325, + "step": 9906 + }, + { + "epoch": 0.8852056202113163, + "grad_norm": 0.47843095660209656, + "learning_rate": 3.41276706778339e-06, + "loss": 1.007, + "step": 9907 + }, + { + "epoch": 0.8852949717425782, + "grad_norm": 0.5486652255058289, + "learning_rate": 3.4075144967511963e-06, + "loss": 0.9213, + "step": 9908 + }, + { + "epoch": 0.8853843232738401, + "grad_norm": 0.4576515257358551, + "learning_rate": 3.4022658283362985e-06, + "loss": 0.9397, + "step": 9909 + }, + { + "epoch": 0.885473674805102, + "grad_norm": 0.47343116998672485, + "learning_rate": 3.397021062978334e-06, + "loss": 0.9073, + "step": 9910 + }, + { + "epoch": 0.8855630263363639, + "grad_norm": 0.5906479358673096, + "learning_rate": 3.3917802011166067e-06, + "loss": 0.867, + "step": 9911 + }, + { + "epoch": 0.8856523778676257, + "grad_norm": 0.4365607798099518, + "learning_rate": 3.3865432431901046e-06, + "loss": 0.9728, + "step": 9912 + }, + { + "epoch": 0.8857417293988876, + "grad_norm": 0.47159871459007263, + "learning_rate": 3.3813101896374653e-06, + "loss": 0.9237, + "step": 9913 + }, + { + "epoch": 0.8858310809301494, + "grad_norm": 0.6291021704673767, + "learning_rate": 3.3760810408970113e-06, + "loss": 0.9258, + "step": 9914 + }, + { + "epoch": 0.8859204324614113, + "grad_norm": 0.4011688530445099, + "learning_rate": 3.3708557974067523e-06, + "loss": 0.9217, + "step": 9915 + }, + { + "epoch": 0.8860097839926732, + "grad_norm": 0.6337020397186279, + "learning_rate": 3.3656344596043442e-06, + "loss": 0.9362, + "step": 9916 + }, + { + "epoch": 0.886099135523935, + "grad_norm": 0.47050267457962036, + "learning_rate": 3.3604170279271374e-06, + "loss": 0.9049, + "step": 9917 + }, + { + "epoch": 0.8861884870551969, + "grad_norm": 0.5305570960044861, + "learning_rate": 3.3552035028121486e-06, + "loss": 0.8664, + "step": 9918 + }, + { + "epoch": 0.8862778385864588, + "grad_norm": 0.4407976269721985, + "learning_rate": 3.3499938846960675e-06, + "loss": 0.9272, + "step": 9919 + }, + { + "epoch": 0.8863671901177207, + "grad_norm": 0.4790654182434082, + "learning_rate": 3.3447881740152566e-06, + "loss": 0.93, + "step": 9920 + }, + { + "epoch": 0.8864565416489825, + "grad_norm": 0.5529206395149231, + "learning_rate": 3.3395863712057383e-06, + "loss": 0.8797, + "step": 9921 + }, + { + "epoch": 0.8865458931802443, + "grad_norm": 0.6508722901344299, + "learning_rate": 3.334388476703226e-06, + "loss": 0.8327, + "step": 9922 + }, + { + "epoch": 0.8866352447115062, + "grad_norm": 0.4920750558376312, + "learning_rate": 3.329194490943094e-06, + "loss": 0.9091, + "step": 9923 + }, + { + "epoch": 0.8867245962427681, + "grad_norm": 0.5537749528884888, + "learning_rate": 3.32400441436041e-06, + "loss": 0.9325, + "step": 9924 + }, + { + "epoch": 0.88681394777403, + "grad_norm": 0.5373173952102661, + "learning_rate": 3.3188182473898767e-06, + "loss": 0.8741, + "step": 9925 + }, + { + "epoch": 0.8869032993052919, + "grad_norm": 0.5053735375404358, + "learning_rate": 3.313635990465902e-06, + "loss": 0.8817, + "step": 9926 + }, + { + "epoch": 0.8869926508365538, + "grad_norm": 0.5587336421012878, + "learning_rate": 3.3084576440225555e-06, + "loss": 0.9233, + "step": 9927 + }, + { + "epoch": 0.8870820023678155, + "grad_norm": 0.45530253648757935, + "learning_rate": 3.3032832084935795e-06, + "loss": 0.9825, + "step": 9928 + }, + { + "epoch": 0.8871713538990774, + "grad_norm": 0.575091540813446, + "learning_rate": 3.298112684312382e-06, + "loss": 0.8839, + "step": 9929 + }, + { + "epoch": 0.8872607054303393, + "grad_norm": 0.5958606004714966, + "learning_rate": 3.292946071912051e-06, + "loss": 0.8694, + "step": 9930 + }, + { + "epoch": 0.8873500569616012, + "grad_norm": 0.5950548648834229, + "learning_rate": 3.2877833717253503e-06, + "loss": 0.9257, + "step": 9931 + }, + { + "epoch": 0.8874394084928631, + "grad_norm": 0.46099838614463806, + "learning_rate": 3.282624584184718e-06, + "loss": 0.9811, + "step": 9932 + }, + { + "epoch": 0.887528760024125, + "grad_norm": 0.5730208158493042, + "learning_rate": 3.277469709722242e-06, + "loss": 0.9349, + "step": 9933 + }, + { + "epoch": 0.8876181115553868, + "grad_norm": 0.4635668992996216, + "learning_rate": 3.2723187487696982e-06, + "loss": 0.9519, + "step": 9934 + }, + { + "epoch": 0.8877074630866486, + "grad_norm": 0.5630105137825012, + "learning_rate": 3.267171701758548e-06, + "loss": 0.9327, + "step": 9935 + }, + { + "epoch": 0.8877968146179105, + "grad_norm": 0.43999359011650085, + "learning_rate": 3.262028569119896e-06, + "loss": 0.9838, + "step": 9936 + }, + { + "epoch": 0.8878861661491724, + "grad_norm": 0.4146654009819031, + "learning_rate": 3.2568893512845477e-06, + "loss": 0.9695, + "step": 9937 + }, + { + "epoch": 0.8879755176804343, + "grad_norm": 0.48017647862434387, + "learning_rate": 3.251754048682959e-06, + "loss": 0.8955, + "step": 9938 + }, + { + "epoch": 0.8880648692116961, + "grad_norm": 0.48065900802612305, + "learning_rate": 3.2466226617452745e-06, + "loss": 0.9756, + "step": 9939 + }, + { + "epoch": 0.888154220742958, + "grad_norm": 0.5396913290023804, + "learning_rate": 3.2414951909012946e-06, + "loss": 0.96, + "step": 9940 + }, + { + "epoch": 0.8882435722742198, + "grad_norm": 0.665141224861145, + "learning_rate": 3.2363716365804984e-06, + "loss": 0.887, + "step": 9941 + }, + { + "epoch": 0.8883329238054817, + "grad_norm": 0.548674464225769, + "learning_rate": 3.2312519992120538e-06, + "loss": 0.8511, + "step": 9942 + }, + { + "epoch": 0.8884222753367436, + "grad_norm": 0.6099333763122559, + "learning_rate": 3.226136279224762e-06, + "loss": 0.9248, + "step": 9943 + }, + { + "epoch": 0.8885116268680054, + "grad_norm": 0.5216543078422546, + "learning_rate": 3.2210244770471356e-06, + "loss": 0.8861, + "step": 9944 + }, + { + "epoch": 0.8886009783992673, + "grad_norm": 0.4764314591884613, + "learning_rate": 3.215916593107332e-06, + "loss": 0.945, + "step": 9945 + }, + { + "epoch": 0.8886903299305292, + "grad_norm": 0.42511335015296936, + "learning_rate": 3.2108126278331983e-06, + "loss": 0.9197, + "step": 9946 + }, + { + "epoch": 0.8887796814617911, + "grad_norm": 0.5028632283210754, + "learning_rate": 3.2057125816522483e-06, + "loss": 0.9817, + "step": 9947 + }, + { + "epoch": 0.8888690329930529, + "grad_norm": 0.40669888257980347, + "learning_rate": 3.2006164549916563e-06, + "loss": 1.0339, + "step": 9948 + }, + { + "epoch": 0.8889583845243147, + "grad_norm": 0.4078708291053772, + "learning_rate": 3.195524248278281e-06, + "loss": 0.9188, + "step": 9949 + }, + { + "epoch": 0.8890477360555766, + "grad_norm": 0.45386314392089844, + "learning_rate": 3.190435961938654e-06, + "loss": 0.9065, + "step": 9950 + }, + { + "epoch": 0.8891370875868385, + "grad_norm": 0.39312687516212463, + "learning_rate": 3.185351596398961e-06, + "loss": 0.9465, + "step": 9951 + }, + { + "epoch": 0.8892264391181004, + "grad_norm": 0.4432995319366455, + "learning_rate": 3.1802711520850957e-06, + "loss": 0.9804, + "step": 9952 + }, + { + "epoch": 0.8893157906493623, + "grad_norm": 0.48735445737838745, + "learning_rate": 3.1751946294225733e-06, + "loss": 0.8804, + "step": 9953 + }, + { + "epoch": 0.8894051421806242, + "grad_norm": 0.4991060793399811, + "learning_rate": 3.1701220288366197e-06, + "loss": 0.9357, + "step": 9954 + }, + { + "epoch": 0.8894944937118859, + "grad_norm": 0.5495728254318237, + "learning_rate": 3.165053350752112e-06, + "loss": 0.8973, + "step": 9955 + }, + { + "epoch": 0.8895838452431478, + "grad_norm": 0.5644074082374573, + "learning_rate": 3.159988595593616e-06, + "loss": 0.8905, + "step": 9956 + }, + { + "epoch": 0.8896731967744097, + "grad_norm": 0.5330132842063904, + "learning_rate": 3.1549277637853593e-06, + "loss": 0.8993, + "step": 9957 + }, + { + "epoch": 0.8897625483056716, + "grad_norm": 0.4881329834461212, + "learning_rate": 3.1498708557512246e-06, + "loss": 0.9377, + "step": 9958 + }, + { + "epoch": 0.8898518998369335, + "grad_norm": 0.43445295095443726, + "learning_rate": 3.1448178719147957e-06, + "loss": 0.9774, + "step": 9959 + }, + { + "epoch": 0.8899412513681954, + "grad_norm": 0.5099343657493591, + "learning_rate": 3.1397688126993065e-06, + "loss": 0.8734, + "step": 9960 + }, + { + "epoch": 0.8900306028994572, + "grad_norm": 0.4378454387187958, + "learning_rate": 3.134723678527679e-06, + "loss": 0.945, + "step": 9961 + }, + { + "epoch": 0.890119954430719, + "grad_norm": 0.45414188504219055, + "learning_rate": 3.1296824698224924e-06, + "loss": 0.9504, + "step": 9962 + }, + { + "epoch": 0.8902093059619809, + "grad_norm": 0.40855854749679565, + "learning_rate": 3.1246451870059977e-06, + "loss": 0.9502, + "step": 9963 + }, + { + "epoch": 0.8902986574932428, + "grad_norm": 0.521061360836029, + "learning_rate": 3.1196118305001243e-06, + "loss": 0.8663, + "step": 9964 + }, + { + "epoch": 0.8903880090245047, + "grad_norm": 0.4307143986225128, + "learning_rate": 3.114582400726468e-06, + "loss": 0.9124, + "step": 9965 + }, + { + "epoch": 0.8904773605557665, + "grad_norm": 0.5008573532104492, + "learning_rate": 3.109556898106297e-06, + "loss": 0.9045, + "step": 9966 + }, + { + "epoch": 0.8905667120870284, + "grad_norm": 0.5884765982627869, + "learning_rate": 3.1045353230605535e-06, + "loss": 0.9376, + "step": 9967 + }, + { + "epoch": 0.8906560636182903, + "grad_norm": 0.49903714656829834, + "learning_rate": 3.0995176760098445e-06, + "loss": 0.9345, + "step": 9968 + }, + { + "epoch": 0.8907454151495521, + "grad_norm": 0.4943690299987793, + "learning_rate": 3.094503957374456e-06, + "loss": 0.9906, + "step": 9969 + }, + { + "epoch": 0.890834766680814, + "grad_norm": 0.5311302542686462, + "learning_rate": 3.089494167574336e-06, + "loss": 0.9613, + "step": 9970 + }, + { + "epoch": 0.8909241182120758, + "grad_norm": 0.5208662748336792, + "learning_rate": 3.08448830702911e-06, + "loss": 0.9179, + "step": 9971 + }, + { + "epoch": 0.8910134697433377, + "grad_norm": 0.49993789196014404, + "learning_rate": 3.0794863761580805e-06, + "loss": 0.8284, + "step": 9972 + }, + { + "epoch": 0.8911028212745996, + "grad_norm": 0.5702754259109497, + "learning_rate": 3.074488375380197e-06, + "loss": 0.9488, + "step": 9973 + }, + { + "epoch": 0.8911921728058615, + "grad_norm": 0.41398748755455017, + "learning_rate": 3.0694943051140958e-06, + "loss": 0.9195, + "step": 9974 + }, + { + "epoch": 0.8912815243371234, + "grad_norm": 0.46768733859062195, + "learning_rate": 3.0645041657780927e-06, + "loss": 0.9542, + "step": 9975 + }, + { + "epoch": 0.8913708758683851, + "grad_norm": 0.47688964009284973, + "learning_rate": 3.0595179577901643e-06, + "loss": 0.8927, + "step": 9976 + }, + { + "epoch": 0.891460227399647, + "grad_norm": 0.501356840133667, + "learning_rate": 3.054535681567955e-06, + "loss": 0.9192, + "step": 9977 + }, + { + "epoch": 0.8915495789309089, + "grad_norm": 0.44670113921165466, + "learning_rate": 3.0495573375287854e-06, + "loss": 0.9633, + "step": 9978 + }, + { + "epoch": 0.8916389304621708, + "grad_norm": 0.5801134705543518, + "learning_rate": 3.044582926089645e-06, + "loss": 0.8739, + "step": 9979 + }, + { + "epoch": 0.8917282819934327, + "grad_norm": 0.5927140712738037, + "learning_rate": 3.039612447667195e-06, + "loss": 0.9109, + "step": 9980 + }, + { + "epoch": 0.8918176335246946, + "grad_norm": 0.49671000242233276, + "learning_rate": 3.034645902677763e-06, + "loss": 0.9923, + "step": 9981 + }, + { + "epoch": 0.8919069850559564, + "grad_norm": 0.46051478385925293, + "learning_rate": 3.0296832915373497e-06, + "loss": 0.9259, + "step": 9982 + }, + { + "epoch": 0.8919963365872182, + "grad_norm": 0.5306558609008789, + "learning_rate": 3.024724614661639e-06, + "loss": 0.913, + "step": 9983 + }, + { + "epoch": 0.8920856881184801, + "grad_norm": 0.3877553939819336, + "learning_rate": 3.0197698724659497e-06, + "loss": 1.001, + "step": 9984 + }, + { + "epoch": 0.892175039649742, + "grad_norm": 0.5105606913566589, + "learning_rate": 3.0148190653653096e-06, + "loss": 0.9371, + "step": 9985 + }, + { + "epoch": 0.8922643911810039, + "grad_norm": 0.5007322430610657, + "learning_rate": 3.009872193774399e-06, + "loss": 0.9932, + "step": 9986 + }, + { + "epoch": 0.8923537427122658, + "grad_norm": 0.5739557147026062, + "learning_rate": 3.0049292581075692e-06, + "loss": 0.8694, + "step": 9987 + }, + { + "epoch": 0.8924430942435276, + "grad_norm": 0.4719444215297699, + "learning_rate": 2.9999902587788507e-06, + "loss": 1.0219, + "step": 9988 + }, + { + "epoch": 0.8925324457747895, + "grad_norm": 0.4949089586734772, + "learning_rate": 2.9950551962019293e-06, + "loss": 0.9235, + "step": 9989 + }, + { + "epoch": 0.8926217973060513, + "grad_norm": 0.45871758460998535, + "learning_rate": 2.990124070790179e-06, + "loss": 0.9429, + "step": 9990 + }, + { + "epoch": 0.8927111488373132, + "grad_norm": 0.5253780484199524, + "learning_rate": 2.98519688295662e-06, + "loss": 0.939, + "step": 9991 + }, + { + "epoch": 0.8928005003685751, + "grad_norm": 0.4314812123775482, + "learning_rate": 2.9802736331139615e-06, + "loss": 0.9584, + "step": 9992 + }, + { + "epoch": 0.8928898518998369, + "grad_norm": 0.473132848739624, + "learning_rate": 2.9753543216745784e-06, + "loss": 0.9659, + "step": 9993 + }, + { + "epoch": 0.8929792034310988, + "grad_norm": 0.4422551989555359, + "learning_rate": 2.9704389490505303e-06, + "loss": 0.9317, + "step": 9994 + }, + { + "epoch": 0.8930685549623607, + "grad_norm": 0.5181037783622742, + "learning_rate": 2.9655275156535103e-06, + "loss": 0.9623, + "step": 9995 + }, + { + "epoch": 0.8931579064936226, + "grad_norm": 0.5638878345489502, + "learning_rate": 2.960620021894911e-06, + "loss": 0.908, + "step": 9996 + }, + { + "epoch": 0.8932472580248844, + "grad_norm": 0.5014554858207703, + "learning_rate": 2.955716468185793e-06, + "loss": 0.8998, + "step": 9997 + }, + { + "epoch": 0.8933366095561462, + "grad_norm": 0.46494272351264954, + "learning_rate": 2.950816854936872e-06, + "loss": 0.9615, + "step": 9998 + }, + { + "epoch": 0.8934259610874081, + "grad_norm": 0.4722679555416107, + "learning_rate": 2.9459211825585475e-06, + "loss": 0.9962, + "step": 9999 + }, + { + "epoch": 0.89351531261867, + "grad_norm": 0.6044051051139832, + "learning_rate": 2.941029451460886e-06, + "loss": 0.9006, + "step": 10000 + }, + { + "epoch": 0.8936046641499319, + "grad_norm": 0.4716538190841675, + "learning_rate": 2.936141662053621e-06, + "loss": 0.9416, + "step": 10001 + }, + { + "epoch": 0.8936940156811938, + "grad_norm": 0.5190637111663818, + "learning_rate": 2.931257814746158e-06, + "loss": 0.8654, + "step": 10002 + }, + { + "epoch": 0.8937833672124555, + "grad_norm": 0.46541985869407654, + "learning_rate": 2.926377909947575e-06, + "loss": 0.9391, + "step": 10003 + }, + { + "epoch": 0.8938727187437174, + "grad_norm": 0.46355336904525757, + "learning_rate": 2.9215019480666015e-06, + "loss": 0.8925, + "step": 10004 + }, + { + "epoch": 0.8939620702749793, + "grad_norm": 0.5161470770835876, + "learning_rate": 2.916629929511666e-06, + "loss": 0.8671, + "step": 10005 + }, + { + "epoch": 0.8940514218062412, + "grad_norm": 0.544810950756073, + "learning_rate": 2.911761854690842e-06, + "loss": 0.911, + "step": 10006 + }, + { + "epoch": 0.8941407733375031, + "grad_norm": 0.4915909469127655, + "learning_rate": 2.9068977240118867e-06, + "loss": 0.8727, + "step": 10007 + }, + { + "epoch": 0.894230124868765, + "grad_norm": 0.658275306224823, + "learning_rate": 2.9020375378822297e-06, + "loss": 0.9053, + "step": 10008 + }, + { + "epoch": 0.8943194764000268, + "grad_norm": 0.48863136768341064, + "learning_rate": 2.897181296708951e-06, + "loss": 0.8799, + "step": 10009 + }, + { + "epoch": 0.8944088279312886, + "grad_norm": 0.46332526206970215, + "learning_rate": 2.8923290008988193e-06, + "loss": 0.9081, + "step": 10010 + }, + { + "epoch": 0.8944981794625505, + "grad_norm": 0.4719219505786896, + "learning_rate": 2.8874806508582652e-06, + "loss": 0.9298, + "step": 10011 + }, + { + "epoch": 0.8945875309938124, + "grad_norm": 0.4817638099193573, + "learning_rate": 2.882636246993392e-06, + "loss": 0.9128, + "step": 10012 + }, + { + "epoch": 0.8946768825250743, + "grad_norm": 0.49703267216682434, + "learning_rate": 2.877795789709975e-06, + "loss": 0.8182, + "step": 10013 + }, + { + "epoch": 0.8947662340563362, + "grad_norm": 0.4652135372161865, + "learning_rate": 2.87295927941344e-06, + "loss": 0.9703, + "step": 10014 + }, + { + "epoch": 0.894855585587598, + "grad_norm": 0.4329572916030884, + "learning_rate": 2.868126716508901e-06, + "loss": 0.939, + "step": 10015 + }, + { + "epoch": 0.8949449371188599, + "grad_norm": 0.4363684356212616, + "learning_rate": 2.8632981014011463e-06, + "loss": 0.8893, + "step": 10016 + }, + { + "epoch": 0.8950342886501217, + "grad_norm": 0.564899742603302, + "learning_rate": 2.8584734344946073e-06, + "loss": 0.9229, + "step": 10017 + }, + { + "epoch": 0.8951236401813836, + "grad_norm": 0.5711920261383057, + "learning_rate": 2.853652716193417e-06, + "loss": 0.8848, + "step": 10018 + }, + { + "epoch": 0.8952129917126455, + "grad_norm": 0.4611048996448517, + "learning_rate": 2.8488359469013514e-06, + "loss": 0.8717, + "step": 10019 + }, + { + "epoch": 0.8953023432439073, + "grad_norm": 0.7279905676841736, + "learning_rate": 2.844023127021872e-06, + "loss": 0.7405, + "step": 10020 + }, + { + "epoch": 0.8953916947751692, + "grad_norm": 0.5130376219749451, + "learning_rate": 2.839214256958106e-06, + "loss": 0.9462, + "step": 10021 + }, + { + "epoch": 0.8954810463064311, + "grad_norm": 0.43503376841545105, + "learning_rate": 2.8344093371128424e-06, + "loss": 0.8974, + "step": 10022 + }, + { + "epoch": 0.895570397837693, + "grad_norm": 0.5059356689453125, + "learning_rate": 2.8296083678885477e-06, + "loss": 0.9436, + "step": 10023 + }, + { + "epoch": 0.8956597493689548, + "grad_norm": 0.5730863213539124, + "learning_rate": 2.8248113496873507e-06, + "loss": 0.8933, + "step": 10024 + }, + { + "epoch": 0.8957491009002166, + "grad_norm": 0.4714071750640869, + "learning_rate": 2.8200182829110523e-06, + "loss": 0.9942, + "step": 10025 + }, + { + "epoch": 0.8958384524314785, + "grad_norm": 0.4940183758735657, + "learning_rate": 2.8152291679611255e-06, + "loss": 0.9125, + "step": 10026 + }, + { + "epoch": 0.8959278039627404, + "grad_norm": 0.6629910469055176, + "learning_rate": 2.810444005238716e-06, + "loss": 0.8461, + "step": 10027 + }, + { + "epoch": 0.8960171554940023, + "grad_norm": 0.4740060865879059, + "learning_rate": 2.80566279514462e-06, + "loss": 1.0388, + "step": 10028 + }, + { + "epoch": 0.8961065070252642, + "grad_norm": 0.4990270733833313, + "learning_rate": 2.800885538079323e-06, + "loss": 0.8857, + "step": 10029 + }, + { + "epoch": 0.8961958585565261, + "grad_norm": 0.4073428213596344, + "learning_rate": 2.796112234442966e-06, + "loss": 0.9675, + "step": 10030 + }, + { + "epoch": 0.8962852100877878, + "grad_norm": 0.541317343711853, + "learning_rate": 2.791342884635362e-06, + "loss": 0.9653, + "step": 10031 + }, + { + "epoch": 0.8963745616190497, + "grad_norm": 0.3983585834503174, + "learning_rate": 2.7865774890560025e-06, + "loss": 0.9158, + "step": 10032 + }, + { + "epoch": 0.8964639131503116, + "grad_norm": 0.4200194776058197, + "learning_rate": 2.7818160481040465e-06, + "loss": 0.9852, + "step": 10033 + }, + { + "epoch": 0.8965532646815735, + "grad_norm": 0.48257651925086975, + "learning_rate": 2.7770585621782973e-06, + "loss": 0.9329, + "step": 10034 + }, + { + "epoch": 0.8966426162128354, + "grad_norm": 0.4446716010570526, + "learning_rate": 2.772305031677258e-06, + "loss": 0.9151, + "step": 10035 + }, + { + "epoch": 0.8967319677440972, + "grad_norm": 0.4650631546974182, + "learning_rate": 2.767555456999077e-06, + "loss": 0.956, + "step": 10036 + }, + { + "epoch": 0.8968213192753591, + "grad_norm": 0.5073647499084473, + "learning_rate": 2.762809838541591e-06, + "loss": 0.9766, + "step": 10037 + }, + { + "epoch": 0.8969106708066209, + "grad_norm": 0.5146875977516174, + "learning_rate": 2.758068176702294e-06, + "loss": 0.9526, + "step": 10038 + }, + { + "epoch": 0.8970000223378828, + "grad_norm": 0.5599371790885925, + "learning_rate": 2.7533304718783516e-06, + "loss": 0.934, + "step": 10039 + }, + { + "epoch": 0.8970893738691447, + "grad_norm": 0.5773612856864929, + "learning_rate": 2.74859672446659e-06, + "loss": 0.9475, + "step": 10040 + }, + { + "epoch": 0.8971787254004066, + "grad_norm": 0.5281670093536377, + "learning_rate": 2.7438669348635202e-06, + "loss": 0.9335, + "step": 10041 + }, + { + "epoch": 0.8972680769316684, + "grad_norm": 0.48857590556144714, + "learning_rate": 2.7391411034653094e-06, + "loss": 0.8603, + "step": 10042 + }, + { + "epoch": 0.8973574284629303, + "grad_norm": 0.4282703995704651, + "learning_rate": 2.734419230667801e-06, + "loss": 0.9902, + "step": 10043 + }, + { + "epoch": 0.8974467799941922, + "grad_norm": 0.5266483426094055, + "learning_rate": 2.7297013168664897e-06, + "loss": 0.8483, + "step": 10044 + }, + { + "epoch": 0.897536131525454, + "grad_norm": 0.5313611626625061, + "learning_rate": 2.7249873624565604e-06, + "loss": 0.8934, + "step": 10045 + }, + { + "epoch": 0.8976254830567159, + "grad_norm": 0.4362044632434845, + "learning_rate": 2.720277367832852e-06, + "loss": 0.9271, + "step": 10046 + }, + { + "epoch": 0.8977148345879777, + "grad_norm": 0.5316069722175598, + "learning_rate": 2.7155713333898825e-06, + "loss": 0.8928, + "step": 10047 + }, + { + "epoch": 0.8978041861192396, + "grad_norm": 0.5463095307350159, + "learning_rate": 2.7108692595218254e-06, + "loss": 0.932, + "step": 10048 + }, + { + "epoch": 0.8978935376505015, + "grad_norm": 0.475721150636673, + "learning_rate": 2.706171146622538e-06, + "loss": 0.9658, + "step": 10049 + }, + { + "epoch": 0.8979828891817634, + "grad_norm": 0.4900808036327362, + "learning_rate": 2.7014769950855334e-06, + "loss": 0.963, + "step": 10050 + }, + { + "epoch": 0.8980722407130253, + "grad_norm": 0.411359578371048, + "learning_rate": 2.6967868053039913e-06, + "loss": 0.972, + "step": 10051 + }, + { + "epoch": 0.898161592244287, + "grad_norm": 0.5783373713493347, + "learning_rate": 2.6921005776707755e-06, + "loss": 0.9393, + "step": 10052 + }, + { + "epoch": 0.8982509437755489, + "grad_norm": 0.4613523483276367, + "learning_rate": 2.6874183125784047e-06, + "loss": 0.9651, + "step": 10053 + }, + { + "epoch": 0.8983402953068108, + "grad_norm": 0.5736072063446045, + "learning_rate": 2.682740010419066e-06, + "loss": 0.8834, + "step": 10054 + }, + { + "epoch": 0.8984296468380727, + "grad_norm": 0.4773879051208496, + "learning_rate": 2.678065671584612e-06, + "loss": 0.9388, + "step": 10055 + }, + { + "epoch": 0.8985189983693346, + "grad_norm": 0.4866504669189453, + "learning_rate": 2.673395296466574e-06, + "loss": 0.9726, + "step": 10056 + }, + { + "epoch": 0.8986083499005965, + "grad_norm": 0.5097846984863281, + "learning_rate": 2.6687288854561455e-06, + "loss": 0.8879, + "step": 10057 + }, + { + "epoch": 0.8986977014318583, + "grad_norm": 0.6318731307983398, + "learning_rate": 2.664066438944185e-06, + "loss": 0.9752, + "step": 10058 + }, + { + "epoch": 0.8987870529631201, + "grad_norm": 0.47188690304756165, + "learning_rate": 2.6594079573212303e-06, + "loss": 0.8645, + "step": 10059 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.43872636556625366, + "learning_rate": 2.654753440977481e-06, + "loss": 0.9638, + "step": 10060 + }, + { + "epoch": 0.8989657560256439, + "grad_norm": 0.521487295627594, + "learning_rate": 2.650102890302786e-06, + "loss": 0.957, + "step": 10061 + }, + { + "epoch": 0.8990551075569058, + "grad_norm": 0.5581962466239929, + "learning_rate": 2.6454563056866834e-06, + "loss": 0.855, + "step": 10062 + }, + { + "epoch": 0.8991444590881676, + "grad_norm": 0.43780362606048584, + "learning_rate": 2.640813687518384e-06, + "loss": 0.9888, + "step": 10063 + }, + { + "epoch": 0.8992338106194295, + "grad_norm": 0.46411171555519104, + "learning_rate": 2.6361750361867554e-06, + "loss": 0.8972, + "step": 10064 + }, + { + "epoch": 0.8993231621506913, + "grad_norm": 0.5126439332962036, + "learning_rate": 2.631540352080325e-06, + "loss": 0.8815, + "step": 10065 + }, + { + "epoch": 0.8994125136819532, + "grad_norm": 0.449725866317749, + "learning_rate": 2.626909635587299e-06, + "loss": 0.9732, + "step": 10066 + }, + { + "epoch": 0.8995018652132151, + "grad_norm": 0.47947466373443604, + "learning_rate": 2.6222828870955505e-06, + "loss": 0.8826, + "step": 10067 + }, + { + "epoch": 0.899591216744477, + "grad_norm": 0.4623918831348419, + "learning_rate": 2.6176601069926255e-06, + "loss": 0.9146, + "step": 10068 + }, + { + "epoch": 0.8996805682757388, + "grad_norm": 0.5010434985160828, + "learning_rate": 2.613041295665719e-06, + "loss": 0.9345, + "step": 10069 + }, + { + "epoch": 0.8997699198070007, + "grad_norm": 0.5494645833969116, + "learning_rate": 2.608426453501722e-06, + "loss": 0.8777, + "step": 10070 + }, + { + "epoch": 0.8998592713382626, + "grad_norm": 0.4975086450576782, + "learning_rate": 2.6038155808871587e-06, + "loss": 0.8943, + "step": 10071 + }, + { + "epoch": 0.8999486228695244, + "grad_norm": 0.4562755823135376, + "learning_rate": 2.5992086782082536e-06, + "loss": 0.8846, + "step": 10072 + }, + { + "epoch": 0.9000379744007863, + "grad_norm": 0.5983653664588928, + "learning_rate": 2.5946057458508756e-06, + "loss": 0.8868, + "step": 10073 + }, + { + "epoch": 0.9001273259320481, + "grad_norm": 0.586626410484314, + "learning_rate": 2.5900067842005772e-06, + "loss": 0.8883, + "step": 10074 + }, + { + "epoch": 0.90021667746331, + "grad_norm": 0.5138862729072571, + "learning_rate": 2.585411793642556e-06, + "loss": 0.9107, + "step": 10075 + }, + { + "epoch": 0.9003060289945719, + "grad_norm": 0.45086416602134705, + "learning_rate": 2.580820774561704e-06, + "loss": 0.9627, + "step": 10076 + }, + { + "epoch": 0.9003953805258338, + "grad_norm": 0.5956899523735046, + "learning_rate": 2.576233727342564e-06, + "loss": 0.9463, + "step": 10077 + }, + { + "epoch": 0.9004847320570957, + "grad_norm": 0.47443798184394836, + "learning_rate": 2.571650652369351e-06, + "loss": 0.9167, + "step": 10078 + }, + { + "epoch": 0.9005740835883574, + "grad_norm": 0.5572949051856995, + "learning_rate": 2.5670715500259403e-06, + "loss": 0.8379, + "step": 10079 + }, + { + "epoch": 0.9006634351196193, + "grad_norm": 0.6084146499633789, + "learning_rate": 2.5624964206958924e-06, + "loss": 0.8469, + "step": 10080 + }, + { + "epoch": 0.9007527866508812, + "grad_norm": 0.5597148537635803, + "learning_rate": 2.557925264762412e-06, + "loss": 0.8754, + "step": 10081 + }, + { + "epoch": 0.9008421381821431, + "grad_norm": 0.5465840101242065, + "learning_rate": 2.5533580826083926e-06, + "loss": 0.8959, + "step": 10082 + }, + { + "epoch": 0.900931489713405, + "grad_norm": 0.39822328090667725, + "learning_rate": 2.5487948746163726e-06, + "loss": 0.9557, + "step": 10083 + }, + { + "epoch": 0.9010208412446669, + "grad_norm": 0.45787888765335083, + "learning_rate": 2.544235641168585e-06, + "loss": 0.8887, + "step": 10084 + }, + { + "epoch": 0.9011101927759287, + "grad_norm": 0.5872011184692383, + "learning_rate": 2.5396803826468975e-06, + "loss": 0.8258, + "step": 10085 + }, + { + "epoch": 0.9011995443071905, + "grad_norm": 0.5175109505653381, + "learning_rate": 2.53512909943287e-06, + "loss": 0.9667, + "step": 10086 + }, + { + "epoch": 0.9012888958384524, + "grad_norm": 0.4502319097518921, + "learning_rate": 2.5305817919077157e-06, + "loss": 0.9462, + "step": 10087 + }, + { + "epoch": 0.9013782473697143, + "grad_norm": 0.4700741171836853, + "learning_rate": 2.526038460452329e-06, + "loss": 0.9298, + "step": 10088 + }, + { + "epoch": 0.9014675989009762, + "grad_norm": 0.4288919270038605, + "learning_rate": 2.521499105447256e-06, + "loss": 0.9719, + "step": 10089 + }, + { + "epoch": 0.901556950432238, + "grad_norm": 0.38474833965301514, + "learning_rate": 2.516963727272714e-06, + "loss": 1.0002, + "step": 10090 + }, + { + "epoch": 0.9016463019634999, + "grad_norm": 0.5012978911399841, + "learning_rate": 2.512432326308595e-06, + "loss": 0.9018, + "step": 10091 + }, + { + "epoch": 0.9017356534947618, + "grad_norm": 0.5238143801689148, + "learning_rate": 2.5079049029344492e-06, + "loss": 0.8814, + "step": 10092 + }, + { + "epoch": 0.9018250050260236, + "grad_norm": 0.4294685423374176, + "learning_rate": 2.503381457529508e-06, + "loss": 0.9577, + "step": 10093 + }, + { + "epoch": 0.9019143565572855, + "grad_norm": 0.5884724259376526, + "learning_rate": 2.498861990472634e-06, + "loss": 0.8814, + "step": 10094 + }, + { + "epoch": 0.9020037080885474, + "grad_norm": 0.40673425793647766, + "learning_rate": 2.494346502142397e-06, + "loss": 0.952, + "step": 10095 + }, + { + "epoch": 0.9020930596198092, + "grad_norm": 0.4494728744029999, + "learning_rate": 2.4898349929170116e-06, + "loss": 0.937, + "step": 10096 + }, + { + "epoch": 0.9021824111510711, + "grad_norm": 0.5147069692611694, + "learning_rate": 2.4853274631743807e-06, + "loss": 0.8933, + "step": 10097 + }, + { + "epoch": 0.902271762682333, + "grad_norm": 0.42955800890922546, + "learning_rate": 2.4808239132920297e-06, + "loss": 0.9895, + "step": 10098 + }, + { + "epoch": 0.9023611142135949, + "grad_norm": 0.5820243954658508, + "learning_rate": 2.4763243436472016e-06, + "loss": 0.8731, + "step": 10099 + }, + { + "epoch": 0.9024504657448567, + "grad_norm": 0.46939095854759216, + "learning_rate": 2.471828754616773e-06, + "loss": 0.9266, + "step": 10100 + }, + { + "epoch": 0.9025398172761185, + "grad_norm": 0.5100577473640442, + "learning_rate": 2.4673371465772978e-06, + "loss": 0.9469, + "step": 10101 + }, + { + "epoch": 0.9026291688073804, + "grad_norm": 0.5236694812774658, + "learning_rate": 2.4628495199050027e-06, + "loss": 0.9135, + "step": 10102 + }, + { + "epoch": 0.9027185203386423, + "grad_norm": 0.5193259716033936, + "learning_rate": 2.4583658749757656e-06, + "loss": 0.9942, + "step": 10103 + }, + { + "epoch": 0.9028078718699042, + "grad_norm": 0.553813099861145, + "learning_rate": 2.453886212165152e-06, + "loss": 0.9319, + "step": 10104 + }, + { + "epoch": 0.9028972234011661, + "grad_norm": 0.5031151175498962, + "learning_rate": 2.4494105318483674e-06, + "loss": 0.8901, + "step": 10105 + }, + { + "epoch": 0.902986574932428, + "grad_norm": 0.5096839070320129, + "learning_rate": 2.444938834400301e-06, + "loss": 0.8819, + "step": 10106 + }, + { + "epoch": 0.9030759264636897, + "grad_norm": 0.48195287585258484, + "learning_rate": 2.4404711201955088e-06, + "loss": 0.969, + "step": 10107 + }, + { + "epoch": 0.9031652779949516, + "grad_norm": 0.46852657198905945, + "learning_rate": 2.4360073896082138e-06, + "loss": 0.8956, + "step": 10108 + }, + { + "epoch": 0.9032546295262135, + "grad_norm": 0.4824883043766022, + "learning_rate": 2.4315476430122884e-06, + "loss": 0.9332, + "step": 10109 + }, + { + "epoch": 0.9033439810574754, + "grad_norm": 0.4670718312263489, + "learning_rate": 2.4270918807812958e-06, + "loss": 0.9048, + "step": 10110 + }, + { + "epoch": 0.9034333325887373, + "grad_norm": 0.4740527868270874, + "learning_rate": 2.422640103288443e-06, + "loss": 0.9462, + "step": 10111 + }, + { + "epoch": 0.9035226841199991, + "grad_norm": 0.49257439374923706, + "learning_rate": 2.418192310906625e-06, + "loss": 0.9087, + "step": 10112 + }, + { + "epoch": 0.903612035651261, + "grad_norm": 0.5299413204193115, + "learning_rate": 2.413748504008384e-06, + "loss": 0.8926, + "step": 10113 + }, + { + "epoch": 0.9037013871825228, + "grad_norm": 0.46102991700172424, + "learning_rate": 2.4093086829659495e-06, + "loss": 0.9209, + "step": 10114 + }, + { + "epoch": 0.9037907387137847, + "grad_norm": 0.5814657211303711, + "learning_rate": 2.404872848151185e-06, + "loss": 1.0147, + "step": 10115 + }, + { + "epoch": 0.9038800902450466, + "grad_norm": 0.5453367829322815, + "learning_rate": 2.4004409999356437e-06, + "loss": 0.8692, + "step": 10116 + }, + { + "epoch": 0.9039694417763084, + "grad_norm": 0.5346057415008545, + "learning_rate": 2.396013138690545e-06, + "loss": 0.9964, + "step": 10117 + }, + { + "epoch": 0.9040587933075703, + "grad_norm": 0.541984498500824, + "learning_rate": 2.39158926478677e-06, + "loss": 0.9529, + "step": 10118 + }, + { + "epoch": 0.9041481448388322, + "grad_norm": 0.43673083186149597, + "learning_rate": 2.3871693785948614e-06, + "loss": 1.0007, + "step": 10119 + }, + { + "epoch": 0.9042374963700941, + "grad_norm": 0.42175018787384033, + "learning_rate": 2.3827534804850336e-06, + "loss": 0.9554, + "step": 10120 + }, + { + "epoch": 0.9043268479013559, + "grad_norm": 0.5283427238464355, + "learning_rate": 2.3783415708271696e-06, + "loss": 0.8791, + "step": 10121 + }, + { + "epoch": 0.9044161994326178, + "grad_norm": 0.41755416989326477, + "learning_rate": 2.3739336499908005e-06, + "loss": 0.9335, + "step": 10122 + }, + { + "epoch": 0.9045055509638796, + "grad_norm": 0.4738496243953705, + "learning_rate": 2.3695297183451536e-06, + "loss": 0.9587, + "step": 10123 + }, + { + "epoch": 0.9045949024951415, + "grad_norm": 0.4655263423919678, + "learning_rate": 2.3651297762591006e-06, + "loss": 0.8765, + "step": 10124 + }, + { + "epoch": 0.9046842540264034, + "grad_norm": 0.4394261837005615, + "learning_rate": 2.3607338241011747e-06, + "loss": 0.9538, + "step": 10125 + }, + { + "epoch": 0.9047736055576653, + "grad_norm": 0.4117361605167389, + "learning_rate": 2.356341862239586e-06, + "loss": 1.0001, + "step": 10126 + }, + { + "epoch": 0.9048629570889272, + "grad_norm": 0.5142612457275391, + "learning_rate": 2.3519538910422134e-06, + "loss": 0.9423, + "step": 10127 + }, + { + "epoch": 0.9049523086201889, + "grad_norm": 0.5198079347610474, + "learning_rate": 2.3475699108765958e-06, + "loss": 0.9456, + "step": 10128 + }, + { + "epoch": 0.9050416601514508, + "grad_norm": 0.521613359451294, + "learning_rate": 2.3431899221099342e-06, + "loss": 0.9063, + "step": 10129 + }, + { + "epoch": 0.9051310116827127, + "grad_norm": 0.4279034435749054, + "learning_rate": 2.3388139251091067e-06, + "loss": 1.0047, + "step": 10130 + }, + { + "epoch": 0.9052203632139746, + "grad_norm": 0.5167081356048584, + "learning_rate": 2.334441920240643e-06, + "loss": 0.9027, + "step": 10131 + }, + { + "epoch": 0.9053097147452365, + "grad_norm": 0.494266152381897, + "learning_rate": 2.3300739078707446e-06, + "loss": 0.929, + "step": 10132 + }, + { + "epoch": 0.9053990662764984, + "grad_norm": 0.4619883596897125, + "learning_rate": 2.3257098883652795e-06, + "loss": 0.944, + "step": 10133 + }, + { + "epoch": 0.9054884178077601, + "grad_norm": 0.46336403489112854, + "learning_rate": 2.321349862089789e-06, + "loss": 0.9754, + "step": 10134 + }, + { + "epoch": 0.905577769339022, + "grad_norm": 0.5507116913795471, + "learning_rate": 2.3169938294094582e-06, + "loss": 0.8841, + "step": 10135 + }, + { + "epoch": 0.9056671208702839, + "grad_norm": 0.49154022336006165, + "learning_rate": 2.312641790689163e-06, + "loss": 0.887, + "step": 10136 + }, + { + "epoch": 0.9057564724015458, + "grad_norm": 0.5543971061706543, + "learning_rate": 2.3082937462934274e-06, + "loss": 0.876, + "step": 10137 + }, + { + "epoch": 0.9058458239328077, + "grad_norm": 0.5523365139961243, + "learning_rate": 2.3039496965864436e-06, + "loss": 0.9618, + "step": 10138 + }, + { + "epoch": 0.9059351754640695, + "grad_norm": 0.5370588898658752, + "learning_rate": 2.2996096419320824e-06, + "loss": 0.9194, + "step": 10139 + }, + { + "epoch": 0.9060245269953314, + "grad_norm": 0.4241883158683777, + "learning_rate": 2.2952735826938576e-06, + "loss": 0.8904, + "step": 10140 + }, + { + "epoch": 0.9061138785265932, + "grad_norm": 0.45826974511146545, + "learning_rate": 2.290941519234968e-06, + "loss": 0.9604, + "step": 10141 + }, + { + "epoch": 0.9062032300578551, + "grad_norm": 0.4920784533023834, + "learning_rate": 2.286613451918268e-06, + "loss": 0.9322, + "step": 10142 + }, + { + "epoch": 0.906292581589117, + "grad_norm": 0.4059630334377289, + "learning_rate": 2.2822893811062786e-06, + "loss": 0.9157, + "step": 10143 + }, + { + "epoch": 0.9063819331203788, + "grad_norm": 0.4405556321144104, + "learning_rate": 2.2779693071611986e-06, + "loss": 0.9764, + "step": 10144 + }, + { + "epoch": 0.9064712846516407, + "grad_norm": 0.4672204852104187, + "learning_rate": 2.273653230444861e-06, + "loss": 0.9871, + "step": 10145 + }, + { + "epoch": 0.9065606361829026, + "grad_norm": 0.5110070109367371, + "learning_rate": 2.269341151318788e-06, + "loss": 0.9669, + "step": 10146 + }, + { + "epoch": 0.9066499877141645, + "grad_norm": 0.6213862299919128, + "learning_rate": 2.2650330701441678e-06, + "loss": 0.9349, + "step": 10147 + }, + { + "epoch": 0.9067393392454263, + "grad_norm": 0.5088279247283936, + "learning_rate": 2.260728987281846e-06, + "loss": 0.895, + "step": 10148 + }, + { + "epoch": 0.9068286907766882, + "grad_norm": 0.5393834710121155, + "learning_rate": 2.2564289030923393e-06, + "loss": 0.9513, + "step": 10149 + }, + { + "epoch": 0.90691804230795, + "grad_norm": 0.4829711318016052, + "learning_rate": 2.2521328179358146e-06, + "loss": 0.8979, + "step": 10150 + }, + { + "epoch": 0.9070073938392119, + "grad_norm": 0.5531469583511353, + "learning_rate": 2.2478407321721296e-06, + "loss": 0.8851, + "step": 10151 + }, + { + "epoch": 0.9070967453704738, + "grad_norm": 0.45315778255462646, + "learning_rate": 2.243552646160779e-06, + "loss": 0.9384, + "step": 10152 + }, + { + "epoch": 0.9071860969017357, + "grad_norm": 0.5223442912101746, + "learning_rate": 2.239268560260943e-06, + "loss": 0.8615, + "step": 10153 + }, + { + "epoch": 0.9072754484329976, + "grad_norm": 0.47916871309280396, + "learning_rate": 2.234988474831462e-06, + "loss": 0.9782, + "step": 10154 + }, + { + "epoch": 0.9073647999642593, + "grad_norm": 0.4964684247970581, + "learning_rate": 2.230712390230838e-06, + "loss": 0.8816, + "step": 10155 + }, + { + "epoch": 0.9074541514955212, + "grad_norm": 0.5099236965179443, + "learning_rate": 2.226440306817229e-06, + "loss": 0.9071, + "step": 10156 + }, + { + "epoch": 0.9075435030267831, + "grad_norm": 0.48252931237220764, + "learning_rate": 2.222172224948471e-06, + "loss": 0.9396, + "step": 10157 + }, + { + "epoch": 0.907632854558045, + "grad_norm": 0.5542041063308716, + "learning_rate": 2.2179081449820672e-06, + "loss": 0.901, + "step": 10158 + }, + { + "epoch": 0.9077222060893069, + "grad_norm": 0.5302069783210754, + "learning_rate": 2.213648067275176e-06, + "loss": 0.8758, + "step": 10159 + }, + { + "epoch": 0.9078115576205688, + "grad_norm": 0.6605767607688904, + "learning_rate": 2.2093919921846283e-06, + "loss": 0.8491, + "step": 10160 + }, + { + "epoch": 0.9079009091518306, + "grad_norm": 0.47050368785858154, + "learning_rate": 2.2051399200669065e-06, + "loss": 0.9554, + "step": 10161 + }, + { + "epoch": 0.9079902606830924, + "grad_norm": 0.49454933404922485, + "learning_rate": 2.200891851278175e-06, + "loss": 0.8927, + "step": 10162 + }, + { + "epoch": 0.9080796122143543, + "grad_norm": 0.46944358944892883, + "learning_rate": 2.1966477861742607e-06, + "loss": 0.9581, + "step": 10163 + }, + { + "epoch": 0.9081689637456162, + "grad_norm": 0.48867207765579224, + "learning_rate": 2.1924077251106347e-06, + "loss": 1.0078, + "step": 10164 + }, + { + "epoch": 0.9082583152768781, + "grad_norm": 0.46018025279045105, + "learning_rate": 2.1881716684424568e-06, + "loss": 0.9385, + "step": 10165 + }, + { + "epoch": 0.9083476668081399, + "grad_norm": 0.4881027340888977, + "learning_rate": 2.183939616524533e-06, + "loss": 0.9308, + "step": 10166 + }, + { + "epoch": 0.9084370183394018, + "grad_norm": 0.4833429157733917, + "learning_rate": 2.1797115697113624e-06, + "loss": 1.0144, + "step": 10167 + }, + { + "epoch": 0.9085263698706637, + "grad_norm": 0.42278674244880676, + "learning_rate": 2.175487528357062e-06, + "loss": 0.9192, + "step": 10168 + }, + { + "epoch": 0.9086157214019255, + "grad_norm": 0.632034957408905, + "learning_rate": 2.17126749281546e-06, + "loss": 0.9533, + "step": 10169 + }, + { + "epoch": 0.9087050729331874, + "grad_norm": 0.5221367478370667, + "learning_rate": 2.1670514634400173e-06, + "loss": 0.9187, + "step": 10170 + }, + { + "epoch": 0.9087944244644492, + "grad_norm": 0.4272196292877197, + "learning_rate": 2.1628394405838803e-06, + "loss": 1.0073, + "step": 10171 + }, + { + "epoch": 0.9088837759957111, + "grad_norm": 0.44251886010169983, + "learning_rate": 2.1586314245998497e-06, + "loss": 0.9993, + "step": 10172 + }, + { + "epoch": 0.908973127526973, + "grad_norm": 0.6176732778549194, + "learning_rate": 2.1544274158403877e-06, + "loss": 0.8595, + "step": 10173 + }, + { + "epoch": 0.9090624790582349, + "grad_norm": 0.4189254343509674, + "learning_rate": 2.150227414657624e-06, + "loss": 0.9154, + "step": 10174 + }, + { + "epoch": 0.9091518305894968, + "grad_norm": 0.47814953327178955, + "learning_rate": 2.1460314214033662e-06, + "loss": 0.94, + "step": 10175 + }, + { + "epoch": 0.9092411821207586, + "grad_norm": 0.4514091908931732, + "learning_rate": 2.141839436429055e-06, + "loss": 0.9883, + "step": 10176 + }, + { + "epoch": 0.9093305336520204, + "grad_norm": 0.38325291872024536, + "learning_rate": 2.137651460085821e-06, + "loss": 0.9605, + "step": 10177 + }, + { + "epoch": 0.9094198851832823, + "grad_norm": 0.4543987512588501, + "learning_rate": 2.1334674927244556e-06, + "loss": 0.9641, + "step": 10178 + }, + { + "epoch": 0.9095092367145442, + "grad_norm": 0.4916648268699646, + "learning_rate": 2.1292875346954123e-06, + "loss": 0.9521, + "step": 10179 + }, + { + "epoch": 0.9095985882458061, + "grad_norm": 0.49808797240257263, + "learning_rate": 2.1251115863487934e-06, + "loss": 0.9676, + "step": 10180 + }, + { + "epoch": 0.909687939777068, + "grad_norm": 0.4320428669452667, + "learning_rate": 2.1209396480343977e-06, + "loss": 0.9669, + "step": 10181 + }, + { + "epoch": 0.9097772913083298, + "grad_norm": 0.5682074427604675, + "learning_rate": 2.1167717201016568e-06, + "loss": 0.8171, + "step": 10182 + }, + { + "epoch": 0.9098666428395916, + "grad_norm": 0.5299460291862488, + "learning_rate": 2.11260780289968e-06, + "loss": 0.9178, + "step": 10183 + }, + { + "epoch": 0.9099559943708535, + "grad_norm": 0.4480501711368561, + "learning_rate": 2.1084478967772494e-06, + "loss": 0.9736, + "step": 10184 + }, + { + "epoch": 0.9100453459021154, + "grad_norm": 0.5181623697280884, + "learning_rate": 2.1042920020827974e-06, + "loss": 0.9647, + "step": 10185 + }, + { + "epoch": 0.9101346974333773, + "grad_norm": 0.6131975650787354, + "learning_rate": 2.100140119164412e-06, + "loss": 0.9173, + "step": 10186 + }, + { + "epoch": 0.9102240489646392, + "grad_norm": 0.5780476927757263, + "learning_rate": 2.095992248369871e-06, + "loss": 0.9846, + "step": 10187 + }, + { + "epoch": 0.910313400495901, + "grad_norm": 0.4459938108921051, + "learning_rate": 2.0918483900466025e-06, + "loss": 0.9798, + "step": 10188 + }, + { + "epoch": 0.9104027520271629, + "grad_norm": 0.47280803322792053, + "learning_rate": 2.087708544541689e-06, + "loss": 0.9052, + "step": 10189 + }, + { + "epoch": 0.9104921035584247, + "grad_norm": 0.5585520267486572, + "learning_rate": 2.083572712201898e-06, + "loss": 0.8532, + "step": 10190 + }, + { + "epoch": 0.9105814550896866, + "grad_norm": 0.5118190050125122, + "learning_rate": 2.079440893373641e-06, + "loss": 0.9004, + "step": 10191 + }, + { + "epoch": 0.9106708066209485, + "grad_norm": 0.503633975982666, + "learning_rate": 2.075313088403008e-06, + "loss": 0.8772, + "step": 10192 + }, + { + "epoch": 0.9107601581522103, + "grad_norm": 0.45112094283103943, + "learning_rate": 2.071189297635745e-06, + "loss": 0.9163, + "step": 10193 + }, + { + "epoch": 0.9108495096834722, + "grad_norm": 0.4446631669998169, + "learning_rate": 2.067069521417264e-06, + "loss": 0.9806, + "step": 10194 + }, + { + "epoch": 0.9109388612147341, + "grad_norm": 0.525492250919342, + "learning_rate": 2.0629537600926395e-06, + "loss": 0.9365, + "step": 10195 + }, + { + "epoch": 0.9110282127459959, + "grad_norm": 0.4737085998058319, + "learning_rate": 2.0588420140066067e-06, + "loss": 1.0117, + "step": 10196 + }, + { + "epoch": 0.9111175642772578, + "grad_norm": 0.4597386121749878, + "learning_rate": 2.0547342835035733e-06, + "loss": 0.896, + "step": 10197 + }, + { + "epoch": 0.9112069158085196, + "grad_norm": 0.5008763074874878, + "learning_rate": 2.050630568927603e-06, + "loss": 0.9568, + "step": 10198 + }, + { + "epoch": 0.9112962673397815, + "grad_norm": 0.5139524340629578, + "learning_rate": 2.0465308706224207e-06, + "loss": 0.8624, + "step": 10199 + }, + { + "epoch": 0.9113856188710434, + "grad_norm": 0.5094760060310364, + "learning_rate": 2.0424351889314354e-06, + "loss": 0.9532, + "step": 10200 + }, + { + "epoch": 0.9114749704023053, + "grad_norm": 0.5391070246696472, + "learning_rate": 2.038343524197689e-06, + "loss": 0.8551, + "step": 10201 + }, + { + "epoch": 0.9115643219335672, + "grad_norm": 0.5024666786193848, + "learning_rate": 2.0342558767639074e-06, + "loss": 0.8772, + "step": 10202 + }, + { + "epoch": 0.911653673464829, + "grad_norm": 0.48186391592025757, + "learning_rate": 2.0301722469724726e-06, + "loss": 0.875, + "step": 10203 + }, + { + "epoch": 0.9117430249960908, + "grad_norm": 0.44918540120124817, + "learning_rate": 2.026092635165433e-06, + "loss": 0.9593, + "step": 10204 + }, + { + "epoch": 0.9118323765273527, + "grad_norm": 0.48004093766212463, + "learning_rate": 2.022017041684504e-06, + "loss": 0.9952, + "step": 10205 + }, + { + "epoch": 0.9119217280586146, + "grad_norm": 0.4600532352924347, + "learning_rate": 2.0179454668710575e-06, + "loss": 0.9583, + "step": 10206 + }, + { + "epoch": 0.9120110795898765, + "grad_norm": 0.4377675950527191, + "learning_rate": 2.0138779110661252e-06, + "loss": 0.8938, + "step": 10207 + }, + { + "epoch": 0.9121004311211384, + "grad_norm": 0.6013002395629883, + "learning_rate": 2.0098143746104135e-06, + "loss": 0.7857, + "step": 10208 + }, + { + "epoch": 0.9121897826524002, + "grad_norm": 0.5890505313873291, + "learning_rate": 2.005754857844283e-06, + "loss": 0.9102, + "step": 10209 + }, + { + "epoch": 0.912279134183662, + "grad_norm": 0.48680880665779114, + "learning_rate": 2.0016993611077726e-06, + "loss": 0.944, + "step": 10210 + }, + { + "epoch": 0.9123684857149239, + "grad_norm": 0.5501675009727478, + "learning_rate": 1.997647884740561e-06, + "loss": 0.872, + "step": 10211 + }, + { + "epoch": 0.9124578372461858, + "grad_norm": 0.46747952699661255, + "learning_rate": 1.9936004290820098e-06, + "loss": 0.9101, + "step": 10212 + }, + { + "epoch": 0.9125471887774477, + "grad_norm": 0.47848084568977356, + "learning_rate": 1.989556994471131e-06, + "loss": 0.9689, + "step": 10213 + }, + { + "epoch": 0.9126365403087096, + "grad_norm": 0.42791107296943665, + "learning_rate": 1.98551758124661e-06, + "loss": 0.9226, + "step": 10214 + }, + { + "epoch": 0.9127258918399714, + "grad_norm": 0.6010297536849976, + "learning_rate": 1.9814821897467973e-06, + "loss": 0.8729, + "step": 10215 + }, + { + "epoch": 0.9128152433712333, + "grad_norm": 0.5505849719047546, + "learning_rate": 1.977450820309684e-06, + "loss": 0.9354, + "step": 10216 + }, + { + "epoch": 0.9129045949024951, + "grad_norm": 0.7786350250244141, + "learning_rate": 1.973423473272945e-06, + "loss": 0.8643, + "step": 10217 + }, + { + "epoch": 0.912993946433757, + "grad_norm": 0.596895694732666, + "learning_rate": 1.9694001489739213e-06, + "loss": 0.9079, + "step": 10218 + }, + { + "epoch": 0.9130832979650189, + "grad_norm": 0.4010644555091858, + "learning_rate": 1.9653808477496038e-06, + "loss": 0.9816, + "step": 10219 + }, + { + "epoch": 0.9131726494962807, + "grad_norm": 0.537714421749115, + "learning_rate": 1.9613655699366464e-06, + "loss": 0.9397, + "step": 10220 + }, + { + "epoch": 0.9132620010275426, + "grad_norm": 0.4720018804073334, + "learning_rate": 1.957354315871385e-06, + "loss": 0.9971, + "step": 10221 + }, + { + "epoch": 0.9133513525588045, + "grad_norm": 0.45322003960609436, + "learning_rate": 1.953347085889795e-06, + "loss": 0.9162, + "step": 10222 + }, + { + "epoch": 0.9134407040900664, + "grad_norm": 0.5041544437408447, + "learning_rate": 1.9493438803275257e-06, + "loss": 0.9266, + "step": 10223 + }, + { + "epoch": 0.9135300556213282, + "grad_norm": 0.5631766319274902, + "learning_rate": 1.945344699519891e-06, + "loss": 0.896, + "step": 10224 + }, + { + "epoch": 0.91361940715259, + "grad_norm": 0.5237100124359131, + "learning_rate": 1.9413495438018736e-06, + "loss": 0.9955, + "step": 10225 + }, + { + "epoch": 0.9137087586838519, + "grad_norm": 0.5129988193511963, + "learning_rate": 1.937358413508089e-06, + "loss": 0.9253, + "step": 10226 + }, + { + "epoch": 0.9137981102151138, + "grad_norm": 0.45968538522720337, + "learning_rate": 1.933371308972848e-06, + "loss": 0.9623, + "step": 10227 + }, + { + "epoch": 0.9138874617463757, + "grad_norm": 0.5253444314002991, + "learning_rate": 1.929388230530116e-06, + "loss": 1.0036, + "step": 10228 + }, + { + "epoch": 0.9139768132776376, + "grad_norm": 0.5315740704536438, + "learning_rate": 1.9254091785135153e-06, + "loss": 0.89, + "step": 10229 + }, + { + "epoch": 0.9140661648088995, + "grad_norm": 0.4819380044937134, + "learning_rate": 1.9214341532563296e-06, + "loss": 0.936, + "step": 10230 + }, + { + "epoch": 0.9141555163401612, + "grad_norm": 0.5596315264701843, + "learning_rate": 1.91746315509152e-06, + "loss": 0.8871, + "step": 10231 + }, + { + "epoch": 0.9142448678714231, + "grad_norm": 0.4711841344833374, + "learning_rate": 1.913496184351693e-06, + "loss": 0.9442, + "step": 10232 + }, + { + "epoch": 0.914334219402685, + "grad_norm": 0.4977385997772217, + "learning_rate": 1.9095332413691326e-06, + "loss": 0.9563, + "step": 10233 + }, + { + "epoch": 0.9144235709339469, + "grad_norm": 0.592835545539856, + "learning_rate": 1.905574326475762e-06, + "loss": 0.8969, + "step": 10234 + }, + { + "epoch": 0.9145129224652088, + "grad_norm": 0.47582802176475525, + "learning_rate": 1.9016194400031884e-06, + "loss": 0.9354, + "step": 10235 + }, + { + "epoch": 0.9146022739964706, + "grad_norm": 0.4953663647174835, + "learning_rate": 1.8976685822826856e-06, + "loss": 0.9497, + "step": 10236 + }, + { + "epoch": 0.9146916255277325, + "grad_norm": 0.49911701679229736, + "learning_rate": 1.8937217536451778e-06, + "loss": 0.9374, + "step": 10237 + }, + { + "epoch": 0.9147809770589943, + "grad_norm": 0.5435790419578552, + "learning_rate": 1.8897789544212396e-06, + "loss": 0.8251, + "step": 10238 + }, + { + "epoch": 0.9148703285902562, + "grad_norm": 0.5200591683387756, + "learning_rate": 1.8858401849411344e-06, + "loss": 0.9244, + "step": 10239 + }, + { + "epoch": 0.9149596801215181, + "grad_norm": 0.6190335750579834, + "learning_rate": 1.8819054455347707e-06, + "loss": 0.8485, + "step": 10240 + }, + { + "epoch": 0.91504903165278, + "grad_norm": 0.5141425132751465, + "learning_rate": 1.877974736531729e-06, + "loss": 0.946, + "step": 10241 + }, + { + "epoch": 0.9151383831840418, + "grad_norm": 0.5247118473052979, + "learning_rate": 1.8740480582612519e-06, + "loss": 0.942, + "step": 10242 + }, + { + "epoch": 0.9152277347153037, + "grad_norm": 0.5821022391319275, + "learning_rate": 1.8701254110522315e-06, + "loss": 0.9661, + "step": 10243 + }, + { + "epoch": 0.9153170862465656, + "grad_norm": 0.6091264486312866, + "learning_rate": 1.8662067952332386e-06, + "loss": 0.9005, + "step": 10244 + }, + { + "epoch": 0.9154064377778274, + "grad_norm": 0.44241493940353394, + "learning_rate": 1.8622922111324937e-06, + "loss": 0.9736, + "step": 10245 + }, + { + "epoch": 0.9154957893090893, + "grad_norm": 0.4991171360015869, + "learning_rate": 1.8583816590778901e-06, + "loss": 0.9084, + "step": 10246 + }, + { + "epoch": 0.9155851408403511, + "grad_norm": 0.6471422910690308, + "learning_rate": 1.8544751393969716e-06, + "loss": 0.8957, + "step": 10247 + }, + { + "epoch": 0.915674492371613, + "grad_norm": 0.5588064193725586, + "learning_rate": 1.8505726524169598e-06, + "loss": 0.8469, + "step": 10248 + }, + { + "epoch": 0.9157638439028749, + "grad_norm": 0.4519459307193756, + "learning_rate": 1.8466741984647151e-06, + "loss": 0.9619, + "step": 10249 + }, + { + "epoch": 0.9158531954341368, + "grad_norm": 0.5813220739364624, + "learning_rate": 1.8427797778667931e-06, + "loss": 0.9573, + "step": 10250 + }, + { + "epoch": 0.9159425469653987, + "grad_norm": 0.4593013823032379, + "learning_rate": 1.8388893909493775e-06, + "loss": 1.0341, + "step": 10251 + }, + { + "epoch": 0.9160318984966604, + "grad_norm": 0.45809662342071533, + "learning_rate": 1.8350030380383355e-06, + "loss": 0.957, + "step": 10252 + }, + { + "epoch": 0.9161212500279223, + "grad_norm": 0.58559250831604, + "learning_rate": 1.831120719459195e-06, + "loss": 0.8983, + "step": 10253 + }, + { + "epoch": 0.9162106015591842, + "grad_norm": 0.4311509132385254, + "learning_rate": 1.8272424355371353e-06, + "loss": 0.903, + "step": 10254 + }, + { + "epoch": 0.9162999530904461, + "grad_norm": 0.49322575330734253, + "learning_rate": 1.8233681865970077e-06, + "loss": 0.937, + "step": 10255 + }, + { + "epoch": 0.916389304621708, + "grad_norm": 0.4198033809661865, + "learning_rate": 1.8194979729633244e-06, + "loss": 1.0386, + "step": 10256 + }, + { + "epoch": 0.9164786561529699, + "grad_norm": 0.4912882149219513, + "learning_rate": 1.8156317949602486e-06, + "loss": 0.9599, + "step": 10257 + }, + { + "epoch": 0.9165680076842316, + "grad_norm": 0.42434144020080566, + "learning_rate": 1.8117696529116213e-06, + "loss": 0.943, + "step": 10258 + }, + { + "epoch": 0.9166573592154935, + "grad_norm": 0.5602464079856873, + "learning_rate": 1.807911547140928e-06, + "loss": 0.851, + "step": 10259 + }, + { + "epoch": 0.9167467107467554, + "grad_norm": 0.4524661898612976, + "learning_rate": 1.8040574779713382e-06, + "loss": 1.0004, + "step": 10260 + }, + { + "epoch": 0.9168360622780173, + "grad_norm": 0.5537035465240479, + "learning_rate": 1.8002074457256658e-06, + "loss": 0.9157, + "step": 10261 + }, + { + "epoch": 0.9169254138092792, + "grad_norm": 0.48981863260269165, + "learning_rate": 1.796361450726397e-06, + "loss": 0.9785, + "step": 10262 + }, + { + "epoch": 0.917014765340541, + "grad_norm": 0.6159743070602417, + "learning_rate": 1.7925194932956635e-06, + "loss": 0.9111, + "step": 10263 + }, + { + "epoch": 0.9171041168718029, + "grad_norm": 0.40217313170433044, + "learning_rate": 1.7886815737552797e-06, + "loss": 0.9608, + "step": 10264 + }, + { + "epoch": 0.9171934684030647, + "grad_norm": 0.4525063633918762, + "learning_rate": 1.784847692426711e-06, + "loss": 1.0066, + "step": 10265 + }, + { + "epoch": 0.9172828199343266, + "grad_norm": 0.6385212540626526, + "learning_rate": 1.7810178496310891e-06, + "loss": 0.8665, + "step": 10266 + }, + { + "epoch": 0.9173721714655885, + "grad_norm": 0.4889377951622009, + "learning_rate": 1.7771920456891966e-06, + "loss": 0.9201, + "step": 10267 + }, + { + "epoch": 0.9174615229968504, + "grad_norm": 0.5491172671318054, + "learning_rate": 1.7733702809214825e-06, + "loss": 0.8458, + "step": 10268 + }, + { + "epoch": 0.9175508745281122, + "grad_norm": 0.48966601490974426, + "learning_rate": 1.7695525556480686e-06, + "loss": 0.9615, + "step": 10269 + }, + { + "epoch": 0.9176402260593741, + "grad_norm": 0.5428771376609802, + "learning_rate": 1.7657388701887379e-06, + "loss": 0.9203, + "step": 10270 + }, + { + "epoch": 0.917729577590636, + "grad_norm": 0.45766574144363403, + "learning_rate": 1.7619292248629071e-06, + "loss": 0.9307, + "step": 10271 + }, + { + "epoch": 0.9178189291218978, + "grad_norm": 0.49701622128486633, + "learning_rate": 1.7581236199896879e-06, + "loss": 0.868, + "step": 10272 + }, + { + "epoch": 0.9179082806531597, + "grad_norm": 0.4536900520324707, + "learning_rate": 1.754322055887836e-06, + "loss": 0.9504, + "step": 10273 + }, + { + "epoch": 0.9179976321844215, + "grad_norm": 0.5021615028381348, + "learning_rate": 1.750524532875769e-06, + "loss": 0.9365, + "step": 10274 + }, + { + "epoch": 0.9180869837156834, + "grad_norm": 0.4921415448188782, + "learning_rate": 1.7467310512715774e-06, + "loss": 0.966, + "step": 10275 + }, + { + "epoch": 0.9181763352469453, + "grad_norm": 0.5077376961708069, + "learning_rate": 1.742941611393012e-06, + "loss": 0.9469, + "step": 10276 + }, + { + "epoch": 0.9182656867782072, + "grad_norm": 0.45509567856788635, + "learning_rate": 1.7391562135574634e-06, + "loss": 0.9654, + "step": 10277 + }, + { + "epoch": 0.9183550383094691, + "grad_norm": 0.4772971570491791, + "learning_rate": 1.7353748580820061e-06, + "loss": 0.9162, + "step": 10278 + }, + { + "epoch": 0.9184443898407308, + "grad_norm": 0.5425154566764832, + "learning_rate": 1.7315975452833645e-06, + "loss": 0.9155, + "step": 10279 + }, + { + "epoch": 0.9185337413719927, + "grad_norm": 0.5885468125343323, + "learning_rate": 1.727824275477935e-06, + "loss": 0.9321, + "step": 10280 + }, + { + "epoch": 0.9186230929032546, + "grad_norm": 0.41930609941482544, + "learning_rate": 1.7240550489817653e-06, + "loss": 0.9593, + "step": 10281 + }, + { + "epoch": 0.9187124444345165, + "grad_norm": 0.5219174027442932, + "learning_rate": 1.7202898661105748e-06, + "loss": 0.9323, + "step": 10282 + }, + { + "epoch": 0.9188017959657784, + "grad_norm": 0.5876883268356323, + "learning_rate": 1.716528727179728e-06, + "loss": 0.9279, + "step": 10283 + }, + { + "epoch": 0.9188911474970403, + "grad_norm": 0.5892945528030396, + "learning_rate": 1.712771632504262e-06, + "loss": 0.8964, + "step": 10284 + }, + { + "epoch": 0.9189804990283021, + "grad_norm": 0.4626266062259674, + "learning_rate": 1.7090185823988857e-06, + "loss": 0.907, + "step": 10285 + }, + { + "epoch": 0.9190698505595639, + "grad_norm": 0.6098506450653076, + "learning_rate": 1.7052695771779481e-06, + "loss": 0.8302, + "step": 10286 + }, + { + "epoch": 0.9191592020908258, + "grad_norm": 0.5348356366157532, + "learning_rate": 1.7015246171554644e-06, + "loss": 0.9675, + "step": 10287 + }, + { + "epoch": 0.9192485536220877, + "grad_norm": 0.46181049942970276, + "learning_rate": 1.697783702645117e-06, + "loss": 0.849, + "step": 10288 + }, + { + "epoch": 0.9193379051533496, + "grad_norm": 0.5558209419250488, + "learning_rate": 1.69404683396025e-06, + "loss": 0.9733, + "step": 10289 + }, + { + "epoch": 0.9194272566846114, + "grad_norm": 0.48256248235702515, + "learning_rate": 1.6903140114138627e-06, + "loss": 0.948, + "step": 10290 + }, + { + "epoch": 0.9195166082158733, + "grad_norm": 0.6082019805908203, + "learning_rate": 1.6865852353186218e-06, + "loss": 0.9981, + "step": 10291 + }, + { + "epoch": 0.9196059597471352, + "grad_norm": 0.4192001223564148, + "learning_rate": 1.6828605059868552e-06, + "loss": 1.004, + "step": 10292 + }, + { + "epoch": 0.919695311278397, + "grad_norm": 0.5222572684288025, + "learning_rate": 1.6791398237305412e-06, + "loss": 0.9626, + "step": 10293 + }, + { + "epoch": 0.9197846628096589, + "grad_norm": 0.48702099919319153, + "learning_rate": 1.6754231888613304e-06, + "loss": 0.9927, + "step": 10294 + }, + { + "epoch": 0.9198740143409208, + "grad_norm": 0.5022723078727722, + "learning_rate": 1.6717106016905348e-06, + "loss": 0.9374, + "step": 10295 + }, + { + "epoch": 0.9199633658721826, + "grad_norm": 0.3965294063091278, + "learning_rate": 1.6680020625291227e-06, + "loss": 0.9798, + "step": 10296 + }, + { + "epoch": 0.9200527174034445, + "grad_norm": 0.48680707812309265, + "learning_rate": 1.6642975716877118e-06, + "loss": 0.9358, + "step": 10297 + }, + { + "epoch": 0.9201420689347064, + "grad_norm": 0.5684248805046082, + "learning_rate": 1.6605971294766044e-06, + "loss": 0.848, + "step": 10298 + }, + { + "epoch": 0.9202314204659683, + "grad_norm": 0.5401350855827332, + "learning_rate": 1.6569007362057465e-06, + "loss": 0.9419, + "step": 10299 + }, + { + "epoch": 0.9203207719972301, + "grad_norm": 0.5058586001396179, + "learning_rate": 1.653208392184752e-06, + "loss": 0.877, + "step": 10300 + }, + { + "epoch": 0.9204101235284919, + "grad_norm": 0.4828777611255646, + "learning_rate": 1.6495200977228897e-06, + "loss": 0.9071, + "step": 10301 + }, + { + "epoch": 0.9204994750597538, + "grad_norm": 0.5211459994316101, + "learning_rate": 1.6458358531291074e-06, + "loss": 0.926, + "step": 10302 + }, + { + "epoch": 0.9205888265910157, + "grad_norm": 0.4530414044857025, + "learning_rate": 1.6421556587119913e-06, + "loss": 0.9432, + "step": 10303 + }, + { + "epoch": 0.9206781781222776, + "grad_norm": 0.4439500868320465, + "learning_rate": 1.6384795147797894e-06, + "loss": 0.9568, + "step": 10304 + }, + { + "epoch": 0.9207675296535395, + "grad_norm": 0.43074101209640503, + "learning_rate": 1.6348074216404273e-06, + "loss": 1.1121, + "step": 10305 + }, + { + "epoch": 0.9208568811848014, + "grad_norm": 0.5717933177947998, + "learning_rate": 1.6311393796014819e-06, + "loss": 0.8781, + "step": 10306 + }, + { + "epoch": 0.9209462327160631, + "grad_norm": 0.4109342396259308, + "learning_rate": 1.62747538897019e-06, + "loss": 0.9698, + "step": 10307 + }, + { + "epoch": 0.921035584247325, + "grad_norm": 0.4915206730365753, + "learning_rate": 1.6238154500534452e-06, + "loss": 0.9776, + "step": 10308 + }, + { + "epoch": 0.9211249357785869, + "grad_norm": 0.5269554853439331, + "learning_rate": 1.620159563157808e-06, + "loss": 0.8828, + "step": 10309 + }, + { + "epoch": 0.9212142873098488, + "grad_norm": 0.446888267993927, + "learning_rate": 1.6165077285895002e-06, + "loss": 0.9208, + "step": 10310 + }, + { + "epoch": 0.9213036388411107, + "grad_norm": 0.4172409772872925, + "learning_rate": 1.6128599466543993e-06, + "loss": 0.9789, + "step": 10311 + }, + { + "epoch": 0.9213929903723725, + "grad_norm": 0.5068113803863525, + "learning_rate": 1.6092162176580494e-06, + "loss": 0.8979, + "step": 10312 + }, + { + "epoch": 0.9214823419036344, + "grad_norm": 0.5210028290748596, + "learning_rate": 1.6055765419056456e-06, + "loss": 0.9481, + "step": 10313 + }, + { + "epoch": 0.9215716934348962, + "grad_norm": 0.48873379826545715, + "learning_rate": 1.6019409197020607e-06, + "loss": 0.9693, + "step": 10314 + }, + { + "epoch": 0.9216610449661581, + "grad_norm": 0.4486122727394104, + "learning_rate": 1.5983093513518066e-06, + "loss": 0.946, + "step": 10315 + }, + { + "epoch": 0.92175039649742, + "grad_norm": 0.38552728295326233, + "learning_rate": 1.5946818371590787e-06, + "loss": 0.9608, + "step": 10316 + }, + { + "epoch": 0.9218397480286818, + "grad_norm": 0.5378084182739258, + "learning_rate": 1.591058377427701e-06, + "loss": 0.9168, + "step": 10317 + }, + { + "epoch": 0.9219290995599437, + "grad_norm": 0.48840612173080444, + "learning_rate": 1.587438972461186e-06, + "loss": 0.9724, + "step": 10318 + }, + { + "epoch": 0.9220184510912056, + "grad_norm": 0.5533527135848999, + "learning_rate": 1.5838236225626968e-06, + "loss": 0.8688, + "step": 10319 + }, + { + "epoch": 0.9221078026224674, + "grad_norm": 0.6588174700737, + "learning_rate": 1.5802123280350633e-06, + "loss": 0.8956, + "step": 10320 + }, + { + "epoch": 0.9221971541537293, + "grad_norm": 0.4313110411167145, + "learning_rate": 1.5766050891807604e-06, + "loss": 0.9738, + "step": 10321 + }, + { + "epoch": 0.9222865056849912, + "grad_norm": 0.44930917024612427, + "learning_rate": 1.5730019063019407e-06, + "loss": 1.0021, + "step": 10322 + }, + { + "epoch": 0.922375857216253, + "grad_norm": 0.6756159067153931, + "learning_rate": 1.5694027797004073e-06, + "loss": 0.8297, + "step": 10323 + }, + { + "epoch": 0.9224652087475149, + "grad_norm": 0.5268403887748718, + "learning_rate": 1.5658077096776192e-06, + "loss": 0.9241, + "step": 10324 + }, + { + "epoch": 0.9225545602787768, + "grad_norm": 0.4503045082092285, + "learning_rate": 1.562216696534713e-06, + "loss": 0.8687, + "step": 10325 + }, + { + "epoch": 0.9226439118100387, + "grad_norm": 0.5141546130180359, + "learning_rate": 1.5586297405724648e-06, + "loss": 0.8619, + "step": 10326 + }, + { + "epoch": 0.9227332633413005, + "grad_norm": 0.5367921590805054, + "learning_rate": 1.5550468420913288e-06, + "loss": 0.9026, + "step": 10327 + }, + { + "epoch": 0.9228226148725623, + "grad_norm": 0.575573205947876, + "learning_rate": 1.5514680013913984e-06, + "loss": 0.897, + "step": 10328 + }, + { + "epoch": 0.9229119664038242, + "grad_norm": 0.44511479139328003, + "learning_rate": 1.5478932187724504e-06, + "loss": 0.9417, + "step": 10329 + }, + { + "epoch": 0.9230013179350861, + "grad_norm": 0.6052654981613159, + "learning_rate": 1.5443224945339063e-06, + "loss": 0.9232, + "step": 10330 + }, + { + "epoch": 0.923090669466348, + "grad_norm": 0.4598505198955536, + "learning_rate": 1.5407558289748547e-06, + "loss": 1.0025, + "step": 10331 + }, + { + "epoch": 0.9231800209976099, + "grad_norm": 0.5528203845024109, + "learning_rate": 1.53719322239404e-06, + "loss": 0.9608, + "step": 10332 + }, + { + "epoch": 0.9232693725288718, + "grad_norm": 0.47037777304649353, + "learning_rate": 1.5336346750898678e-06, + "loss": 0.9687, + "step": 10333 + }, + { + "epoch": 0.9233587240601335, + "grad_norm": 0.5850094556808472, + "learning_rate": 1.530080187360411e-06, + "loss": 0.9053, + "step": 10334 + }, + { + "epoch": 0.9234480755913954, + "grad_norm": 0.552905797958374, + "learning_rate": 1.5265297595033868e-06, + "loss": 0.9073, + "step": 10335 + }, + { + "epoch": 0.9235374271226573, + "grad_norm": 0.5310145020484924, + "learning_rate": 1.5229833918161906e-06, + "loss": 0.9366, + "step": 10336 + }, + { + "epoch": 0.9236267786539192, + "grad_norm": 0.4303600490093231, + "learning_rate": 1.5194410845958574e-06, + "loss": 0.9336, + "step": 10337 + }, + { + "epoch": 0.9237161301851811, + "grad_norm": 0.44541823863983154, + "learning_rate": 1.5159028381390994e-06, + "loss": 0.9732, + "step": 10338 + }, + { + "epoch": 0.9238054817164429, + "grad_norm": 0.6362563967704773, + "learning_rate": 1.5123686527422854e-06, + "loss": 0.9495, + "step": 10339 + }, + { + "epoch": 0.9238948332477048, + "grad_norm": 0.4896201491355896, + "learning_rate": 1.5088385287014395e-06, + "loss": 0.8939, + "step": 10340 + }, + { + "epoch": 0.9239841847789666, + "grad_norm": 0.5114855766296387, + "learning_rate": 1.5053124663122419e-06, + "loss": 0.8782, + "step": 10341 + }, + { + "epoch": 0.9240735363102285, + "grad_norm": 0.4664299190044403, + "learning_rate": 1.5017904658700398e-06, + "loss": 0.9548, + "step": 10342 + }, + { + "epoch": 0.9241628878414904, + "grad_norm": 0.5291177034378052, + "learning_rate": 1.4982725276698418e-06, + "loss": 0.8842, + "step": 10343 + }, + { + "epoch": 0.9242522393727522, + "grad_norm": 0.5422632694244385, + "learning_rate": 1.4947586520063062e-06, + "loss": 0.8602, + "step": 10344 + }, + { + "epoch": 0.9243415909040141, + "grad_norm": 0.6073907613754272, + "learning_rate": 1.49124883917377e-06, + "loss": 0.8669, + "step": 10345 + }, + { + "epoch": 0.924430942435276, + "grad_norm": 0.6418290734291077, + "learning_rate": 1.4877430894662036e-06, + "loss": 0.8869, + "step": 10346 + }, + { + "epoch": 0.9245202939665379, + "grad_norm": 0.634617030620575, + "learning_rate": 1.4842414031772612e-06, + "loss": 0.8845, + "step": 10347 + }, + { + "epoch": 0.9246096454977997, + "grad_norm": 0.5437284708023071, + "learning_rate": 1.480743780600241e-06, + "loss": 0.9179, + "step": 10348 + }, + { + "epoch": 0.9246989970290616, + "grad_norm": 0.44016095995903015, + "learning_rate": 1.4772502220281093e-06, + "loss": 0.9627, + "step": 10349 + }, + { + "epoch": 0.9247883485603234, + "grad_norm": 0.39443960785865784, + "learning_rate": 1.4737607277534815e-06, + "loss": 0.9157, + "step": 10350 + }, + { + "epoch": 0.9248777000915853, + "grad_norm": 0.4765508770942688, + "learning_rate": 1.4702752980686462e-06, + "loss": 0.973, + "step": 10351 + }, + { + "epoch": 0.9249670516228472, + "grad_norm": 0.5301365256309509, + "learning_rate": 1.4667939332655478e-06, + "loss": 0.8771, + "step": 10352 + }, + { + "epoch": 0.9250564031541091, + "grad_norm": 0.48352834582328796, + "learning_rate": 1.4633166336357807e-06, + "loss": 0.9077, + "step": 10353 + }, + { + "epoch": 0.925145754685371, + "grad_norm": 0.540468156337738, + "learning_rate": 1.4598433994706117e-06, + "loss": 0.9076, + "step": 10354 + }, + { + "epoch": 0.9252351062166327, + "grad_norm": 0.42415082454681396, + "learning_rate": 1.4563742310609529e-06, + "loss": 0.9269, + "step": 10355 + }, + { + "epoch": 0.9253244577478946, + "grad_norm": 0.4903179407119751, + "learning_rate": 1.4529091286973995e-06, + "loss": 0.933, + "step": 10356 + }, + { + "epoch": 0.9254138092791565, + "grad_norm": 0.5395703315734863, + "learning_rate": 1.4494480926701803e-06, + "loss": 0.8883, + "step": 10357 + }, + { + "epoch": 0.9255031608104184, + "grad_norm": 0.5688163042068481, + "learning_rate": 1.4459911232691914e-06, + "loss": 0.9469, + "step": 10358 + }, + { + "epoch": 0.9255925123416803, + "grad_norm": 0.41127413511276245, + "learning_rate": 1.4425382207839954e-06, + "loss": 1.0157, + "step": 10359 + }, + { + "epoch": 0.9256818638729422, + "grad_norm": 0.46226775646209717, + "learning_rate": 1.4390893855038057e-06, + "loss": 0.9535, + "step": 10360 + }, + { + "epoch": 0.925771215404204, + "grad_norm": 0.5173824429512024, + "learning_rate": 1.4356446177175077e-06, + "loss": 0.9242, + "step": 10361 + }, + { + "epoch": 0.9258605669354658, + "grad_norm": 0.518804669380188, + "learning_rate": 1.432203917713626e-06, + "loss": 0.9252, + "step": 10362 + }, + { + "epoch": 0.9259499184667277, + "grad_norm": 0.5390114188194275, + "learning_rate": 1.4287672857803636e-06, + "loss": 0.8615, + "step": 10363 + }, + { + "epoch": 0.9260392699979896, + "grad_norm": 0.48673346638679504, + "learning_rate": 1.4253347222055735e-06, + "loss": 0.9964, + "step": 10364 + }, + { + "epoch": 0.9261286215292515, + "grad_norm": 0.5194094181060791, + "learning_rate": 1.4219062272767703e-06, + "loss": 0.9008, + "step": 10365 + }, + { + "epoch": 0.9262179730605133, + "grad_norm": 0.5350120067596436, + "learning_rate": 1.418481801281124e-06, + "loss": 0.8815, + "step": 10366 + }, + { + "epoch": 0.9263073245917752, + "grad_norm": 0.4165932536125183, + "learning_rate": 1.4150614445054778e-06, + "loss": 0.9316, + "step": 10367 + }, + { + "epoch": 0.9263966761230371, + "grad_norm": 0.6019562482833862, + "learning_rate": 1.411645157236302e-06, + "loss": 0.9976, + "step": 10368 + }, + { + "epoch": 0.9264860276542989, + "grad_norm": 0.5063771605491638, + "learning_rate": 1.4082329397597626e-06, + "loss": 0.972, + "step": 10369 + }, + { + "epoch": 0.9265753791855608, + "grad_norm": 0.46804505586624146, + "learning_rate": 1.4048247923616642e-06, + "loss": 0.9119, + "step": 10370 + }, + { + "epoch": 0.9266647307168226, + "grad_norm": 0.4946081340312958, + "learning_rate": 1.4014207153274783e-06, + "loss": 0.9554, + "step": 10371 + }, + { + "epoch": 0.9267540822480845, + "grad_norm": 0.5020267963409424, + "learning_rate": 1.3980207089423326e-06, + "loss": 0.9053, + "step": 10372 + }, + { + "epoch": 0.9268434337793464, + "grad_norm": 0.5129795670509338, + "learning_rate": 1.3946247734910156e-06, + "loss": 0.9016, + "step": 10373 + }, + { + "epoch": 0.9269327853106083, + "grad_norm": 0.43974798917770386, + "learning_rate": 1.3912329092579668e-06, + "loss": 0.963, + "step": 10374 + }, + { + "epoch": 0.9270221368418702, + "grad_norm": 0.4542407691478729, + "learning_rate": 1.387845116527292e-06, + "loss": 0.9474, + "step": 10375 + }, + { + "epoch": 0.927111488373132, + "grad_norm": 0.418317973613739, + "learning_rate": 1.3844613955827535e-06, + "loss": 0.9627, + "step": 10376 + }, + { + "epoch": 0.9272008399043938, + "grad_norm": 0.5148499011993408, + "learning_rate": 1.3810817467077852e-06, + "loss": 0.9356, + "step": 10377 + }, + { + "epoch": 0.9272901914356557, + "grad_norm": 0.44719183444976807, + "learning_rate": 1.377706170185461e-06, + "loss": 0.9662, + "step": 10378 + }, + { + "epoch": 0.9273795429669176, + "grad_norm": 0.48192569613456726, + "learning_rate": 1.3743346662985157e-06, + "loss": 0.9405, + "step": 10379 + }, + { + "epoch": 0.9274688944981795, + "grad_norm": 0.6048439741134644, + "learning_rate": 1.3709672353293568e-06, + "loss": 0.9381, + "step": 10380 + }, + { + "epoch": 0.9275582460294414, + "grad_norm": 0.4958132207393646, + "learning_rate": 1.3676038775600364e-06, + "loss": 0.9396, + "step": 10381 + }, + { + "epoch": 0.9276475975607033, + "grad_norm": 0.4941558539867401, + "learning_rate": 1.3642445932722792e-06, + "loss": 0.9128, + "step": 10382 + }, + { + "epoch": 0.927736949091965, + "grad_norm": 0.5161193609237671, + "learning_rate": 1.36088938274746e-06, + "loss": 0.9273, + "step": 10383 + }, + { + "epoch": 0.9278263006232269, + "grad_norm": 0.47243648767471313, + "learning_rate": 1.3575382462666042e-06, + "loss": 0.9353, + "step": 10384 + }, + { + "epoch": 0.9279156521544888, + "grad_norm": 0.432420551776886, + "learning_rate": 1.3541911841104149e-06, + "loss": 0.9181, + "step": 10385 + }, + { + "epoch": 0.9280050036857507, + "grad_norm": 0.507323145866394, + "learning_rate": 1.3508481965592401e-06, + "loss": 0.8991, + "step": 10386 + }, + { + "epoch": 0.9280943552170126, + "grad_norm": 0.5412149429321289, + "learning_rate": 1.3475092838930947e-06, + "loss": 0.8649, + "step": 10387 + }, + { + "epoch": 0.9281837067482744, + "grad_norm": 0.42463141679763794, + "learning_rate": 1.344174446391644e-06, + "loss": 0.9389, + "step": 10388 + }, + { + "epoch": 0.9282730582795362, + "grad_norm": 0.4768441319465637, + "learning_rate": 1.3408436843342142e-06, + "loss": 0.9983, + "step": 10389 + }, + { + "epoch": 0.9283624098107981, + "grad_norm": 0.584004282951355, + "learning_rate": 1.3375169979997992e-06, + "loss": 0.9239, + "step": 10390 + }, + { + "epoch": 0.92845176134206, + "grad_norm": 0.4398382604122162, + "learning_rate": 1.3341943876670371e-06, + "loss": 0.9366, + "step": 10391 + }, + { + "epoch": 0.9285411128733219, + "grad_norm": 0.4435214102268219, + "learning_rate": 1.3308758536142384e-06, + "loss": 0.9742, + "step": 10392 + }, + { + "epoch": 0.9286304644045837, + "grad_norm": 0.5455980896949768, + "learning_rate": 1.327561396119359e-06, + "loss": 0.953, + "step": 10393 + }, + { + "epoch": 0.9287198159358456, + "grad_norm": 0.4934171438217163, + "learning_rate": 1.3242510154600207e-06, + "loss": 0.8277, + "step": 10394 + }, + { + "epoch": 0.9288091674671075, + "grad_norm": 0.4403308629989624, + "learning_rate": 1.3209447119135132e-06, + "loss": 0.9771, + "step": 10395 + }, + { + "epoch": 0.9288985189983693, + "grad_norm": 0.600942075252533, + "learning_rate": 1.3176424857567648e-06, + "loss": 0.8728, + "step": 10396 + }, + { + "epoch": 0.9289878705296312, + "grad_norm": 0.6132327318191528, + "learning_rate": 1.3143443372663767e-06, + "loss": 0.9608, + "step": 10397 + }, + { + "epoch": 0.929077222060893, + "grad_norm": 0.7262205481529236, + "learning_rate": 1.3110502667185997e-06, + "loss": 0.8809, + "step": 10398 + }, + { + "epoch": 0.9291665735921549, + "grad_norm": 0.49000728130340576, + "learning_rate": 1.3077602743893523e-06, + "loss": 0.8944, + "step": 10399 + }, + { + "epoch": 0.9292559251234168, + "grad_norm": 0.5410964488983154, + "learning_rate": 1.3044743605541975e-06, + "loss": 0.9214, + "step": 10400 + }, + { + "epoch": 0.9293452766546787, + "grad_norm": 0.46295785903930664, + "learning_rate": 1.3011925254883761e-06, + "loss": 0.9303, + "step": 10401 + }, + { + "epoch": 0.9294346281859406, + "grad_norm": 0.4503743052482605, + "learning_rate": 1.2979147694667738e-06, + "loss": 0.9973, + "step": 10402 + }, + { + "epoch": 0.9295239797172024, + "grad_norm": 0.4440877437591553, + "learning_rate": 1.2946410927639374e-06, + "loss": 0.9027, + "step": 10403 + }, + { + "epoch": 0.9296133312484642, + "grad_norm": 0.5882024765014648, + "learning_rate": 1.29137149565407e-06, + "loss": 0.8308, + "step": 10404 + }, + { + "epoch": 0.9297026827797261, + "grad_norm": 0.6086094975471497, + "learning_rate": 1.2881059784110362e-06, + "loss": 0.9265, + "step": 10405 + }, + { + "epoch": 0.929792034310988, + "grad_norm": 0.522862434387207, + "learning_rate": 1.284844541308361e-06, + "loss": 0.9177, + "step": 10406 + }, + { + "epoch": 0.9298813858422499, + "grad_norm": 0.42290636897087097, + "learning_rate": 1.2815871846192152e-06, + "loss": 0.9461, + "step": 10407 + }, + { + "epoch": 0.9299707373735118, + "grad_norm": 0.47796404361724854, + "learning_rate": 1.278333908616447e-06, + "loss": 0.9295, + "step": 10408 + }, + { + "epoch": 0.9300600889047737, + "grad_norm": 0.46141406893730164, + "learning_rate": 1.2750847135725496e-06, + "loss": 0.9564, + "step": 10409 + }, + { + "epoch": 0.9301494404360354, + "grad_norm": 0.5057287216186523, + "learning_rate": 1.2718395997596833e-06, + "loss": 0.9667, + "step": 10410 + }, + { + "epoch": 0.9302387919672973, + "grad_norm": 0.5116837620735168, + "learning_rate": 1.268598567449647e-06, + "loss": 0.9412, + "step": 10411 + }, + { + "epoch": 0.9303281434985592, + "grad_norm": 0.47364234924316406, + "learning_rate": 1.2653616169139237e-06, + "loss": 0.9878, + "step": 10412 + }, + { + "epoch": 0.9304174950298211, + "grad_norm": 0.5040470957756042, + "learning_rate": 1.262128748423641e-06, + "loss": 1.0276, + "step": 10413 + }, + { + "epoch": 0.930506846561083, + "grad_norm": 0.5426703095436096, + "learning_rate": 1.2588999622495768e-06, + "loss": 0.9327, + "step": 10414 + }, + { + "epoch": 0.9305961980923448, + "grad_norm": 0.5746845602989197, + "learning_rate": 1.2556752586621868e-06, + "loss": 0.9452, + "step": 10415 + }, + { + "epoch": 0.9306855496236067, + "grad_norm": 0.5168527364730835, + "learning_rate": 1.2524546379315717e-06, + "loss": 0.9003, + "step": 10416 + }, + { + "epoch": 0.9307749011548685, + "grad_norm": 0.40214788913726807, + "learning_rate": 1.2492381003274934e-06, + "loss": 0.9048, + "step": 10417 + }, + { + "epoch": 0.9308642526861304, + "grad_norm": 0.43776369094848633, + "learning_rate": 1.2460256461193754e-06, + "loss": 0.8901, + "step": 10418 + }, + { + "epoch": 0.9309536042173923, + "grad_norm": 0.4549417197704315, + "learning_rate": 1.2428172755762802e-06, + "loss": 0.9345, + "step": 10419 + }, + { + "epoch": 0.9310429557486541, + "grad_norm": 0.6516943573951721, + "learning_rate": 1.2396129889669595e-06, + "loss": 0.8175, + "step": 10420 + }, + { + "epoch": 0.931132307279916, + "grad_norm": 0.3830028772354126, + "learning_rate": 1.236412786559793e-06, + "loss": 0.9797, + "step": 10421 + }, + { + "epoch": 0.9312216588111779, + "grad_norm": 0.5979812741279602, + "learning_rate": 1.2332166686228386e-06, + "loss": 0.8904, + "step": 10422 + }, + { + "epoch": 0.9313110103424398, + "grad_norm": 0.44422391057014465, + "learning_rate": 1.2300246354238042e-06, + "loss": 0.9199, + "step": 10423 + }, + { + "epoch": 0.9314003618737016, + "grad_norm": 0.45250019431114197, + "learning_rate": 1.2268366872300597e-06, + "loss": 1.0105, + "step": 10424 + }, + { + "epoch": 0.9314897134049634, + "grad_norm": 0.5378450155258179, + "learning_rate": 1.2236528243086298e-06, + "loss": 0.9126, + "step": 10425 + }, + { + "epoch": 0.9315790649362253, + "grad_norm": 0.518375039100647, + "learning_rate": 1.2204730469261906e-06, + "loss": 0.835, + "step": 10426 + }, + { + "epoch": 0.9316684164674872, + "grad_norm": 0.6360508799552917, + "learning_rate": 1.217297355349084e-06, + "loss": 0.7857, + "step": 10427 + }, + { + "epoch": 0.9317577679987491, + "grad_norm": 0.43396756052970886, + "learning_rate": 1.21412574984332e-06, + "loss": 0.9583, + "step": 10428 + }, + { + "epoch": 0.931847119530011, + "grad_norm": 0.4555058777332306, + "learning_rate": 1.210958230674536e-06, + "loss": 0.939, + "step": 10429 + }, + { + "epoch": 0.9319364710612729, + "grad_norm": 0.38424840569496155, + "learning_rate": 1.2077947981080584e-06, + "loss": 1.0483, + "step": 10430 + }, + { + "epoch": 0.9320258225925346, + "grad_norm": 0.5196928977966309, + "learning_rate": 1.2046354524088477e-06, + "loss": 0.901, + "step": 10431 + }, + { + "epoch": 0.9321151741237965, + "grad_norm": 0.443092942237854, + "learning_rate": 1.2014801938415422e-06, + "loss": 0.9166, + "step": 10432 + }, + { + "epoch": 0.9322045256550584, + "grad_norm": 0.6846402883529663, + "learning_rate": 1.1983290226704247e-06, + "loss": 0.9144, + "step": 10433 + }, + { + "epoch": 0.9322938771863203, + "grad_norm": 0.5252601504325867, + "learning_rate": 1.1951819391594398e-06, + "loss": 0.837, + "step": 10434 + }, + { + "epoch": 0.9323832287175822, + "grad_norm": 0.48273909091949463, + "learning_rate": 1.1920389435721935e-06, + "loss": 0.9412, + "step": 10435 + }, + { + "epoch": 0.932472580248844, + "grad_norm": 0.4483881890773773, + "learning_rate": 1.188900036171936e-06, + "loss": 0.9079, + "step": 10436 + }, + { + "epoch": 0.9325619317801059, + "grad_norm": 0.5480309128761292, + "learning_rate": 1.1857652172215905e-06, + "loss": 0.8815, + "step": 10437 + }, + { + "epoch": 0.9326512833113677, + "grad_norm": 0.4683678150177002, + "learning_rate": 1.1826344869837359e-06, + "loss": 0.9372, + "step": 10438 + }, + { + "epoch": 0.9327406348426296, + "grad_norm": 0.43585410714149475, + "learning_rate": 1.1795078457205956e-06, + "loss": 0.9516, + "step": 10439 + }, + { + "epoch": 0.9328299863738915, + "grad_norm": 0.5120673775672913, + "learning_rate": 1.17638529369406e-06, + "loss": 0.8705, + "step": 10440 + }, + { + "epoch": 0.9329193379051534, + "grad_norm": 0.4925757944583893, + "learning_rate": 1.1732668311656815e-06, + "loss": 0.9006, + "step": 10441 + }, + { + "epoch": 0.9330086894364152, + "grad_norm": 0.5412935614585876, + "learning_rate": 1.1701524583966562e-06, + "loss": 0.9419, + "step": 10442 + }, + { + "epoch": 0.9330980409676771, + "grad_norm": 0.5587424039840698, + "learning_rate": 1.1670421756478589e-06, + "loss": 0.9397, + "step": 10443 + }, + { + "epoch": 0.933187392498939, + "grad_norm": 0.6443805694580078, + "learning_rate": 1.163935983179798e-06, + "loss": 0.9046, + "step": 10444 + }, + { + "epoch": 0.9332767440302008, + "grad_norm": 0.4743395149707794, + "learning_rate": 1.1608338812526487e-06, + "loss": 0.8918, + "step": 10445 + }, + { + "epoch": 0.9333660955614627, + "grad_norm": 0.44435209035873413, + "learning_rate": 1.157735870126253e-06, + "loss": 0.9825, + "step": 10446 + }, + { + "epoch": 0.9334554470927245, + "grad_norm": 0.46846985816955566, + "learning_rate": 1.1546419500601036e-06, + "loss": 0.9809, + "step": 10447 + }, + { + "epoch": 0.9335447986239864, + "grad_norm": 0.5460909008979797, + "learning_rate": 1.1515521213133429e-06, + "loss": 0.9302, + "step": 10448 + }, + { + "epoch": 0.9336341501552483, + "grad_norm": 0.5399797558784485, + "learning_rate": 1.1484663841447807e-06, + "loss": 0.9177, + "step": 10449 + }, + { + "epoch": 0.9337235016865102, + "grad_norm": 0.6021638512611389, + "learning_rate": 1.1453847388128712e-06, + "loss": 0.964, + "step": 10450 + }, + { + "epoch": 0.933812853217772, + "grad_norm": 0.4933503568172455, + "learning_rate": 1.1423071855757473e-06, + "loss": 0.8803, + "step": 10451 + }, + { + "epoch": 0.9339022047490338, + "grad_norm": 0.5711500644683838, + "learning_rate": 1.13923372469118e-06, + "loss": 0.9169, + "step": 10452 + }, + { + "epoch": 0.9339915562802957, + "grad_norm": 0.49056991934776306, + "learning_rate": 1.136164356416608e-06, + "loss": 0.8962, + "step": 10453 + }, + { + "epoch": 0.9340809078115576, + "grad_norm": 0.5031728148460388, + "learning_rate": 1.1330990810091257e-06, + "loss": 0.963, + "step": 10454 + }, + { + "epoch": 0.9341702593428195, + "grad_norm": 0.42288509011268616, + "learning_rate": 1.1300378987254723e-06, + "loss": 1.0058, + "step": 10455 + }, + { + "epoch": 0.9342596108740814, + "grad_norm": 0.4871813952922821, + "learning_rate": 1.1269808098220647e-06, + "loss": 0.9285, + "step": 10456 + }, + { + "epoch": 0.9343489624053433, + "grad_norm": 0.537005603313446, + "learning_rate": 1.1239278145549648e-06, + "loss": 0.9045, + "step": 10457 + }, + { + "epoch": 0.934438313936605, + "grad_norm": 0.4379669725894928, + "learning_rate": 1.1208789131798958e-06, + "loss": 0.9574, + "step": 10458 + }, + { + "epoch": 0.9345276654678669, + "grad_norm": 0.4513357877731323, + "learning_rate": 1.1178341059522256e-06, + "loss": 0.9283, + "step": 10459 + }, + { + "epoch": 0.9346170169991288, + "grad_norm": 0.653385579586029, + "learning_rate": 1.114793393126995e-06, + "loss": 0.8258, + "step": 10460 + }, + { + "epoch": 0.9347063685303907, + "grad_norm": 0.44483664631843567, + "learning_rate": 1.1117567749588997e-06, + "loss": 0.9546, + "step": 10461 + }, + { + "epoch": 0.9347957200616526, + "grad_norm": 0.4962851405143738, + "learning_rate": 1.1087242517022812e-06, + "loss": 0.9296, + "step": 10462 + }, + { + "epoch": 0.9348850715929145, + "grad_norm": 0.4276025891304016, + "learning_rate": 1.1056958236111525e-06, + "loss": 0.9155, + "step": 10463 + }, + { + "epoch": 0.9349744231241763, + "grad_norm": 0.5110339522361755, + "learning_rate": 1.1026714909391778e-06, + "loss": 0.9607, + "step": 10464 + }, + { + "epoch": 0.9350637746554381, + "grad_norm": 0.42139366269111633, + "learning_rate": 1.0996512539396707e-06, + "loss": 1.0041, + "step": 10465 + }, + { + "epoch": 0.9351531261867, + "grad_norm": 0.4701647162437439, + "learning_rate": 1.0966351128656072e-06, + "loss": 0.9622, + "step": 10466 + }, + { + "epoch": 0.9352424777179619, + "grad_norm": 0.47479528188705444, + "learning_rate": 1.093623067969629e-06, + "loss": 0.9374, + "step": 10467 + }, + { + "epoch": 0.9353318292492238, + "grad_norm": 0.5557815432548523, + "learning_rate": 1.0906151195040294e-06, + "loss": 0.92, + "step": 10468 + }, + { + "epoch": 0.9354211807804856, + "grad_norm": 0.46990975737571716, + "learning_rate": 1.087611267720745e-06, + "loss": 0.912, + "step": 10469 + }, + { + "epoch": 0.9355105323117475, + "grad_norm": 0.5090728402137756, + "learning_rate": 1.0846115128713862e-06, + "loss": 0.9633, + "step": 10470 + }, + { + "epoch": 0.9355998838430094, + "grad_norm": 0.45003437995910645, + "learning_rate": 1.081615855207213e-06, + "loss": 0.9622, + "step": 10471 + }, + { + "epoch": 0.9356892353742712, + "grad_norm": 0.5591026544570923, + "learning_rate": 1.0786242949791415e-06, + "loss": 0.9682, + "step": 10472 + }, + { + "epoch": 0.9357785869055331, + "grad_norm": 0.48961174488067627, + "learning_rate": 1.0756368324377542e-06, + "loss": 0.9561, + "step": 10473 + }, + { + "epoch": 0.9358679384367949, + "grad_norm": 0.6354184746742249, + "learning_rate": 1.0726534678332733e-06, + "loss": 0.8864, + "step": 10474 + }, + { + "epoch": 0.9359572899680568, + "grad_norm": 0.3932569921016693, + "learning_rate": 1.069674201415599e-06, + "loss": 0.9948, + "step": 10475 + }, + { + "epoch": 0.9360466414993187, + "grad_norm": 0.4889991283416748, + "learning_rate": 1.0666990334342707e-06, + "loss": 0.9149, + "step": 10476 + }, + { + "epoch": 0.9361359930305806, + "grad_norm": 0.46199727058410645, + "learning_rate": 1.0637279641384834e-06, + "loss": 0.9391, + "step": 10477 + }, + { + "epoch": 0.9362253445618425, + "grad_norm": 0.5159189701080322, + "learning_rate": 1.0607609937771046e-06, + "loss": 0.9332, + "step": 10478 + }, + { + "epoch": 0.9363146960931042, + "grad_norm": 0.5201767086982727, + "learning_rate": 1.0577981225986467e-06, + "loss": 0.8602, + "step": 10479 + }, + { + "epoch": 0.9364040476243661, + "grad_norm": 0.4806266725063324, + "learning_rate": 1.0548393508512887e-06, + "loss": 0.9179, + "step": 10480 + }, + { + "epoch": 0.936493399155628, + "grad_norm": 0.4578525722026825, + "learning_rate": 1.0518846787828496e-06, + "loss": 0.9347, + "step": 10481 + }, + { + "epoch": 0.9365827506868899, + "grad_norm": 0.4183438718318939, + "learning_rate": 1.0489341066408142e-06, + "loss": 0.9347, + "step": 10482 + }, + { + "epoch": 0.9366721022181518, + "grad_norm": 0.5696753859519958, + "learning_rate": 1.0459876346723297e-06, + "loss": 0.8983, + "step": 10483 + }, + { + "epoch": 0.9367614537494137, + "grad_norm": 0.46501290798187256, + "learning_rate": 1.0430452631241928e-06, + "loss": 0.9691, + "step": 10484 + }, + { + "epoch": 0.9368508052806755, + "grad_norm": 0.5632017254829407, + "learning_rate": 1.0401069922428619e-06, + "loss": 0.9061, + "step": 10485 + }, + { + "epoch": 0.9369401568119373, + "grad_norm": 0.4606979787349701, + "learning_rate": 1.0371728222744402e-06, + "loss": 0.9337, + "step": 10486 + }, + { + "epoch": 0.9370295083431992, + "grad_norm": 0.42771193385124207, + "learning_rate": 1.0342427534647036e-06, + "loss": 0.9453, + "step": 10487 + }, + { + "epoch": 0.9371188598744611, + "grad_norm": 0.49479082226753235, + "learning_rate": 1.0313167860590777e-06, + "loss": 0.9413, + "step": 10488 + }, + { + "epoch": 0.937208211405723, + "grad_norm": 0.5891922116279602, + "learning_rate": 1.0283949203026332e-06, + "loss": 0.934, + "step": 10489 + }, + { + "epoch": 0.9372975629369849, + "grad_norm": 0.4088734984397888, + "learning_rate": 1.0254771564401189e-06, + "loss": 0.9396, + "step": 10490 + }, + { + "epoch": 0.9373869144682467, + "grad_norm": 0.5687556862831116, + "learning_rate": 1.0225634947159223e-06, + "loss": 0.9473, + "step": 10491 + }, + { + "epoch": 0.9374762659995086, + "grad_norm": 0.46405094861984253, + "learning_rate": 1.019653935374093e-06, + "loss": 0.9307, + "step": 10492 + }, + { + "epoch": 0.9375656175307704, + "grad_norm": 0.451276570558548, + "learning_rate": 1.0167484786583359e-06, + "loss": 0.9777, + "step": 10493 + }, + { + "epoch": 0.9376549690620323, + "grad_norm": 0.46850451827049255, + "learning_rate": 1.013847124812023e-06, + "loss": 0.9389, + "step": 10494 + }, + { + "epoch": 0.9377443205932942, + "grad_norm": 0.454631507396698, + "learning_rate": 1.010949874078171e-06, + "loss": 0.8892, + "step": 10495 + }, + { + "epoch": 0.937833672124556, + "grad_norm": 0.6396118402481079, + "learning_rate": 1.0080567266994466e-06, + "loss": 0.8246, + "step": 10496 + }, + { + "epoch": 0.9379230236558179, + "grad_norm": 0.5484038591384888, + "learning_rate": 1.00516768291819e-06, + "loss": 0.8602, + "step": 10497 + }, + { + "epoch": 0.9380123751870798, + "grad_norm": 0.5306549072265625, + "learning_rate": 1.0022827429763903e-06, + "loss": 1.019, + "step": 10498 + }, + { + "epoch": 0.9381017267183417, + "grad_norm": 0.5081790089607239, + "learning_rate": 9.99401907115688e-07, + "loss": 0.8561, + "step": 10499 + }, + { + "epoch": 0.9381910782496035, + "grad_norm": 0.43956875801086426, + "learning_rate": 9.96525175577384e-07, + "loss": 0.9825, + "step": 10500 + }, + { + "epoch": 0.9382804297808653, + "grad_norm": 0.6813711524009705, + "learning_rate": 9.936525486024362e-07, + "loss": 0.7838, + "step": 10501 + }, + { + "epoch": 0.9383697813121272, + "grad_norm": 0.5399671792984009, + "learning_rate": 9.907840264314572e-07, + "loss": 0.943, + "step": 10502 + }, + { + "epoch": 0.9384591328433891, + "grad_norm": 0.4913422763347626, + "learning_rate": 9.87919609304716e-07, + "loss": 0.9302, + "step": 10503 + }, + { + "epoch": 0.938548484374651, + "grad_norm": 0.5128286480903625, + "learning_rate": 9.850592974621375e-07, + "loss": 0.9291, + "step": 10504 + }, + { + "epoch": 0.9386378359059129, + "grad_norm": 0.6238715052604675, + "learning_rate": 9.822030911433023e-07, + "loss": 0.9145, + "step": 10505 + }, + { + "epoch": 0.9387271874371748, + "grad_norm": 0.5400130748748779, + "learning_rate": 9.793509905874576e-07, + "loss": 0.9463, + "step": 10506 + }, + { + "epoch": 0.9388165389684365, + "grad_norm": 0.534290075302124, + "learning_rate": 9.765029960334849e-07, + "loss": 0.9139, + "step": 10507 + }, + { + "epoch": 0.9389058904996984, + "grad_norm": 0.48370060324668884, + "learning_rate": 9.736591077199374e-07, + "loss": 0.9766, + "step": 10508 + }, + { + "epoch": 0.9389952420309603, + "grad_norm": 0.44061046838760376, + "learning_rate": 9.708193258850306e-07, + "loss": 0.9452, + "step": 10509 + }, + { + "epoch": 0.9390845935622222, + "grad_norm": 0.4897972643375397, + "learning_rate": 9.679836507666185e-07, + "loss": 0.9245, + "step": 10510 + }, + { + "epoch": 0.9391739450934841, + "grad_norm": 0.5403768420219421, + "learning_rate": 9.65152082602211e-07, + "loss": 0.9311, + "step": 10511 + }, + { + "epoch": 0.939263296624746, + "grad_norm": 0.4956395626068115, + "learning_rate": 9.62324621628996e-07, + "loss": 0.9089, + "step": 10512 + }, + { + "epoch": 0.9393526481560077, + "grad_norm": 0.5809354782104492, + "learning_rate": 9.595012680838012e-07, + "loss": 0.8998, + "step": 10513 + }, + { + "epoch": 0.9394419996872696, + "grad_norm": 0.46071991324424744, + "learning_rate": 9.566820222031036e-07, + "loss": 0.8987, + "step": 10514 + }, + { + "epoch": 0.9395313512185315, + "grad_norm": 0.49813321232795715, + "learning_rate": 9.538668842230537e-07, + "loss": 0.9903, + "step": 10515 + }, + { + "epoch": 0.9396207027497934, + "grad_norm": 0.47896721959114075, + "learning_rate": 9.510558543794457e-07, + "loss": 1.0244, + "step": 10516 + }, + { + "epoch": 0.9397100542810553, + "grad_norm": 0.4987740218639374, + "learning_rate": 9.482489329077304e-07, + "loss": 0.9312, + "step": 10517 + }, + { + "epoch": 0.9397994058123171, + "grad_norm": 0.4500676989555359, + "learning_rate": 9.454461200430253e-07, + "loss": 0.9262, + "step": 10518 + }, + { + "epoch": 0.939888757343579, + "grad_norm": 0.4563513398170471, + "learning_rate": 9.426474160200926e-07, + "loss": 0.9153, + "step": 10519 + }, + { + "epoch": 0.9399781088748408, + "grad_norm": 0.4408057928085327, + "learning_rate": 9.398528210733504e-07, + "loss": 0.9599, + "step": 10520 + }, + { + "epoch": 0.9400674604061027, + "grad_norm": 0.5273951888084412, + "learning_rate": 9.370623354368779e-07, + "loss": 0.9386, + "step": 10521 + }, + { + "epoch": 0.9401568119373646, + "grad_norm": 0.6240958571434021, + "learning_rate": 9.342759593444106e-07, + "loss": 0.8736, + "step": 10522 + }, + { + "epoch": 0.9402461634686264, + "grad_norm": 0.48620718717575073, + "learning_rate": 9.314936930293283e-07, + "loss": 0.9117, + "step": 10523 + }, + { + "epoch": 0.9403355149998883, + "grad_norm": 0.45210275053977966, + "learning_rate": 9.287155367246891e-07, + "loss": 0.9074, + "step": 10524 + }, + { + "epoch": 0.9404248665311502, + "grad_norm": 0.5615553259849548, + "learning_rate": 9.259414906631791e-07, + "loss": 0.9937, + "step": 10525 + }, + { + "epoch": 0.9405142180624121, + "grad_norm": 0.46976426243782043, + "learning_rate": 9.23171555077168e-07, + "loss": 1.0001, + "step": 10526 + }, + { + "epoch": 0.9406035695936739, + "grad_norm": 0.5349940061569214, + "learning_rate": 9.204057301986535e-07, + "loss": 0.8587, + "step": 10527 + }, + { + "epoch": 0.9406929211249357, + "grad_norm": 0.48549193143844604, + "learning_rate": 9.176440162593169e-07, + "loss": 0.9925, + "step": 10528 + }, + { + "epoch": 0.9407822726561976, + "grad_norm": 0.48987144231796265, + "learning_rate": 9.148864134904733e-07, + "loss": 0.9652, + "step": 10529 + }, + { + "epoch": 0.9408716241874595, + "grad_norm": 0.4827273190021515, + "learning_rate": 9.12132922123099e-07, + "loss": 0.8644, + "step": 10530 + }, + { + "epoch": 0.9409609757187214, + "grad_norm": 0.5220755934715271, + "learning_rate": 9.093835423878317e-07, + "loss": 0.9001, + "step": 10531 + }, + { + "epoch": 0.9410503272499833, + "grad_norm": 0.4924890995025635, + "learning_rate": 9.066382745149649e-07, + "loss": 0.9662, + "step": 10532 + }, + { + "epoch": 0.9411396787812452, + "grad_norm": 0.5399982929229736, + "learning_rate": 9.038971187344369e-07, + "loss": 0.8605, + "step": 10533 + }, + { + "epoch": 0.9412290303125069, + "grad_norm": 0.5098146200180054, + "learning_rate": 9.011600752758531e-07, + "loss": 0.9553, + "step": 10534 + }, + { + "epoch": 0.9413183818437688, + "grad_norm": 0.42846280336380005, + "learning_rate": 8.984271443684633e-07, + "loss": 0.9544, + "step": 10535 + }, + { + "epoch": 0.9414077333750307, + "grad_norm": 0.5252984762191772, + "learning_rate": 8.9569832624119e-07, + "loss": 0.8778, + "step": 10536 + }, + { + "epoch": 0.9414970849062926, + "grad_norm": 0.5305891036987305, + "learning_rate": 8.929736211226003e-07, + "loss": 0.8921, + "step": 10537 + }, + { + "epoch": 0.9415864364375545, + "grad_norm": 0.4615854024887085, + "learning_rate": 8.902530292409062e-07, + "loss": 0.9217, + "step": 10538 + }, + { + "epoch": 0.9416757879688163, + "grad_norm": 0.45759010314941406, + "learning_rate": 8.87536550824003e-07, + "loss": 0.9712, + "step": 10539 + }, + { + "epoch": 0.9417651395000782, + "grad_norm": 0.48792535066604614, + "learning_rate": 8.848241860994089e-07, + "loss": 0.876, + "step": 10540 + }, + { + "epoch": 0.94185449103134, + "grad_norm": 0.4174993932247162, + "learning_rate": 8.821159352943143e-07, + "loss": 0.9367, + "step": 10541 + }, + { + "epoch": 0.9419438425626019, + "grad_norm": 0.4825289845466614, + "learning_rate": 8.794117986355766e-07, + "loss": 0.9969, + "step": 10542 + }, + { + "epoch": 0.9420331940938638, + "grad_norm": 0.44167813658714294, + "learning_rate": 8.767117763496813e-07, + "loss": 0.8798, + "step": 10543 + }, + { + "epoch": 0.9421225456251257, + "grad_norm": 0.5294803380966187, + "learning_rate": 8.740158686627975e-07, + "loss": 0.8875, + "step": 10544 + }, + { + "epoch": 0.9422118971563875, + "grad_norm": 0.4376986622810364, + "learning_rate": 8.71324075800728e-07, + "loss": 0.9939, + "step": 10545 + }, + { + "epoch": 0.9423012486876494, + "grad_norm": 0.5412419438362122, + "learning_rate": 8.686363979889478e-07, + "loss": 0.9387, + "step": 10546 + }, + { + "epoch": 0.9423906002189113, + "grad_norm": 0.5909743309020996, + "learning_rate": 8.659528354525603e-07, + "loss": 0.8535, + "step": 10547 + }, + { + "epoch": 0.9424799517501731, + "grad_norm": 0.430073082447052, + "learning_rate": 8.632733884163635e-07, + "loss": 0.9364, + "step": 10548 + }, + { + "epoch": 0.942569303281435, + "grad_norm": 0.5627753734588623, + "learning_rate": 8.605980571047723e-07, + "loss": 0.9279, + "step": 10549 + }, + { + "epoch": 0.9426586548126968, + "grad_norm": 0.526467502117157, + "learning_rate": 8.579268417418851e-07, + "loss": 0.9335, + "step": 10550 + }, + { + "epoch": 0.9427480063439587, + "grad_norm": 0.5456862449645996, + "learning_rate": 8.552597425514508e-07, + "loss": 0.8866, + "step": 10551 + }, + { + "epoch": 0.9428373578752206, + "grad_norm": 0.5069810152053833, + "learning_rate": 8.525967597568463e-07, + "loss": 0.9052, + "step": 10552 + }, + { + "epoch": 0.9429267094064825, + "grad_norm": 0.625331461429596, + "learning_rate": 8.499378935811431e-07, + "loss": 0.9148, + "step": 10553 + }, + { + "epoch": 0.9430160609377444, + "grad_norm": 0.5902383327484131, + "learning_rate": 8.472831442470408e-07, + "loss": 0.802, + "step": 10554 + }, + { + "epoch": 0.9431054124690061, + "grad_norm": 0.42086061835289, + "learning_rate": 8.446325119769061e-07, + "loss": 0.9733, + "step": 10555 + }, + { + "epoch": 0.943194764000268, + "grad_norm": 0.5746432542800903, + "learning_rate": 8.419859969927557e-07, + "loss": 0.8944, + "step": 10556 + }, + { + "epoch": 0.9432841155315299, + "grad_norm": 0.5022989511489868, + "learning_rate": 8.393435995162624e-07, + "loss": 0.9403, + "step": 10557 + }, + { + "epoch": 0.9433734670627918, + "grad_norm": 0.443338543176651, + "learning_rate": 8.367053197687602e-07, + "loss": 0.9092, + "step": 10558 + }, + { + "epoch": 0.9434628185940537, + "grad_norm": 0.592194676399231, + "learning_rate": 8.340711579712391e-07, + "loss": 0.8213, + "step": 10559 + }, + { + "epoch": 0.9435521701253156, + "grad_norm": 0.47716042399406433, + "learning_rate": 8.314411143443168e-07, + "loss": 0.9157, + "step": 10560 + }, + { + "epoch": 0.9436415216565774, + "grad_norm": 0.49664196372032166, + "learning_rate": 8.288151891083062e-07, + "loss": 0.8716, + "step": 10561 + }, + { + "epoch": 0.9437308731878392, + "grad_norm": 0.471483439207077, + "learning_rate": 8.261933824831481e-07, + "loss": 0.9732, + "step": 10562 + }, + { + "epoch": 0.9438202247191011, + "grad_norm": 0.4351029396057129, + "learning_rate": 8.235756946884554e-07, + "loss": 1.0083, + "step": 10563 + }, + { + "epoch": 0.943909576250363, + "grad_norm": 0.4930981993675232, + "learning_rate": 8.209621259434753e-07, + "loss": 0.9445, + "step": 10564 + }, + { + "epoch": 0.9439989277816249, + "grad_norm": 0.42323318123817444, + "learning_rate": 8.183526764671267e-07, + "loss": 0.9145, + "step": 10565 + }, + { + "epoch": 0.9440882793128867, + "grad_norm": 0.4111577570438385, + "learning_rate": 8.157473464779852e-07, + "loss": 0.9204, + "step": 10566 + }, + { + "epoch": 0.9441776308441486, + "grad_norm": 0.6320559978485107, + "learning_rate": 8.13146136194265e-07, + "loss": 0.9821, + "step": 10567 + }, + { + "epoch": 0.9442669823754105, + "grad_norm": 0.47050777077674866, + "learning_rate": 8.105490458338527e-07, + "loss": 0.9344, + "step": 10568 + }, + { + "epoch": 0.9443563339066723, + "grad_norm": 0.6935842037200928, + "learning_rate": 8.079560756142857e-07, + "loss": 0.9204, + "step": 10569 + }, + { + "epoch": 0.9444456854379342, + "grad_norm": 0.5569193363189697, + "learning_rate": 8.0536722575274e-07, + "loss": 0.8738, + "step": 10570 + }, + { + "epoch": 0.944535036969196, + "grad_norm": 0.4778384864330292, + "learning_rate": 8.027824964660646e-07, + "loss": 0.924, + "step": 10571 + }, + { + "epoch": 0.9446243885004579, + "grad_norm": 0.4366244375705719, + "learning_rate": 8.00201887970764e-07, + "loss": 0.9712, + "step": 10572 + }, + { + "epoch": 0.9447137400317198, + "grad_norm": 0.6287685036659241, + "learning_rate": 7.976254004829875e-07, + "loss": 0.9883, + "step": 10573 + }, + { + "epoch": 0.9448030915629817, + "grad_norm": 0.464234858751297, + "learning_rate": 7.950530342185402e-07, + "loss": 0.9353, + "step": 10574 + }, + { + "epoch": 0.9448924430942435, + "grad_norm": 0.5004445314407349, + "learning_rate": 7.924847893928888e-07, + "loss": 0.8881, + "step": 10575 + }, + { + "epoch": 0.9449817946255054, + "grad_norm": 0.5838055610656738, + "learning_rate": 7.8992066622115e-07, + "loss": 0.9222, + "step": 10576 + }, + { + "epoch": 0.9450711461567672, + "grad_norm": 0.4792827367782593, + "learning_rate": 7.873606649180965e-07, + "loss": 0.9294, + "step": 10577 + }, + { + "epoch": 0.9451604976880291, + "grad_norm": 0.4360794723033905, + "learning_rate": 7.848047856981622e-07, + "loss": 0.9488, + "step": 10578 + }, + { + "epoch": 0.945249849219291, + "grad_norm": 0.5515472888946533, + "learning_rate": 7.822530287754204e-07, + "loss": 0.8643, + "step": 10579 + }, + { + "epoch": 0.9453392007505529, + "grad_norm": 0.6533846855163574, + "learning_rate": 7.797053943636112e-07, + "loss": 0.9626, + "step": 10580 + }, + { + "epoch": 0.9454285522818148, + "grad_norm": 0.5607351660728455, + "learning_rate": 7.771618826761252e-07, + "loss": 0.9153, + "step": 10581 + }, + { + "epoch": 0.9455179038130765, + "grad_norm": 0.5081503987312317, + "learning_rate": 7.746224939260083e-07, + "loss": 0.8538, + "step": 10582 + }, + { + "epoch": 0.9456072553443384, + "grad_norm": 0.5000874996185303, + "learning_rate": 7.720872283259684e-07, + "loss": 0.9195, + "step": 10583 + }, + { + "epoch": 0.9456966068756003, + "grad_norm": 0.49381589889526367, + "learning_rate": 7.695560860883467e-07, + "loss": 0.9285, + "step": 10584 + }, + { + "epoch": 0.9457859584068622, + "grad_norm": 0.54331374168396, + "learning_rate": 7.670290674251679e-07, + "loss": 0.9005, + "step": 10585 + }, + { + "epoch": 0.9458753099381241, + "grad_norm": 0.5094651579856873, + "learning_rate": 7.64506172548085e-07, + "loss": 0.8645, + "step": 10586 + }, + { + "epoch": 0.945964661469386, + "grad_norm": 0.5358061194419861, + "learning_rate": 7.619874016684237e-07, + "loss": 0.8771, + "step": 10587 + }, + { + "epoch": 0.9460540130006478, + "grad_norm": 0.45969656109809875, + "learning_rate": 7.594727549971592e-07, + "loss": 0.9333, + "step": 10588 + }, + { + "epoch": 0.9461433645319096, + "grad_norm": 0.5184633135795593, + "learning_rate": 7.569622327449177e-07, + "loss": 0.8593, + "step": 10589 + }, + { + "epoch": 0.9462327160631715, + "grad_norm": 0.6350348591804504, + "learning_rate": 7.54455835121981e-07, + "loss": 0.8568, + "step": 10590 + }, + { + "epoch": 0.9463220675944334, + "grad_norm": 0.45934930443763733, + "learning_rate": 7.519535623382867e-07, + "loss": 0.8905, + "step": 10591 + }, + { + "epoch": 0.9464114191256953, + "grad_norm": 0.6196146607398987, + "learning_rate": 7.494554146034338e-07, + "loss": 0.9422, + "step": 10592 + }, + { + "epoch": 0.9465007706569571, + "grad_norm": 0.4321631193161011, + "learning_rate": 7.46961392126655e-07, + "loss": 0.9287, + "step": 10593 + }, + { + "epoch": 0.946590122188219, + "grad_norm": 0.494315505027771, + "learning_rate": 7.444714951168663e-07, + "loss": 0.8973, + "step": 10594 + }, + { + "epoch": 0.9466794737194809, + "grad_norm": 0.4586873948574066, + "learning_rate": 7.419857237826122e-07, + "loss": 0.8857, + "step": 10595 + }, + { + "epoch": 0.9467688252507427, + "grad_norm": 0.46470096707344055, + "learning_rate": 7.395040783321039e-07, + "loss": 0.9417, + "step": 10596 + }, + { + "epoch": 0.9468581767820046, + "grad_norm": 0.45211324095726013, + "learning_rate": 7.37026558973214e-07, + "loss": 0.8929, + "step": 10597 + }, + { + "epoch": 0.9469475283132665, + "grad_norm": 0.5554351210594177, + "learning_rate": 7.345531659134486e-07, + "loss": 0.9703, + "step": 10598 + }, + { + "epoch": 0.9470368798445283, + "grad_norm": 0.46897411346435547, + "learning_rate": 7.320838993599921e-07, + "loss": 0.9272, + "step": 10599 + }, + { + "epoch": 0.9471262313757902, + "grad_norm": 0.6147266030311584, + "learning_rate": 7.29618759519668e-07, + "loss": 0.98, + "step": 10600 + }, + { + "epoch": 0.9472155829070521, + "grad_norm": 0.5554401278495789, + "learning_rate": 7.271577465989555e-07, + "loss": 0.9312, + "step": 10601 + }, + { + "epoch": 0.947304934438314, + "grad_norm": 0.5745187401771545, + "learning_rate": 7.247008608039952e-07, + "loss": 0.8978, + "step": 10602 + }, + { + "epoch": 0.9473942859695758, + "grad_norm": 0.46855631470680237, + "learning_rate": 7.222481023405725e-07, + "loss": 0.949, + "step": 10603 + }, + { + "epoch": 0.9474836375008376, + "grad_norm": 0.5807746052742004, + "learning_rate": 7.197994714141343e-07, + "loss": 0.8789, + "step": 10604 + }, + { + "epoch": 0.9475729890320995, + "grad_norm": 0.539803147315979, + "learning_rate": 7.173549682297775e-07, + "loss": 1.0313, + "step": 10605 + }, + { + "epoch": 0.9476623405633614, + "grad_norm": 0.45081114768981934, + "learning_rate": 7.149145929922607e-07, + "loss": 0.9441, + "step": 10606 + }, + { + "epoch": 0.9477516920946233, + "grad_norm": 0.44997450709342957, + "learning_rate": 7.124783459059869e-07, + "loss": 0.9809, + "step": 10607 + }, + { + "epoch": 0.9478410436258852, + "grad_norm": 0.43726733326911926, + "learning_rate": 7.100462271750153e-07, + "loss": 0.9145, + "step": 10608 + }, + { + "epoch": 0.947930395157147, + "grad_norm": 0.5305030345916748, + "learning_rate": 7.076182370030715e-07, + "loss": 0.9287, + "step": 10609 + }, + { + "epoch": 0.9480197466884088, + "grad_norm": 0.3688342869281769, + "learning_rate": 7.051943755935208e-07, + "loss": 0.9817, + "step": 10610 + }, + { + "epoch": 0.9481090982196707, + "grad_norm": 0.40400731563568115, + "learning_rate": 7.027746431493787e-07, + "loss": 0.9076, + "step": 10611 + }, + { + "epoch": 0.9481984497509326, + "grad_norm": 0.5000659823417664, + "learning_rate": 7.003590398733329e-07, + "loss": 0.9145, + "step": 10612 + }, + { + "epoch": 0.9482878012821945, + "grad_norm": 0.4885746240615845, + "learning_rate": 6.97947565967716e-07, + "loss": 0.9152, + "step": 10613 + }, + { + "epoch": 0.9483771528134564, + "grad_norm": 0.48515942692756653, + "learning_rate": 6.955402216345108e-07, + "loss": 0.9315, + "step": 10614 + }, + { + "epoch": 0.9484665043447182, + "grad_norm": 0.43972206115722656, + "learning_rate": 6.931370070753618e-07, + "loss": 0.9436, + "step": 10615 + }, + { + "epoch": 0.9485558558759801, + "grad_norm": 0.5218010544776917, + "learning_rate": 6.907379224915633e-07, + "loss": 0.9323, + "step": 10616 + }, + { + "epoch": 0.9486452074072419, + "grad_norm": 0.5221048593521118, + "learning_rate": 6.883429680840602e-07, + "loss": 0.9423, + "step": 10617 + }, + { + "epoch": 0.9487345589385038, + "grad_norm": 0.5904656648635864, + "learning_rate": 6.859521440534533e-07, + "loss": 0.8302, + "step": 10618 + }, + { + "epoch": 0.9488239104697657, + "grad_norm": 0.4238061010837555, + "learning_rate": 6.8356545060001e-07, + "loss": 1.0147, + "step": 10619 + }, + { + "epoch": 0.9489132620010275, + "grad_norm": 0.47028911113739014, + "learning_rate": 6.81182887923637e-07, + "loss": 0.8621, + "step": 10620 + }, + { + "epoch": 0.9490026135322894, + "grad_norm": 0.5632848143577576, + "learning_rate": 6.788044562238971e-07, + "loss": 0.9579, + "step": 10621 + }, + { + "epoch": 0.9490919650635513, + "grad_norm": 0.6260432004928589, + "learning_rate": 6.764301557000086e-07, + "loss": 0.8357, + "step": 10622 + }, + { + "epoch": 0.9491813165948132, + "grad_norm": 0.6236041784286499, + "learning_rate": 6.740599865508457e-07, + "loss": 0.8724, + "step": 10623 + }, + { + "epoch": 0.949270668126075, + "grad_norm": 0.4447486400604248, + "learning_rate": 6.716939489749329e-07, + "loss": 0.9633, + "step": 10624 + }, + { + "epoch": 0.9493600196573369, + "grad_norm": 0.43622735142707825, + "learning_rate": 6.693320431704564e-07, + "loss": 0.93, + "step": 10625 + }, + { + "epoch": 0.9494493711885987, + "grad_norm": 0.5855457186698914, + "learning_rate": 6.669742693352521e-07, + "loss": 0.8923, + "step": 10626 + }, + { + "epoch": 0.9495387227198606, + "grad_norm": 0.4751369059085846, + "learning_rate": 6.64620627666801e-07, + "loss": 0.8902, + "step": 10627 + }, + { + "epoch": 0.9496280742511225, + "grad_norm": 0.5162234902381897, + "learning_rate": 6.622711183622454e-07, + "loss": 0.9346, + "step": 10628 + }, + { + "epoch": 0.9497174257823844, + "grad_norm": 0.3996429145336151, + "learning_rate": 6.599257416183946e-07, + "loss": 0.9754, + "step": 10629 + }, + { + "epoch": 0.9498067773136463, + "grad_norm": 0.4350222647190094, + "learning_rate": 6.575844976316859e-07, + "loss": 0.9303, + "step": 10630 + }, + { + "epoch": 0.949896128844908, + "grad_norm": 0.4600478708744049, + "learning_rate": 6.552473865982289e-07, + "loss": 0.8937, + "step": 10631 + }, + { + "epoch": 0.9499854803761699, + "grad_norm": 0.6685845851898193, + "learning_rate": 6.52914408713784e-07, + "loss": 0.9852, + "step": 10632 + }, + { + "epoch": 0.9500748319074318, + "grad_norm": 0.43094709515571594, + "learning_rate": 6.505855641737502e-07, + "loss": 1.0011, + "step": 10633 + }, + { + "epoch": 0.9501641834386937, + "grad_norm": 0.4459003806114197, + "learning_rate": 6.482608531732104e-07, + "loss": 0.9094, + "step": 10634 + }, + { + "epoch": 0.9502535349699556, + "grad_norm": 0.4444451630115509, + "learning_rate": 6.4594027590687e-07, + "loss": 0.993, + "step": 10635 + }, + { + "epoch": 0.9503428865012175, + "grad_norm": 0.5474777817726135, + "learning_rate": 6.436238325691125e-07, + "loss": 0.8881, + "step": 10636 + }, + { + "epoch": 0.9504322380324792, + "grad_norm": 0.5109720826148987, + "learning_rate": 6.413115233539601e-07, + "loss": 0.9903, + "step": 10637 + }, + { + "epoch": 0.9505215895637411, + "grad_norm": 0.4545588195323944, + "learning_rate": 6.390033484550917e-07, + "loss": 0.9167, + "step": 10638 + }, + { + "epoch": 0.950610941095003, + "grad_norm": 0.5279081463813782, + "learning_rate": 6.366993080658413e-07, + "loss": 0.9207, + "step": 10639 + }, + { + "epoch": 0.9507002926262649, + "grad_norm": 0.5103138089179993, + "learning_rate": 6.343994023792043e-07, + "loss": 0.9425, + "step": 10640 + }, + { + "epoch": 0.9507896441575268, + "grad_norm": 0.49605733156204224, + "learning_rate": 6.32103631587816e-07, + "loss": 0.898, + "step": 10641 + }, + { + "epoch": 0.9508789956887886, + "grad_norm": 0.5339604616165161, + "learning_rate": 6.298119958839721e-07, + "loss": 0.8929, + "step": 10642 + }, + { + "epoch": 0.9509683472200505, + "grad_norm": 0.48836469650268555, + "learning_rate": 6.275244954596193e-07, + "loss": 0.8902, + "step": 10643 + }, + { + "epoch": 0.9510576987513123, + "grad_norm": 0.4708765149116516, + "learning_rate": 6.252411305063599e-07, + "loss": 0.9173, + "step": 10644 + }, + { + "epoch": 0.9511470502825742, + "grad_norm": 0.47018665075302124, + "learning_rate": 6.229619012154575e-07, + "loss": 0.9882, + "step": 10645 + }, + { + "epoch": 0.9512364018138361, + "grad_norm": 0.5642026662826538, + "learning_rate": 6.206868077778149e-07, + "loss": 0.9281, + "step": 10646 + }, + { + "epoch": 0.951325753345098, + "grad_norm": 0.6329473853111267, + "learning_rate": 6.18415850384002e-07, + "loss": 0.887, + "step": 10647 + }, + { + "epoch": 0.9514151048763598, + "grad_norm": 0.47655338048934937, + "learning_rate": 6.161490292242278e-07, + "loss": 0.908, + "step": 10648 + }, + { + "epoch": 0.9515044564076217, + "grad_norm": 0.4613625109195709, + "learning_rate": 6.138863444883735e-07, + "loss": 0.9321, + "step": 10649 + }, + { + "epoch": 0.9515938079388836, + "grad_norm": 0.5077817440032959, + "learning_rate": 6.116277963659489e-07, + "loss": 0.986, + "step": 10650 + }, + { + "epoch": 0.9516831594701454, + "grad_norm": 0.4817694425582886, + "learning_rate": 6.093733850461358e-07, + "loss": 0.9819, + "step": 10651 + }, + { + "epoch": 0.9517725110014073, + "grad_norm": 0.6474151015281677, + "learning_rate": 6.071231107177722e-07, + "loss": 0.8317, + "step": 10652 + }, + { + "epoch": 0.9518618625326691, + "grad_norm": 0.529495894908905, + "learning_rate": 6.048769735693404e-07, + "loss": 0.872, + "step": 10653 + }, + { + "epoch": 0.951951214063931, + "grad_norm": 0.574099600315094, + "learning_rate": 6.026349737889736e-07, + "loss": 0.9059, + "step": 10654 + }, + { + "epoch": 0.9520405655951929, + "grad_norm": 0.5554847717285156, + "learning_rate": 6.003971115644657e-07, + "loss": 0.9797, + "step": 10655 + }, + { + "epoch": 0.9521299171264548, + "grad_norm": 0.5695211887359619, + "learning_rate": 5.981633870832614e-07, + "loss": 0.9082, + "step": 10656 + }, + { + "epoch": 0.9522192686577167, + "grad_norm": 0.6171822547912598, + "learning_rate": 5.959338005324611e-07, + "loss": 0.9138, + "step": 10657 + }, + { + "epoch": 0.9523086201889784, + "grad_norm": 0.4547090232372284, + "learning_rate": 5.937083520988151e-07, + "loss": 0.9651, + "step": 10658 + }, + { + "epoch": 0.9523979717202403, + "grad_norm": 0.5358104705810547, + "learning_rate": 5.914870419687247e-07, + "loss": 0.9548, + "step": 10659 + }, + { + "epoch": 0.9524873232515022, + "grad_norm": 0.5056635737419128, + "learning_rate": 5.892698703282517e-07, + "loss": 0.8873, + "step": 10660 + }, + { + "epoch": 0.9525766747827641, + "grad_norm": 0.5599133968353271, + "learning_rate": 5.870568373631091e-07, + "loss": 0.8852, + "step": 10661 + }, + { + "epoch": 0.952666026314026, + "grad_norm": 0.4533610939979553, + "learning_rate": 5.848479432586596e-07, + "loss": 0.9582, + "step": 10662 + }, + { + "epoch": 0.9527553778452879, + "grad_norm": 0.42300939559936523, + "learning_rate": 5.826431881999217e-07, + "loss": 0.938, + "step": 10663 + }, + { + "epoch": 0.9528447293765497, + "grad_norm": 0.525570273399353, + "learning_rate": 5.804425723715701e-07, + "loss": 0.9239, + "step": 10664 + }, + { + "epoch": 0.9529340809078115, + "grad_norm": 0.5064711570739746, + "learning_rate": 5.782460959579239e-07, + "loss": 0.8886, + "step": 10665 + }, + { + "epoch": 0.9530234324390734, + "grad_norm": 0.4518652558326721, + "learning_rate": 5.760537591429694e-07, + "loss": 0.9666, + "step": 10666 + }, + { + "epoch": 0.9531127839703353, + "grad_norm": 0.4667336344718933, + "learning_rate": 5.738655621103317e-07, + "loss": 0.9292, + "step": 10667 + }, + { + "epoch": 0.9532021355015972, + "grad_norm": 0.4580380916595459, + "learning_rate": 5.716815050432978e-07, + "loss": 0.934, + "step": 10668 + }, + { + "epoch": 0.953291487032859, + "grad_norm": 0.5494727492332458, + "learning_rate": 5.6950158812481e-07, + "loss": 0.9544, + "step": 10669 + }, + { + "epoch": 0.9533808385641209, + "grad_norm": 0.5223135948181152, + "learning_rate": 5.673258115374502e-07, + "loss": 0.8561, + "step": 10670 + }, + { + "epoch": 0.9534701900953828, + "grad_norm": 0.5879161953926086, + "learning_rate": 5.651541754634726e-07, + "loss": 0.8332, + "step": 10671 + }, + { + "epoch": 0.9535595416266446, + "grad_norm": 0.4340555965900421, + "learning_rate": 5.629866800847649e-07, + "loss": 0.9383, + "step": 10672 + }, + { + "epoch": 0.9536488931579065, + "grad_norm": 0.46510595083236694, + "learning_rate": 5.608233255828876e-07, + "loss": 0.8908, + "step": 10673 + }, + { + "epoch": 0.9537382446891683, + "grad_norm": 0.5728726983070374, + "learning_rate": 5.586641121390401e-07, + "loss": 0.8977, + "step": 10674 + }, + { + "epoch": 0.9538275962204302, + "grad_norm": 0.4889596402645111, + "learning_rate": 5.565090399340778e-07, + "loss": 0.9271, + "step": 10675 + }, + { + "epoch": 0.9539169477516921, + "grad_norm": 0.46741414070129395, + "learning_rate": 5.543581091485117e-07, + "loss": 0.9841, + "step": 10676 + }, + { + "epoch": 0.954006299282954, + "grad_norm": 0.48655492067337036, + "learning_rate": 5.522113199625145e-07, + "loss": 0.942, + "step": 10677 + }, + { + "epoch": 0.9540956508142159, + "grad_norm": 0.4470749497413635, + "learning_rate": 5.500686725558868e-07, + "loss": 0.9452, + "step": 10678 + }, + { + "epoch": 0.9541850023454777, + "grad_norm": 0.4536135494709015, + "learning_rate": 5.479301671081072e-07, + "loss": 0.9409, + "step": 10679 + }, + { + "epoch": 0.9542743538767395, + "grad_norm": 0.4575904905796051, + "learning_rate": 5.457958037982991e-07, + "loss": 0.9012, + "step": 10680 + }, + { + "epoch": 0.9543637054080014, + "grad_norm": 0.4889681935310364, + "learning_rate": 5.436655828052417e-07, + "loss": 0.971, + "step": 10681 + }, + { + "epoch": 0.9544530569392633, + "grad_norm": 0.443453311920166, + "learning_rate": 5.415395043073535e-07, + "loss": 0.9073, + "step": 10682 + }, + { + "epoch": 0.9545424084705252, + "grad_norm": 0.4644291400909424, + "learning_rate": 5.394175684827196e-07, + "loss": 0.9093, + "step": 10683 + }, + { + "epoch": 0.9546317600017871, + "grad_norm": 0.5638459920883179, + "learning_rate": 5.372997755090759e-07, + "loss": 0.8817, + "step": 10684 + }, + { + "epoch": 0.954721111533049, + "grad_norm": 0.42751070857048035, + "learning_rate": 5.351861255638135e-07, + "loss": 1.0156, + "step": 10685 + }, + { + "epoch": 0.9548104630643107, + "grad_norm": 0.5429770350456238, + "learning_rate": 5.330766188239689e-07, + "loss": 0.9907, + "step": 10686 + }, + { + "epoch": 0.9548998145955726, + "grad_norm": 0.5348584651947021, + "learning_rate": 5.309712554662338e-07, + "loss": 0.8703, + "step": 10687 + }, + { + "epoch": 0.9549891661268345, + "grad_norm": 0.5381792783737183, + "learning_rate": 5.288700356669618e-07, + "loss": 0.9238, + "step": 10688 + }, + { + "epoch": 0.9550785176580964, + "grad_norm": 0.5914027690887451, + "learning_rate": 5.267729596021509e-07, + "loss": 0.9369, + "step": 10689 + }, + { + "epoch": 0.9551678691893583, + "grad_norm": 0.573617696762085, + "learning_rate": 5.246800274474439e-07, + "loss": 0.8617, + "step": 10690 + }, + { + "epoch": 0.9552572207206201, + "grad_norm": 0.4635196030139923, + "learning_rate": 5.225912393781617e-07, + "loss": 0.9357, + "step": 10691 + }, + { + "epoch": 0.955346572251882, + "grad_norm": 0.4225331246852875, + "learning_rate": 5.205065955692534e-07, + "loss": 0.9849, + "step": 10692 + }, + { + "epoch": 0.9554359237831438, + "grad_norm": 0.5266596078872681, + "learning_rate": 5.184260961953236e-07, + "loss": 0.9057, + "step": 10693 + }, + { + "epoch": 0.9555252753144057, + "grad_norm": 0.48360317945480347, + "learning_rate": 5.163497414306495e-07, + "loss": 0.897, + "step": 10694 + }, + { + "epoch": 0.9556146268456676, + "grad_norm": 0.44401875138282776, + "learning_rate": 5.142775314491422e-07, + "loss": 0.9219, + "step": 10695 + }, + { + "epoch": 0.9557039783769294, + "grad_norm": 0.4794129431247711, + "learning_rate": 5.122094664243681e-07, + "loss": 0.9605, + "step": 10696 + }, + { + "epoch": 0.9557933299081913, + "grad_norm": 0.5471349358558655, + "learning_rate": 5.101455465295557e-07, + "loss": 0.9078, + "step": 10697 + }, + { + "epoch": 0.9558826814394532, + "grad_norm": 0.5558369755744934, + "learning_rate": 5.080857719375776e-07, + "loss": 0.9239, + "step": 10698 + }, + { + "epoch": 0.9559720329707151, + "grad_norm": 0.39528143405914307, + "learning_rate": 5.060301428209624e-07, + "loss": 0.9493, + "step": 10699 + }, + { + "epoch": 0.9560613845019769, + "grad_norm": 0.44044265151023865, + "learning_rate": 5.039786593518892e-07, + "loss": 0.9668, + "step": 10700 + }, + { + "epoch": 0.9561507360332387, + "grad_norm": 0.44840747117996216, + "learning_rate": 5.019313217021982e-07, + "loss": 0.943, + "step": 10701 + }, + { + "epoch": 0.9562400875645006, + "grad_norm": 0.4926467537879944, + "learning_rate": 4.998881300433688e-07, + "loss": 0.9684, + "step": 10702 + }, + { + "epoch": 0.9563294390957625, + "grad_norm": 0.44300875067710876, + "learning_rate": 4.978490845465367e-07, + "loss": 0.9856, + "step": 10703 + }, + { + "epoch": 0.9564187906270244, + "grad_norm": 0.4651431143283844, + "learning_rate": 4.958141853825038e-07, + "loss": 0.9457, + "step": 10704 + }, + { + "epoch": 0.9565081421582863, + "grad_norm": 0.4982956051826477, + "learning_rate": 4.937834327217061e-07, + "loss": 0.8796, + "step": 10705 + }, + { + "epoch": 0.956597493689548, + "grad_norm": 0.46135175228118896, + "learning_rate": 4.917568267342465e-07, + "loss": 0.8973, + "step": 10706 + }, + { + "epoch": 0.9566868452208099, + "grad_norm": 0.5590856075286865, + "learning_rate": 4.897343675898669e-07, + "loss": 0.9275, + "step": 10707 + }, + { + "epoch": 0.9567761967520718, + "grad_norm": 0.5813520550727844, + "learning_rate": 4.877160554579818e-07, + "loss": 0.9401, + "step": 10708 + }, + { + "epoch": 0.9568655482833337, + "grad_norm": 0.44653165340423584, + "learning_rate": 4.857018905076394e-07, + "loss": 1.0116, + "step": 10709 + }, + { + "epoch": 0.9569548998145956, + "grad_norm": 0.47590041160583496, + "learning_rate": 4.836918729075435e-07, + "loss": 0.9912, + "step": 10710 + }, + { + "epoch": 0.9570442513458575, + "grad_norm": 0.42629966139793396, + "learning_rate": 4.81686002826065e-07, + "loss": 0.9783, + "step": 10711 + }, + { + "epoch": 0.9571336028771193, + "grad_norm": 0.5415294766426086, + "learning_rate": 4.796842804312085e-07, + "loss": 0.9848, + "step": 10712 + }, + { + "epoch": 0.9572229544083811, + "grad_norm": 0.5828223824501038, + "learning_rate": 4.776867058906453e-07, + "loss": 0.8597, + "step": 10713 + }, + { + "epoch": 0.957312305939643, + "grad_norm": 0.5175209045410156, + "learning_rate": 4.756932793716862e-07, + "loss": 0.9513, + "step": 10714 + }, + { + "epoch": 0.9574016574709049, + "grad_norm": 0.5043903589248657, + "learning_rate": 4.737040010413085e-07, + "loss": 0.8341, + "step": 10715 + }, + { + "epoch": 0.9574910090021668, + "grad_norm": 0.47045809030532837, + "learning_rate": 4.717188710661291e-07, + "loss": 0.8929, + "step": 10716 + }, + { + "epoch": 0.9575803605334287, + "grad_norm": 0.44731298089027405, + "learning_rate": 4.697378896124316e-07, + "loss": 0.9371, + "step": 10717 + }, + { + "epoch": 0.9576697120646905, + "grad_norm": 0.47598275542259216, + "learning_rate": 4.677610568461388e-07, + "loss": 0.9035, + "step": 10718 + }, + { + "epoch": 0.9577590635959524, + "grad_norm": 0.49408817291259766, + "learning_rate": 4.657883729328405e-07, + "loss": 0.8944, + "step": 10719 + }, + { + "epoch": 0.9578484151272142, + "grad_norm": 0.41959306597709656, + "learning_rate": 4.638198380377545e-07, + "loss": 0.9322, + "step": 10720 + }, + { + "epoch": 0.9579377666584761, + "grad_norm": 0.43749403953552246, + "learning_rate": 4.6185545232577676e-07, + "loss": 0.9512, + "step": 10721 + }, + { + "epoch": 0.958027118189738, + "grad_norm": 0.426736980676651, + "learning_rate": 4.5989521596144226e-07, + "loss": 0.9449, + "step": 10722 + }, + { + "epoch": 0.9581164697209998, + "grad_norm": 0.5225173830986023, + "learning_rate": 4.579391291089419e-07, + "loss": 0.8694, + "step": 10723 + }, + { + "epoch": 0.9582058212522617, + "grad_norm": 0.6672084927558899, + "learning_rate": 4.559871919321279e-07, + "loss": 0.8843, + "step": 10724 + }, + { + "epoch": 0.9582951727835236, + "grad_norm": 0.4476062059402466, + "learning_rate": 4.5403940459448067e-07, + "loss": 0.898, + "step": 10725 + }, + { + "epoch": 0.9583845243147855, + "grad_norm": 0.4649253189563751, + "learning_rate": 4.5209576725915304e-07, + "loss": 0.8641, + "step": 10726 + }, + { + "epoch": 0.9584738758460473, + "grad_norm": 0.4393841028213501, + "learning_rate": 4.501562800889536e-07, + "loss": 0.9845, + "step": 10727 + }, + { + "epoch": 0.9585632273773091, + "grad_norm": 0.5080018639564514, + "learning_rate": 4.482209432463247e-07, + "loss": 0.8999, + "step": 10728 + }, + { + "epoch": 0.958652578908571, + "grad_norm": 0.5580213665962219, + "learning_rate": 4.46289756893381e-07, + "loss": 0.9335, + "step": 10729 + }, + { + "epoch": 0.9587419304398329, + "grad_norm": 0.6076408624649048, + "learning_rate": 4.443627211918711e-07, + "loss": 0.9558, + "step": 10730 + }, + { + "epoch": 0.9588312819710948, + "grad_norm": 0.4416392147541046, + "learning_rate": 4.424398363032101e-07, + "loss": 1.0056, + "step": 10731 + }, + { + "epoch": 0.9589206335023567, + "grad_norm": 0.4628840684890747, + "learning_rate": 4.405211023884581e-07, + "loss": 0.9612, + "step": 10732 + }, + { + "epoch": 0.9590099850336186, + "grad_norm": 0.5556047558784485, + "learning_rate": 4.3860651960832557e-07, + "loss": 1.0094, + "step": 10733 + }, + { + "epoch": 0.9590993365648803, + "grad_norm": 0.4615866541862488, + "learning_rate": 4.3669608812318965e-07, + "loss": 0.9931, + "step": 10734 + }, + { + "epoch": 0.9591886880961422, + "grad_norm": 0.4226515293121338, + "learning_rate": 4.347898080930557e-07, + "loss": 0.909, + "step": 10735 + }, + { + "epoch": 0.9592780396274041, + "grad_norm": 0.4294814169406891, + "learning_rate": 4.3288767967760715e-07, + "loss": 0.9615, + "step": 10736 + }, + { + "epoch": 0.959367391158666, + "grad_norm": 0.5078641772270203, + "learning_rate": 4.3098970303616646e-07, + "loss": 0.8305, + "step": 10737 + }, + { + "epoch": 0.9594567426899279, + "grad_norm": 0.5030995607376099, + "learning_rate": 4.2909587832770103e-07, + "loss": 1.0022, + "step": 10738 + }, + { + "epoch": 0.9595460942211897, + "grad_norm": 0.5276926159858704, + "learning_rate": 4.27206205710845e-07, + "loss": 0.8767, + "step": 10739 + }, + { + "epoch": 0.9596354457524516, + "grad_norm": 0.450339138507843, + "learning_rate": 4.2532068534387737e-07, + "loss": 0.9567, + "step": 10740 + }, + { + "epoch": 0.9597247972837134, + "grad_norm": 0.4343004822731018, + "learning_rate": 4.2343931738473284e-07, + "loss": 0.9784, + "step": 10741 + }, + { + "epoch": 0.9598141488149753, + "grad_norm": 0.45386016368865967, + "learning_rate": 4.215621019909854e-07, + "loss": 0.8928, + "step": 10742 + }, + { + "epoch": 0.9599035003462372, + "grad_norm": 0.6192027926445007, + "learning_rate": 4.196890393198871e-07, + "loss": 0.9318, + "step": 10743 + }, + { + "epoch": 0.959992851877499, + "grad_norm": 0.5403250455856323, + "learning_rate": 4.1782012952831796e-07, + "loss": 0.841, + "step": 10744 + }, + { + "epoch": 0.9600822034087609, + "grad_norm": 0.525173544883728, + "learning_rate": 4.159553727728194e-07, + "loss": 0.9311, + "step": 10745 + }, + { + "epoch": 0.9601715549400228, + "grad_norm": 0.5325625538825989, + "learning_rate": 4.140947692095887e-07, + "loss": 0.8706, + "step": 10746 + }, + { + "epoch": 0.9602609064712847, + "grad_norm": 0.49182307720184326, + "learning_rate": 4.1223831899446785e-07, + "loss": 0.9359, + "step": 10747 + }, + { + "epoch": 0.9603502580025465, + "grad_norm": 0.444618821144104, + "learning_rate": 4.103860222829603e-07, + "loss": 0.9907, + "step": 10748 + }, + { + "epoch": 0.9604396095338084, + "grad_norm": 0.4664936065673828, + "learning_rate": 4.0853787923020303e-07, + "loss": 1.0142, + "step": 10749 + }, + { + "epoch": 0.9605289610650702, + "grad_norm": 0.6530784964561462, + "learning_rate": 4.066938899910111e-07, + "loss": 0.8257, + "step": 10750 + }, + { + "epoch": 0.9606183125963321, + "grad_norm": 0.5376946330070496, + "learning_rate": 4.048540547198332e-07, + "loss": 0.8809, + "step": 10751 + }, + { + "epoch": 0.960707664127594, + "grad_norm": 0.4881064295768738, + "learning_rate": 4.030183735707682e-07, + "loss": 0.9446, + "step": 10752 + }, + { + "epoch": 0.9607970156588559, + "grad_norm": 0.46255356073379517, + "learning_rate": 4.011868466975821e-07, + "loss": 0.9665, + "step": 10753 + }, + { + "epoch": 0.9608863671901178, + "grad_norm": 0.5297724008560181, + "learning_rate": 3.9935947425368546e-07, + "loss": 0.9761, + "step": 10754 + }, + { + "epoch": 0.9609757187213795, + "grad_norm": 0.5168519616127014, + "learning_rate": 3.9753625639213366e-07, + "loss": 0.9226, + "step": 10755 + }, + { + "epoch": 0.9610650702526414, + "grad_norm": 0.5492185354232788, + "learning_rate": 3.9571719326564894e-07, + "loss": 0.8866, + "step": 10756 + }, + { + "epoch": 0.9611544217839033, + "grad_norm": 0.5541948676109314, + "learning_rate": 3.939022850265928e-07, + "loss": 0.9604, + "step": 10757 + }, + { + "epoch": 0.9612437733151652, + "grad_norm": 0.6160182952880859, + "learning_rate": 3.920915318269824e-07, + "loss": 0.8829, + "step": 10758 + }, + { + "epoch": 0.9613331248464271, + "grad_norm": 0.5799499154090881, + "learning_rate": 3.902849338184911e-07, + "loss": 0.8777, + "step": 10759 + }, + { + "epoch": 0.961422476377689, + "grad_norm": 0.5014825463294983, + "learning_rate": 3.8848249115243097e-07, + "loss": 0.9618, + "step": 10760 + }, + { + "epoch": 0.9615118279089508, + "grad_norm": 0.4629524052143097, + "learning_rate": 3.866842039797869e-07, + "loss": 0.949, + "step": 10761 + }, + { + "epoch": 0.9616011794402126, + "grad_norm": 0.46119171380996704, + "learning_rate": 3.848900724511828e-07, + "loss": 0.9926, + "step": 10762 + }, + { + "epoch": 0.9616905309714745, + "grad_norm": 0.5495138764381409, + "learning_rate": 3.8310009671689297e-07, + "loss": 0.9226, + "step": 10763 + }, + { + "epoch": 0.9617798825027364, + "grad_norm": 0.49449822306632996, + "learning_rate": 3.813142769268474e-07, + "loss": 0.9972, + "step": 10764 + }, + { + "epoch": 0.9618692340339983, + "grad_norm": 0.4808973968029022, + "learning_rate": 3.7953261323063205e-07, + "loss": 0.9218, + "step": 10765 + }, + { + "epoch": 0.9619585855652601, + "grad_norm": 0.45000120997428894, + "learning_rate": 3.7775510577747195e-07, + "loss": 0.9093, + "step": 10766 + }, + { + "epoch": 0.962047937096522, + "grad_norm": 0.42627647519111633, + "learning_rate": 3.759817547162536e-07, + "loss": 0.9849, + "step": 10767 + }, + { + "epoch": 0.9621372886277838, + "grad_norm": 0.4252340793609619, + "learning_rate": 3.742125601955249e-07, + "loss": 0.9683, + "step": 10768 + }, + { + "epoch": 0.9622266401590457, + "grad_norm": 0.45607298612594604, + "learning_rate": 3.724475223634616e-07, + "loss": 1.0085, + "step": 10769 + }, + { + "epoch": 0.9623159916903076, + "grad_norm": 0.5643563866615295, + "learning_rate": 3.706866413679122e-07, + "loss": 0.9372, + "step": 10770 + }, + { + "epoch": 0.9624053432215695, + "grad_norm": 0.5665273666381836, + "learning_rate": 3.689299173563643e-07, + "loss": 0.9523, + "step": 10771 + }, + { + "epoch": 0.9624946947528313, + "grad_norm": 0.5675973892211914, + "learning_rate": 3.6717735047597233e-07, + "loss": 0.8598, + "step": 10772 + }, + { + "epoch": 0.9625840462840932, + "grad_norm": 0.48373866081237793, + "learning_rate": 3.6542894087351896e-07, + "loss": 0.9075, + "step": 10773 + }, + { + "epoch": 0.9626733978153551, + "grad_norm": 0.5747187733650208, + "learning_rate": 3.6368468869545926e-07, + "loss": 0.94, + "step": 10774 + }, + { + "epoch": 0.9627627493466169, + "grad_norm": 0.4350042939186096, + "learning_rate": 3.619445940878929e-07, + "loss": 0.958, + "step": 10775 + }, + { + "epoch": 0.9628521008778788, + "grad_norm": 0.5669698119163513, + "learning_rate": 3.6020865719657016e-07, + "loss": 0.9641, + "step": 10776 + }, + { + "epoch": 0.9629414524091406, + "grad_norm": 0.5079529881477356, + "learning_rate": 3.584768781668968e-07, + "loss": 0.9328, + "step": 10777 + }, + { + "epoch": 0.9630308039404025, + "grad_norm": 0.6118546724319458, + "learning_rate": 3.5674925714391796e-07, + "loss": 0.8199, + "step": 10778 + }, + { + "epoch": 0.9631201554716644, + "grad_norm": 0.4477030038833618, + "learning_rate": 3.5502579427235673e-07, + "loss": 0.9591, + "step": 10779 + }, + { + "epoch": 0.9632095070029263, + "grad_norm": 0.454915314912796, + "learning_rate": 3.5330648969655876e-07, + "loss": 0.9534, + "step": 10780 + }, + { + "epoch": 0.9632988585341882, + "grad_norm": 0.5509199500083923, + "learning_rate": 3.515913435605367e-07, + "loss": 0.8643, + "step": 10781 + }, + { + "epoch": 0.96338821006545, + "grad_norm": 0.4031422734260559, + "learning_rate": 3.4988035600795886e-07, + "loss": 0.9174, + "step": 10782 + }, + { + "epoch": 0.9634775615967118, + "grad_norm": 0.636800229549408, + "learning_rate": 3.481735271821274e-07, + "loss": 1.0021, + "step": 10783 + }, + { + "epoch": 0.9635669131279737, + "grad_norm": 0.4733976721763611, + "learning_rate": 3.464708572260167e-07, + "loss": 0.9752, + "step": 10784 + }, + { + "epoch": 0.9636562646592356, + "grad_norm": 0.4569736123085022, + "learning_rate": 3.44772346282235e-07, + "loss": 0.933, + "step": 10785 + }, + { + "epoch": 0.9637456161904975, + "grad_norm": 0.49324387311935425, + "learning_rate": 3.4307799449306286e-07, + "loss": 0.9589, + "step": 10786 + }, + { + "epoch": 0.9638349677217594, + "grad_norm": 0.5697484016418457, + "learning_rate": 3.413878020004091e-07, + "loss": 0.9579, + "step": 10787 + }, + { + "epoch": 0.9639243192530212, + "grad_norm": 0.5456252694129944, + "learning_rate": 3.397017689458548e-07, + "loss": 0.8727, + "step": 10788 + }, + { + "epoch": 0.964013670784283, + "grad_norm": 0.5522463917732239, + "learning_rate": 3.3801989547061484e-07, + "loss": 0.9692, + "step": 10789 + }, + { + "epoch": 0.9641030223155449, + "grad_norm": 0.5635389685630798, + "learning_rate": 3.363421817155654e-07, + "loss": 0.8955, + "step": 10790 + }, + { + "epoch": 0.9641923738468068, + "grad_norm": 0.525848388671875, + "learning_rate": 3.3466862782123853e-07, + "loss": 0.8466, + "step": 10791 + }, + { + "epoch": 0.9642817253780687, + "grad_norm": 0.5157281160354614, + "learning_rate": 3.3299923392780543e-07, + "loss": 1.019, + "step": 10792 + }, + { + "epoch": 0.9643710769093305, + "grad_norm": 0.6579477787017822, + "learning_rate": 3.3133400017509865e-07, + "loss": 0.8579, + "step": 10793 + }, + { + "epoch": 0.9644604284405924, + "grad_norm": 0.5229929089546204, + "learning_rate": 3.2967292670260106e-07, + "loss": 0.889, + "step": 10794 + }, + { + "epoch": 0.9645497799718543, + "grad_norm": 0.5078433752059937, + "learning_rate": 3.280160136494459e-07, + "loss": 0.9574, + "step": 10795 + }, + { + "epoch": 0.9646391315031161, + "grad_norm": 0.5875293016433716, + "learning_rate": 3.2636326115441097e-07, + "loss": 0.8764, + "step": 10796 + }, + { + "epoch": 0.964728483034378, + "grad_norm": 0.47622236609458923, + "learning_rate": 3.247146693559355e-07, + "loss": 0.9398, + "step": 10797 + }, + { + "epoch": 0.9648178345656399, + "grad_norm": 0.5416803956031799, + "learning_rate": 3.2307023839210914e-07, + "loss": 0.9299, + "step": 10798 + }, + { + "epoch": 0.9649071860969017, + "grad_norm": 0.47890692949295044, + "learning_rate": 3.214299684006661e-07, + "loss": 0.9296, + "step": 10799 + }, + { + "epoch": 0.9649965376281636, + "grad_norm": 0.42355430126190186, + "learning_rate": 3.197938595189964e-07, + "loss": 0.9806, + "step": 10800 + }, + { + "epoch": 0.9650858891594255, + "grad_norm": 0.46457356214523315, + "learning_rate": 3.1816191188415166e-07, + "loss": 0.9908, + "step": 10801 + }, + { + "epoch": 0.9651752406906874, + "grad_norm": 0.5232931971549988, + "learning_rate": 3.1653412563281135e-07, + "loss": 0.8684, + "step": 10802 + }, + { + "epoch": 0.9652645922219492, + "grad_norm": 0.4481331408023834, + "learning_rate": 3.1491050090132757e-07, + "loss": 0.9863, + "step": 10803 + }, + { + "epoch": 0.965353943753211, + "grad_norm": 0.5186441540718079, + "learning_rate": 3.1329103782569145e-07, + "loss": 0.8764, + "step": 10804 + }, + { + "epoch": 0.9654432952844729, + "grad_norm": 0.5062096118927002, + "learning_rate": 3.116757365415557e-07, + "loss": 0.8986, + "step": 10805 + }, + { + "epoch": 0.9655326468157348, + "grad_norm": 0.42634356021881104, + "learning_rate": 3.1006459718421755e-07, + "loss": 0.9799, + "step": 10806 + }, + { + "epoch": 0.9656219983469967, + "grad_norm": 0.4971635937690735, + "learning_rate": 3.0845761988862464e-07, + "loss": 0.8796, + "step": 10807 + }, + { + "epoch": 0.9657113498782586, + "grad_norm": 0.5731960535049438, + "learning_rate": 3.068548047893804e-07, + "loss": 0.8347, + "step": 10808 + }, + { + "epoch": 0.9658007014095205, + "grad_norm": 0.6698244214057922, + "learning_rate": 3.0525615202073863e-07, + "loss": 0.8799, + "step": 10809 + }, + { + "epoch": 0.9658900529407822, + "grad_norm": 0.4804188907146454, + "learning_rate": 3.036616617165977e-07, + "loss": 0.959, + "step": 10810 + }, + { + "epoch": 0.9659794044720441, + "grad_norm": 0.45491814613342285, + "learning_rate": 3.020713340105175e-07, + "loss": 0.9305, + "step": 10811 + }, + { + "epoch": 0.966068756003306, + "grad_norm": 0.43385374546051025, + "learning_rate": 3.0048516903571357e-07, + "loss": 0.9343, + "step": 10812 + }, + { + "epoch": 0.9661581075345679, + "grad_norm": 0.42967671155929565, + "learning_rate": 2.989031669250297e-07, + "loss": 0.978, + "step": 10813 + }, + { + "epoch": 0.9662474590658298, + "grad_norm": 0.4216921031475067, + "learning_rate": 2.973253278109767e-07, + "loss": 1.0218, + "step": 10814 + }, + { + "epoch": 0.9663368105970916, + "grad_norm": 0.45912298560142517, + "learning_rate": 2.957516518257264e-07, + "loss": 0.9561, + "step": 10815 + }, + { + "epoch": 0.9664261621283535, + "grad_norm": 0.4412069022655487, + "learning_rate": 2.9418213910107907e-07, + "loss": 1.0146, + "step": 10816 + }, + { + "epoch": 0.9665155136596153, + "grad_norm": 0.4734534025192261, + "learning_rate": 2.9261678976850726e-07, + "loss": 1.0074, + "step": 10817 + }, + { + "epoch": 0.9666048651908772, + "grad_norm": 0.44548165798187256, + "learning_rate": 2.910556039591228e-07, + "loss": 0.9156, + "step": 10818 + }, + { + "epoch": 0.9666942167221391, + "grad_norm": 0.45493119955062866, + "learning_rate": 2.894985818036877e-07, + "loss": 0.9426, + "step": 10819 + }, + { + "epoch": 0.966783568253401, + "grad_norm": 0.4516558051109314, + "learning_rate": 2.879457234326255e-07, + "loss": 0.9776, + "step": 10820 + }, + { + "epoch": 0.9668729197846628, + "grad_norm": 0.4548396170139313, + "learning_rate": 2.8639702897599873e-07, + "loss": 0.8973, + "step": 10821 + }, + { + "epoch": 0.9669622713159247, + "grad_norm": 0.4782199263572693, + "learning_rate": 2.8485249856353147e-07, + "loss": 0.8619, + "step": 10822 + }, + { + "epoch": 0.9670516228471866, + "grad_norm": 0.6326056718826294, + "learning_rate": 2.833121323245924e-07, + "loss": 0.8101, + "step": 10823 + }, + { + "epoch": 0.9671409743784484, + "grad_norm": 0.4548356533050537, + "learning_rate": 2.817759303882006e-07, + "loss": 0.9549, + "step": 10824 + }, + { + "epoch": 0.9672303259097103, + "grad_norm": 0.5176000595092773, + "learning_rate": 2.802438928830364e-07, + "loss": 0.8751, + "step": 10825 + }, + { + "epoch": 0.9673196774409721, + "grad_norm": 0.4941912293434143, + "learning_rate": 2.7871601993741947e-07, + "loss": 0.9054, + "step": 10826 + }, + { + "epoch": 0.967409028972234, + "grad_norm": 0.5169671773910522, + "learning_rate": 2.7719231167933067e-07, + "loss": 0.9378, + "step": 10827 + }, + { + "epoch": 0.9674983805034959, + "grad_norm": 0.5568271279335022, + "learning_rate": 2.7567276823639023e-07, + "loss": 0.9216, + "step": 10828 + }, + { + "epoch": 0.9675877320347578, + "grad_norm": 0.6154254674911499, + "learning_rate": 2.741573897358796e-07, + "loss": 0.7888, + "step": 10829 + }, + { + "epoch": 0.9676770835660196, + "grad_norm": 0.4801642894744873, + "learning_rate": 2.72646176304725e-07, + "loss": 0.9863, + "step": 10830 + }, + { + "epoch": 0.9677664350972814, + "grad_norm": 0.5245612263679504, + "learning_rate": 2.711391280695086e-07, + "loss": 0.9012, + "step": 10831 + }, + { + "epoch": 0.9678557866285433, + "grad_norm": 0.5288980603218079, + "learning_rate": 2.6963624515646266e-07, + "loss": 0.9981, + "step": 10832 + }, + { + "epoch": 0.9679451381598052, + "grad_norm": 0.3883173167705536, + "learning_rate": 2.681375276914644e-07, + "loss": 0.9628, + "step": 10833 + }, + { + "epoch": 0.9680344896910671, + "grad_norm": 0.6051060557365417, + "learning_rate": 2.666429758000577e-07, + "loss": 0.8418, + "step": 10834 + }, + { + "epoch": 0.968123841222329, + "grad_norm": 0.4276251196861267, + "learning_rate": 2.651525896074203e-07, + "loss": 0.9092, + "step": 10835 + }, + { + "epoch": 0.9682131927535909, + "grad_norm": 0.48356980085372925, + "learning_rate": 2.636663692383856e-07, + "loss": 1.0608, + "step": 10836 + }, + { + "epoch": 0.9683025442848526, + "grad_norm": 0.5755243897438049, + "learning_rate": 2.62184314817443e-07, + "loss": 0.831, + "step": 10837 + }, + { + "epoch": 0.9683918958161145, + "grad_norm": 0.4929053783416748, + "learning_rate": 2.6070642646873757e-07, + "loss": 0.9273, + "step": 10838 + }, + { + "epoch": 0.9684812473473764, + "grad_norm": 0.48126858472824097, + "learning_rate": 2.5923270431604804e-07, + "loss": 0.9546, + "step": 10839 + }, + { + "epoch": 0.9685705988786383, + "grad_norm": 0.524333119392395, + "learning_rate": 2.577631484828147e-07, + "loss": 0.8945, + "step": 10840 + }, + { + "epoch": 0.9686599504099002, + "grad_norm": 0.5595930218696594, + "learning_rate": 2.5629775909213337e-07, + "loss": 0.9558, + "step": 10841 + }, + { + "epoch": 0.968749301941162, + "grad_norm": 0.507494330406189, + "learning_rate": 2.5483653626675043e-07, + "loss": 1.0194, + "step": 10842 + }, + { + "epoch": 0.9688386534724239, + "grad_norm": 0.43305492401123047, + "learning_rate": 2.5337948012904566e-07, + "loss": 0.9712, + "step": 10843 + }, + { + "epoch": 0.9689280050036857, + "grad_norm": 0.46509480476379395, + "learning_rate": 2.5192659080107704e-07, + "loss": 0.9215, + "step": 10844 + }, + { + "epoch": 0.9690173565349476, + "grad_norm": 0.44424694776535034, + "learning_rate": 2.5047786840452504e-07, + "loss": 0.9837, + "step": 10845 + }, + { + "epoch": 0.9691067080662095, + "grad_norm": 0.44861480593681335, + "learning_rate": 2.4903331306074804e-07, + "loss": 0.9725, + "step": 10846 + }, + { + "epoch": 0.9691960595974713, + "grad_norm": 0.4651372730731964, + "learning_rate": 2.475929248907383e-07, + "loss": 0.9143, + "step": 10847 + }, + { + "epoch": 0.9692854111287332, + "grad_norm": 0.5831353664398193, + "learning_rate": 2.4615670401514356e-07, + "loss": 0.9379, + "step": 10848 + }, + { + "epoch": 0.9693747626599951, + "grad_norm": 0.45759260654449463, + "learning_rate": 2.4472465055426217e-07, + "loss": 0.9993, + "step": 10849 + }, + { + "epoch": 0.969464114191257, + "grad_norm": 0.6619382500648499, + "learning_rate": 2.432967646280426e-07, + "loss": 0.8364, + "step": 10850 + }, + { + "epoch": 0.9695534657225188, + "grad_norm": 0.3795022964477539, + "learning_rate": 2.4187304635608923e-07, + "loss": 0.9585, + "step": 10851 + }, + { + "epoch": 0.9696428172537807, + "grad_norm": 0.5020806193351746, + "learning_rate": 2.4045349585765097e-07, + "loss": 0.9079, + "step": 10852 + }, + { + "epoch": 0.9697321687850425, + "grad_norm": 0.4140533208847046, + "learning_rate": 2.3903811325163283e-07, + "loss": 0.9358, + "step": 10853 + }, + { + "epoch": 0.9698215203163044, + "grad_norm": 0.6230595707893372, + "learning_rate": 2.3762689865658438e-07, + "loss": 0.8465, + "step": 10854 + }, + { + "epoch": 0.9699108718475663, + "grad_norm": 0.5708101391792297, + "learning_rate": 2.3621985219071108e-07, + "loss": 0.8622, + "step": 10855 + }, + { + "epoch": 0.9700002233788282, + "grad_norm": 0.5391978025436401, + "learning_rate": 2.3481697397187418e-07, + "loss": 0.905, + "step": 10856 + }, + { + "epoch": 0.9700895749100901, + "grad_norm": 0.4433526396751404, + "learning_rate": 2.334182641175686e-07, + "loss": 0.9798, + "step": 10857 + }, + { + "epoch": 0.9701789264413518, + "grad_norm": 0.669391393661499, + "learning_rate": 2.320237227449562e-07, + "loss": 0.8246, + "step": 10858 + }, + { + "epoch": 0.9702682779726137, + "grad_norm": 0.47016438841819763, + "learning_rate": 2.3063334997084907e-07, + "loss": 0.8787, + "step": 10859 + }, + { + "epoch": 0.9703576295038756, + "grad_norm": 0.5404382944107056, + "learning_rate": 2.2924714591170403e-07, + "loss": 0.9036, + "step": 10860 + }, + { + "epoch": 0.9704469810351375, + "grad_norm": 0.44831082224845886, + "learning_rate": 2.2786511068362826e-07, + "loss": 0.9216, + "step": 10861 + }, + { + "epoch": 0.9705363325663994, + "grad_norm": 0.44715404510498047, + "learning_rate": 2.2648724440237913e-07, + "loss": 0.9903, + "step": 10862 + }, + { + "epoch": 0.9706256840976613, + "grad_norm": 0.5612984895706177, + "learning_rate": 2.2511354718336986e-07, + "loss": 0.8568, + "step": 10863 + }, + { + "epoch": 0.9707150356289231, + "grad_norm": 0.45841625332832336, + "learning_rate": 2.2374401914166953e-07, + "loss": 0.9588, + "step": 10864 + }, + { + "epoch": 0.9708043871601849, + "grad_norm": 0.4593586325645447, + "learning_rate": 2.2237866039198085e-07, + "loss": 0.9923, + "step": 10865 + }, + { + "epoch": 0.9708937386914468, + "grad_norm": 0.46852678060531616, + "learning_rate": 2.2101747104866788e-07, + "loss": 0.9463, + "step": 10866 + }, + { + "epoch": 0.9709830902227087, + "grad_norm": 0.4527819752693176, + "learning_rate": 2.1966045122575052e-07, + "loss": 0.9759, + "step": 10867 + }, + { + "epoch": 0.9710724417539706, + "grad_norm": 0.42089274525642395, + "learning_rate": 2.1830760103688784e-07, + "loss": 0.9125, + "step": 10868 + }, + { + "epoch": 0.9711617932852324, + "grad_norm": 0.4603272080421448, + "learning_rate": 2.1695892059540035e-07, + "loss": 0.9218, + "step": 10869 + }, + { + "epoch": 0.9712511448164943, + "grad_norm": 0.444743275642395, + "learning_rate": 2.156144100142532e-07, + "loss": 0.9502, + "step": 10870 + }, + { + "epoch": 0.9713404963477562, + "grad_norm": 0.5526580810546875, + "learning_rate": 2.1427406940606187e-07, + "loss": 0.9189, + "step": 10871 + }, + { + "epoch": 0.971429847879018, + "grad_norm": 0.4762706160545349, + "learning_rate": 2.1293789888309212e-07, + "loss": 0.9151, + "step": 10872 + }, + { + "epoch": 0.9715191994102799, + "grad_norm": 0.5218154191970825, + "learning_rate": 2.116058985572711e-07, + "loss": 0.8963, + "step": 10873 + }, + { + "epoch": 0.9716085509415417, + "grad_norm": 0.5006052851676941, + "learning_rate": 2.1027806854015954e-07, + "loss": 0.9783, + "step": 10874 + }, + { + "epoch": 0.9716979024728036, + "grad_norm": 0.45393121242523193, + "learning_rate": 2.0895440894297402e-07, + "loss": 0.9236, + "step": 10875 + }, + { + "epoch": 0.9717872540040655, + "grad_norm": 0.5509636998176575, + "learning_rate": 2.0763491987659812e-07, + "loss": 0.9322, + "step": 10876 + }, + { + "epoch": 0.9718766055353274, + "grad_norm": 0.49217623472213745, + "learning_rate": 2.0631960145154338e-07, + "loss": 0.9342, + "step": 10877 + }, + { + "epoch": 0.9719659570665893, + "grad_norm": 0.48069292306900024, + "learning_rate": 2.050084537779884e-07, + "loss": 0.9461, + "step": 10878 + }, + { + "epoch": 0.972055308597851, + "grad_norm": 0.4449180066585541, + "learning_rate": 2.0370147696574526e-07, + "loss": 0.9321, + "step": 10879 + }, + { + "epoch": 0.9721446601291129, + "grad_norm": 0.5779398083686829, + "learning_rate": 2.0239867112429868e-07, + "loss": 0.9169, + "step": 10880 + }, + { + "epoch": 0.9722340116603748, + "grad_norm": 0.5261817574501038, + "learning_rate": 2.0110003636276687e-07, + "loss": 0.9289, + "step": 10881 + }, + { + "epoch": 0.9723233631916367, + "grad_norm": 0.5714827179908752, + "learning_rate": 1.9980557278992397e-07, + "loss": 0.8583, + "step": 10882 + }, + { + "epoch": 0.9724127147228986, + "grad_norm": 0.5161953568458557, + "learning_rate": 1.9851528051419988e-07, + "loss": 0.9715, + "step": 10883 + }, + { + "epoch": 0.9725020662541605, + "grad_norm": 0.5811159610748291, + "learning_rate": 1.9722915964366372e-07, + "loss": 0.8745, + "step": 10884 + }, + { + "epoch": 0.9725914177854224, + "grad_norm": 0.4967767000198364, + "learning_rate": 1.95947210286046e-07, + "loss": 0.9639, + "step": 10885 + }, + { + "epoch": 0.9726807693166841, + "grad_norm": 0.538653552532196, + "learning_rate": 1.9466943254872193e-07, + "loss": 0.9367, + "step": 10886 + }, + { + "epoch": 0.972770120847946, + "grad_norm": 0.5293512344360352, + "learning_rate": 1.9339582653871703e-07, + "loss": 0.9583, + "step": 10887 + }, + { + "epoch": 0.9728594723792079, + "grad_norm": 0.4885563254356384, + "learning_rate": 1.9212639236271256e-07, + "loss": 0.8537, + "step": 10888 + }, + { + "epoch": 0.9729488239104698, + "grad_norm": 0.5381324291229248, + "learning_rate": 1.908611301270402e-07, + "loss": 0.9268, + "step": 10889 + }, + { + "epoch": 0.9730381754417317, + "grad_norm": 0.550106406211853, + "learning_rate": 1.8960003993767073e-07, + "loss": 0.9345, + "step": 10890 + }, + { + "epoch": 0.9731275269729935, + "grad_norm": 0.534343421459198, + "learning_rate": 1.8834312190024183e-07, + "loss": 0.8775, + "step": 10891 + }, + { + "epoch": 0.9732168785042553, + "grad_norm": 0.5603974461555481, + "learning_rate": 1.8709037612003045e-07, + "loss": 0.9408, + "step": 10892 + }, + { + "epoch": 0.9733062300355172, + "grad_norm": 0.3925410509109497, + "learning_rate": 1.8584180270196926e-07, + "loss": 0.9445, + "step": 10893 + }, + { + "epoch": 0.9733955815667791, + "grad_norm": 0.44486695528030396, + "learning_rate": 1.8459740175063577e-07, + "loss": 0.9394, + "step": 10894 + }, + { + "epoch": 0.973484933098041, + "grad_norm": 0.5443863868713379, + "learning_rate": 1.8335717337026326e-07, + "loss": 0.9071, + "step": 10895 + }, + { + "epoch": 0.9735742846293028, + "grad_norm": 0.504984974861145, + "learning_rate": 1.8212111766473528e-07, + "loss": 0.8573, + "step": 10896 + }, + { + "epoch": 0.9736636361605647, + "grad_norm": 0.49120062589645386, + "learning_rate": 1.8088923473758568e-07, + "loss": 0.929, + "step": 10897 + }, + { + "epoch": 0.9737529876918266, + "grad_norm": 0.4665651321411133, + "learning_rate": 1.7966152469199305e-07, + "loss": 0.9986, + "step": 10898 + }, + { + "epoch": 0.9738423392230884, + "grad_norm": 0.46804356575012207, + "learning_rate": 1.7843798763079733e-07, + "loss": 0.9408, + "step": 10899 + }, + { + "epoch": 0.9739316907543503, + "grad_norm": 0.49248629808425903, + "learning_rate": 1.7721862365647767e-07, + "loss": 0.8993, + "step": 10900 + }, + { + "epoch": 0.9740210422856121, + "grad_norm": 0.5523046255111694, + "learning_rate": 1.7600343287116904e-07, + "loss": 0.9051, + "step": 10901 + }, + { + "epoch": 0.974110393816874, + "grad_norm": 0.46335333585739136, + "learning_rate": 1.7479241537666225e-07, + "loss": 0.9142, + "step": 10902 + }, + { + "epoch": 0.9741997453481359, + "grad_norm": 0.4312766194343567, + "learning_rate": 1.7358557127438723e-07, + "loss": 0.9891, + "step": 10903 + }, + { + "epoch": 0.9742890968793978, + "grad_norm": 0.4239676594734192, + "learning_rate": 1.7238290066543538e-07, + "loss": 0.9441, + "step": 10904 + }, + { + "epoch": 0.9743784484106597, + "grad_norm": 0.4999157190322876, + "learning_rate": 1.7118440365053722e-07, + "loss": 0.9123, + "step": 10905 + }, + { + "epoch": 0.9744677999419215, + "grad_norm": 0.5276448130607605, + "learning_rate": 1.6999008033007913e-07, + "loss": 0.8139, + "step": 10906 + }, + { + "epoch": 0.9745571514731833, + "grad_norm": 0.44836434721946716, + "learning_rate": 1.6879993080410327e-07, + "loss": 0.9032, + "step": 10907 + }, + { + "epoch": 0.9746465030044452, + "grad_norm": 0.49218374490737915, + "learning_rate": 1.6761395517230215e-07, + "loss": 1.0147, + "step": 10908 + }, + { + "epoch": 0.9747358545357071, + "grad_norm": 0.4299551248550415, + "learning_rate": 1.6643215353400188e-07, + "loss": 0.9801, + "step": 10909 + }, + { + "epoch": 0.974825206066969, + "grad_norm": 0.4688587784767151, + "learning_rate": 1.6525452598819547e-07, + "loss": 0.8839, + "step": 10910 + }, + { + "epoch": 0.9749145575982309, + "grad_norm": 0.509926438331604, + "learning_rate": 1.6408107263352069e-07, + "loss": 0.9189, + "step": 10911 + }, + { + "epoch": 0.9750039091294928, + "grad_norm": 0.6090724468231201, + "learning_rate": 1.6291179356827114e-07, + "loss": 0.9022, + "step": 10912 + }, + { + "epoch": 0.9750932606607545, + "grad_norm": 0.5029346942901611, + "learning_rate": 1.617466888903907e-07, + "loss": 0.9822, + "step": 10913 + }, + { + "epoch": 0.9751826121920164, + "grad_norm": 0.594288170337677, + "learning_rate": 1.6058575869745686e-07, + "loss": 0.8495, + "step": 10914 + }, + { + "epoch": 0.9752719637232783, + "grad_norm": 0.4893123507499695, + "learning_rate": 1.5942900308671405e-07, + "loss": 0.9206, + "step": 10915 + }, + { + "epoch": 0.9753613152545402, + "grad_norm": 0.4604051113128662, + "learning_rate": 1.58276422155057e-07, + "loss": 0.9293, + "step": 10916 + }, + { + "epoch": 0.9754506667858021, + "grad_norm": 0.4965328872203827, + "learning_rate": 1.5712801599902515e-07, + "loss": 0.8888, + "step": 10917 + }, + { + "epoch": 0.9755400183170639, + "grad_norm": 0.47335171699523926, + "learning_rate": 1.5598378471480267e-07, + "loss": 0.9663, + "step": 10918 + }, + { + "epoch": 0.9756293698483258, + "grad_norm": 0.4554191827774048, + "learning_rate": 1.548437283982407e-07, + "loss": 0.9362, + "step": 10919 + }, + { + "epoch": 0.9757187213795876, + "grad_norm": 0.4438236951828003, + "learning_rate": 1.5370784714482944e-07, + "loss": 0.9775, + "step": 10920 + }, + { + "epoch": 0.9758080729108495, + "grad_norm": 0.5050124526023865, + "learning_rate": 1.5257614104970952e-07, + "loss": 0.9673, + "step": 10921 + }, + { + "epoch": 0.9758974244421114, + "grad_norm": 0.4388897716999054, + "learning_rate": 1.5144861020767176e-07, + "loss": 0.9194, + "step": 10922 + }, + { + "epoch": 0.9759867759733732, + "grad_norm": 0.6075052618980408, + "learning_rate": 1.5032525471316284e-07, + "loss": 0.8963, + "step": 10923 + }, + { + "epoch": 0.9760761275046351, + "grad_norm": 0.4434504806995392, + "learning_rate": 1.4920607466026858e-07, + "loss": 0.9343, + "step": 10924 + }, + { + "epoch": 0.976165479035897, + "grad_norm": 0.4541495442390442, + "learning_rate": 1.4809107014274182e-07, + "loss": 0.9642, + "step": 10925 + }, + { + "epoch": 0.9762548305671589, + "grad_norm": 0.49039971828460693, + "learning_rate": 1.4698024125396892e-07, + "loss": 0.9018, + "step": 10926 + }, + { + "epoch": 0.9763441820984207, + "grad_norm": 0.46627819538116455, + "learning_rate": 1.4587358808699215e-07, + "loss": 1.0147, + "step": 10927 + }, + { + "epoch": 0.9764335336296825, + "grad_norm": 0.4772813618183136, + "learning_rate": 1.447711107345151e-07, + "loss": 0.9074, + "step": 10928 + }, + { + "epoch": 0.9765228851609444, + "grad_norm": 0.606759250164032, + "learning_rate": 1.436728092888695e-07, + "loss": 0.8712, + "step": 10929 + }, + { + "epoch": 0.9766122366922063, + "grad_norm": 0.4552794098854065, + "learning_rate": 1.4257868384206508e-07, + "loss": 0.9339, + "step": 10930 + }, + { + "epoch": 0.9767015882234682, + "grad_norm": 0.5520709753036499, + "learning_rate": 1.4148873448573408e-07, + "loss": 0.869, + "step": 10931 + }, + { + "epoch": 0.9767909397547301, + "grad_norm": 0.49193331599235535, + "learning_rate": 1.4040296131117013e-07, + "loss": 0.9923, + "step": 10932 + }, + { + "epoch": 0.976880291285992, + "grad_norm": 0.5167152881622314, + "learning_rate": 1.393213644093283e-07, + "loss": 0.8921, + "step": 10933 + }, + { + "epoch": 0.9769696428172537, + "grad_norm": 0.5435425639152527, + "learning_rate": 1.382439438707972e-07, + "loss": 0.9036, + "step": 10934 + }, + { + "epoch": 0.9770589943485156, + "grad_norm": 0.5024846196174622, + "learning_rate": 1.3717069978582687e-07, + "loss": 0.899, + "step": 10935 + }, + { + "epoch": 0.9771483458797775, + "grad_norm": 0.4377569854259491, + "learning_rate": 1.361016322443065e-07, + "loss": 0.9288, + "step": 10936 + }, + { + "epoch": 0.9772376974110394, + "grad_norm": 0.549490213394165, + "learning_rate": 1.350367413357867e-07, + "loss": 0.9464, + "step": 10937 + }, + { + "epoch": 0.9773270489423013, + "grad_norm": 0.5465390086174011, + "learning_rate": 1.3397602714946278e-07, + "loss": 0.7978, + "step": 10938 + }, + { + "epoch": 0.9774164004735632, + "grad_norm": 0.4476923942565918, + "learning_rate": 1.3291948977418033e-07, + "loss": 0.9909, + "step": 10939 + }, + { + "epoch": 0.977505752004825, + "grad_norm": 0.4725693464279175, + "learning_rate": 1.318671292984297e-07, + "loss": 0.9513, + "step": 10940 + }, + { + "epoch": 0.9775951035360868, + "grad_norm": 0.4664715528488159, + "learning_rate": 1.3081894581036813e-07, + "loss": 0.9367, + "step": 10941 + }, + { + "epoch": 0.9776844550673487, + "grad_norm": 0.5551668405532837, + "learning_rate": 1.297749393977865e-07, + "loss": 0.9184, + "step": 10942 + }, + { + "epoch": 0.9777738065986106, + "grad_norm": 0.44665080308914185, + "learning_rate": 1.2873511014813155e-07, + "loss": 0.9797, + "step": 10943 + }, + { + "epoch": 0.9778631581298725, + "grad_norm": 0.5233182907104492, + "learning_rate": 1.2769945814850582e-07, + "loss": 0.9347, + "step": 10944 + }, + { + "epoch": 0.9779525096611343, + "grad_norm": 0.48261916637420654, + "learning_rate": 1.2666798348564546e-07, + "loss": 0.9018, + "step": 10945 + }, + { + "epoch": 0.9780418611923962, + "grad_norm": 0.5972064733505249, + "learning_rate": 1.256406862459536e-07, + "loss": 0.9394, + "step": 10946 + }, + { + "epoch": 0.9781312127236581, + "grad_norm": 0.49780553579330444, + "learning_rate": 1.2461756651547807e-07, + "loss": 0.9478, + "step": 10947 + }, + { + "epoch": 0.9782205642549199, + "grad_norm": 0.48487037420272827, + "learning_rate": 1.2359862437991144e-07, + "loss": 0.8759, + "step": 10948 + }, + { + "epoch": 0.9783099157861818, + "grad_norm": 0.6491232514381409, + "learning_rate": 1.2258385992460764e-07, + "loss": 0.8842, + "step": 10949 + }, + { + "epoch": 0.9783992673174436, + "grad_norm": 0.4678862690925598, + "learning_rate": 1.215732732345598e-07, + "loss": 0.9332, + "step": 10950 + }, + { + "epoch": 0.9784886188487055, + "grad_norm": 0.4849478304386139, + "learning_rate": 1.205668643944169e-07, + "loss": 0.9802, + "step": 10951 + }, + { + "epoch": 0.9785779703799674, + "grad_norm": 0.5359340310096741, + "learning_rate": 1.195646334884726e-07, + "loss": 0.9359, + "step": 10952 + }, + { + "epoch": 0.9786673219112293, + "grad_norm": 0.49691978096961975, + "learning_rate": 1.1856658060068193e-07, + "loss": 0.9549, + "step": 10953 + }, + { + "epoch": 0.9787566734424911, + "grad_norm": 0.6014655232429504, + "learning_rate": 1.1757270581463364e-07, + "loss": 0.9857, + "step": 10954 + }, + { + "epoch": 0.978846024973753, + "grad_norm": 0.4421977400779724, + "learning_rate": 1.1658300921358334e-07, + "loss": 0.9145, + "step": 10955 + }, + { + "epoch": 0.9789353765050148, + "grad_norm": 0.552693247795105, + "learning_rate": 1.1559749088042026e-07, + "loss": 0.8431, + "step": 10956 + }, + { + "epoch": 0.9790247280362767, + "grad_norm": 0.46435338258743286, + "learning_rate": 1.1461615089770062e-07, + "loss": 0.9223, + "step": 10957 + }, + { + "epoch": 0.9791140795675386, + "grad_norm": 0.4661587178707123, + "learning_rate": 1.1363898934761974e-07, + "loss": 0.9771, + "step": 10958 + }, + { + "epoch": 0.9792034310988005, + "grad_norm": 0.5485957264900208, + "learning_rate": 1.1266600631202328e-07, + "loss": 0.9378, + "step": 10959 + }, + { + "epoch": 0.9792927826300624, + "grad_norm": 0.5378150939941406, + "learning_rate": 1.1169720187240718e-07, + "loss": 0.9114, + "step": 10960 + }, + { + "epoch": 0.9793821341613241, + "grad_norm": 0.5579792261123657, + "learning_rate": 1.1073257610991761e-07, + "loss": 0.9124, + "step": 10961 + }, + { + "epoch": 0.979471485692586, + "grad_norm": 0.3841468393802643, + "learning_rate": 1.0977212910536217e-07, + "loss": 1.0001, + "step": 10962 + }, + { + "epoch": 0.9795608372238479, + "grad_norm": 0.4615533649921417, + "learning_rate": 1.0881586093918205e-07, + "loss": 0.918, + "step": 10963 + }, + { + "epoch": 0.9796501887551098, + "grad_norm": 0.46879342198371887, + "learning_rate": 1.0786377169147432e-07, + "loss": 0.982, + "step": 10964 + }, + { + "epoch": 0.9797395402863717, + "grad_norm": 0.41505542397499084, + "learning_rate": 1.0691586144199184e-07, + "loss": 0.9517, + "step": 10965 + }, + { + "epoch": 0.9798288918176336, + "grad_norm": 0.5566489696502686, + "learning_rate": 1.059721302701211e-07, + "loss": 0.8337, + "step": 10966 + }, + { + "epoch": 0.9799182433488954, + "grad_norm": 0.5348109602928162, + "learning_rate": 1.0503257825492108e-07, + "loss": 1.0199, + "step": 10967 + }, + { + "epoch": 0.9800075948801572, + "grad_norm": 0.47354453802108765, + "learning_rate": 1.040972054750844e-07, + "loss": 0.9287, + "step": 10968 + }, + { + "epoch": 0.9800969464114191, + "grad_norm": 0.5585243105888367, + "learning_rate": 1.0316601200895948e-07, + "loss": 0.9726, + "step": 10969 + }, + { + "epoch": 0.980186297942681, + "grad_norm": 0.5595225095748901, + "learning_rate": 1.022389979345395e-07, + "loss": 0.8816, + "step": 10970 + }, + { + "epoch": 0.9802756494739429, + "grad_norm": 0.5292766094207764, + "learning_rate": 1.0131616332947346e-07, + "loss": 0.9289, + "step": 10971 + }, + { + "epoch": 0.9803650010052047, + "grad_norm": 0.42903298139572144, + "learning_rate": 1.003975082710662e-07, + "loss": 0.9523, + "step": 10972 + }, + { + "epoch": 0.9804543525364666, + "grad_norm": 0.43353918194770813, + "learning_rate": 9.948303283625615e-08, + "loss": 0.9218, + "step": 10973 + }, + { + "epoch": 0.9805437040677285, + "grad_norm": 0.6073578596115112, + "learning_rate": 9.857273710164871e-08, + "loss": 0.9989, + "step": 10974 + }, + { + "epoch": 0.9806330555989903, + "grad_norm": 0.4935568869113922, + "learning_rate": 9.766662114347736e-08, + "loss": 0.9325, + "step": 10975 + }, + { + "epoch": 0.9807224071302522, + "grad_norm": 0.6012499332427979, + "learning_rate": 9.676468503765357e-08, + "loss": 0.9096, + "step": 10976 + }, + { + "epoch": 0.980811758661514, + "grad_norm": 0.5922300219535828, + "learning_rate": 9.586692885971137e-08, + "loss": 0.8704, + "step": 10977 + }, + { + "epoch": 0.9809011101927759, + "grad_norm": 0.4871740937232971, + "learning_rate": 9.49733526848573e-08, + "loss": 0.901, + "step": 10978 + }, + { + "epoch": 0.9809904617240378, + "grad_norm": 0.5010733604431152, + "learning_rate": 9.4083956587937e-08, + "loss": 0.9744, + "step": 10979 + }, + { + "epoch": 0.9810798132552997, + "grad_norm": 0.5309439301490784, + "learning_rate": 9.319874064344092e-08, + "loss": 1.0236, + "step": 10980 + }, + { + "epoch": 0.9811691647865616, + "grad_norm": 0.542961597442627, + "learning_rate": 9.231770492552084e-08, + "loss": 0.9089, + "step": 10981 + }, + { + "epoch": 0.9812585163178233, + "grad_norm": 0.5267024636268616, + "learning_rate": 9.144084950796772e-08, + "loss": 0.9133, + "step": 10982 + }, + { + "epoch": 0.9813478678490852, + "grad_norm": 0.48576876521110535, + "learning_rate": 9.056817446422839e-08, + "loss": 0.9561, + "step": 10983 + }, + { + "epoch": 0.9814372193803471, + "grad_norm": 0.4452846944332123, + "learning_rate": 8.969967986740546e-08, + "loss": 1.0614, + "step": 10984 + }, + { + "epoch": 0.981526570911609, + "grad_norm": 0.5701860785484314, + "learning_rate": 8.883536579024077e-08, + "loss": 0.8198, + "step": 10985 + }, + { + "epoch": 0.9816159224428709, + "grad_norm": 0.4147273898124695, + "learning_rate": 8.797523230512639e-08, + "loss": 0.9686, + "step": 10986 + }, + { + "epoch": 0.9817052739741328, + "grad_norm": 0.5088819861412048, + "learning_rate": 8.711927948411025e-08, + "loss": 0.913, + "step": 10987 + }, + { + "epoch": 0.9817946255053946, + "grad_norm": 0.5437602996826172, + "learning_rate": 8.626750739888501e-08, + "loss": 1.0007, + "step": 10988 + }, + { + "epoch": 0.9818839770366564, + "grad_norm": 0.6010596752166748, + "learning_rate": 8.541991612080469e-08, + "loss": 0.8717, + "step": 10989 + }, + { + "epoch": 0.9819733285679183, + "grad_norm": 0.44414857029914856, + "learning_rate": 8.457650572085141e-08, + "loss": 0.9774, + "step": 10990 + }, + { + "epoch": 0.9820626800991802, + "grad_norm": 0.4728522002696991, + "learning_rate": 8.373727626967975e-08, + "loss": 0.9735, + "step": 10991 + }, + { + "epoch": 0.9821520316304421, + "grad_norm": 0.543158233165741, + "learning_rate": 8.290222783757795e-08, + "loss": 0.8637, + "step": 10992 + }, + { + "epoch": 0.982241383161704, + "grad_norm": 0.4106288254261017, + "learning_rate": 8.20713604944956e-08, + "loss": 0.9912, + "step": 10993 + }, + { + "epoch": 0.9823307346929658, + "grad_norm": 0.40964820981025696, + "learning_rate": 8.124467431002148e-08, + "loss": 0.9706, + "step": 10994 + }, + { + "epoch": 0.9824200862242277, + "grad_norm": 0.4523217976093292, + "learning_rate": 8.042216935340019e-08, + "loss": 0.9801, + "step": 10995 + }, + { + "epoch": 0.9825094377554895, + "grad_norm": 0.5060840249061584, + "learning_rate": 7.960384569353219e-08, + "loss": 0.9665, + "step": 10996 + }, + { + "epoch": 0.9825987892867514, + "grad_norm": 0.5814510583877563, + "learning_rate": 7.878970339894598e-08, + "loss": 0.9288, + "step": 10997 + }, + { + "epoch": 0.9826881408180133, + "grad_norm": 0.45717230439186096, + "learning_rate": 7.797974253785367e-08, + "loss": 0.9257, + "step": 10998 + }, + { + "epoch": 0.9827774923492751, + "grad_norm": 0.3922685384750366, + "learning_rate": 7.717396317808433e-08, + "loss": 0.9438, + "step": 10999 + }, + { + "epoch": 0.982866843880537, + "grad_norm": 0.5810117125511169, + "learning_rate": 7.637236538713399e-08, + "loss": 0.8851, + "step": 11000 + }, + { + "epoch": 0.9829561954117989, + "grad_norm": 0.5308172106742859, + "learning_rate": 7.557494923214337e-08, + "loss": 0.8542, + "step": 11001 + }, + { + "epoch": 0.9830455469430608, + "grad_norm": 0.48735228180885315, + "learning_rate": 7.478171477990902e-08, + "loss": 0.9284, + "step": 11002 + }, + { + "epoch": 0.9831348984743226, + "grad_norm": 0.5343984365463257, + "learning_rate": 7.399266209687228e-08, + "loss": 0.9664, + "step": 11003 + }, + { + "epoch": 0.9832242500055844, + "grad_norm": 0.40684744715690613, + "learning_rate": 7.32077912491247e-08, + "loss": 0.9746, + "step": 11004 + }, + { + "epoch": 0.9833136015368463, + "grad_norm": 0.4683247208595276, + "learning_rate": 7.242710230240257e-08, + "loss": 0.9381, + "step": 11005 + }, + { + "epoch": 0.9834029530681082, + "grad_norm": 0.4208630323410034, + "learning_rate": 7.165059532210361e-08, + "loss": 0.9214, + "step": 11006 + }, + { + "epoch": 0.9834923045993701, + "grad_norm": 0.45483431220054626, + "learning_rate": 7.087827037325912e-08, + "loss": 0.9788, + "step": 11007 + }, + { + "epoch": 0.983581656130632, + "grad_norm": 0.6926810145378113, + "learning_rate": 7.011012752056733e-08, + "loss": 0.826, + "step": 11008 + }, + { + "epoch": 0.9836710076618939, + "grad_norm": 0.4510803818702698, + "learning_rate": 6.934616682837125e-08, + "loss": 0.9017, + "step": 11009 + }, + { + "epoch": 0.9837603591931556, + "grad_norm": 0.5115488171577454, + "learning_rate": 6.8586388360653e-08, + "loss": 0.8919, + "step": 11010 + }, + { + "epoch": 0.9838497107244175, + "grad_norm": 0.5512319207191467, + "learning_rate": 6.783079218105614e-08, + "loss": 0.9227, + "step": 11011 + }, + { + "epoch": 0.9839390622556794, + "grad_norm": 0.5396715998649597, + "learning_rate": 6.707937835286893e-08, + "loss": 0.9214, + "step": 11012 + }, + { + "epoch": 0.9840284137869413, + "grad_norm": 0.5042988061904907, + "learning_rate": 6.63321469390299e-08, + "loss": 1.0184, + "step": 11013 + }, + { + "epoch": 0.9841177653182032, + "grad_norm": 0.5353363752365112, + "learning_rate": 6.558909800212787e-08, + "loss": 0.929, + "step": 11014 + }, + { + "epoch": 0.984207116849465, + "grad_norm": 0.5171821117401123, + "learning_rate": 6.485023160440195e-08, + "loss": 0.8725, + "step": 11015 + }, + { + "epoch": 0.9842964683807269, + "grad_norm": 0.44513246417045593, + "learning_rate": 6.411554780774154e-08, + "loss": 0.9139, + "step": 11016 + }, + { + "epoch": 0.9843858199119887, + "grad_norm": 0.4491952061653137, + "learning_rate": 6.338504667368072e-08, + "loss": 0.9752, + "step": 11017 + }, + { + "epoch": 0.9844751714432506, + "grad_norm": 0.47702059149742126, + "learning_rate": 6.265872826340946e-08, + "loss": 0.9612, + "step": 11018 + }, + { + "epoch": 0.9845645229745125, + "grad_norm": 0.6224638223648071, + "learning_rate": 6.193659263776242e-08, + "loss": 0.9551, + "step": 11019 + }, + { + "epoch": 0.9846538745057744, + "grad_norm": 0.521059513092041, + "learning_rate": 6.121863985722454e-08, + "loss": 0.9127, + "step": 11020 + }, + { + "epoch": 0.9847432260370362, + "grad_norm": 0.4040971100330353, + "learning_rate": 6.05048699819366e-08, + "loss": 0.9459, + "step": 11021 + }, + { + "epoch": 0.9848325775682981, + "grad_norm": 0.6411527395248413, + "learning_rate": 5.979528307168414e-08, + "loss": 0.9753, + "step": 11022 + }, + { + "epoch": 0.9849219290995599, + "grad_norm": 0.469348669052124, + "learning_rate": 5.908987918589737e-08, + "loss": 0.883, + "step": 11023 + }, + { + "epoch": 0.9850112806308218, + "grad_norm": 0.45892199873924255, + "learning_rate": 5.838865838366792e-08, + "loss": 0.9839, + "step": 11024 + }, + { + "epoch": 0.9851006321620837, + "grad_norm": 0.5176712870597839, + "learning_rate": 5.769162072373213e-08, + "loss": 0.9604, + "step": 11025 + }, + { + "epoch": 0.9851899836933455, + "grad_norm": 0.45386213064193726, + "learning_rate": 5.699876626446554e-08, + "loss": 0.9464, + "step": 11026 + }, + { + "epoch": 0.9852793352246074, + "grad_norm": 0.5465309619903564, + "learning_rate": 5.6310095063905056e-08, + "loss": 0.8722, + "step": 11027 + }, + { + "epoch": 0.9853686867558693, + "grad_norm": 0.4664804935455322, + "learning_rate": 5.562560717973786e-08, + "loss": 0.9386, + "step": 11028 + }, + { + "epoch": 0.9854580382871312, + "grad_norm": 0.4405352771282196, + "learning_rate": 5.494530266929032e-08, + "loss": 0.9666, + "step": 11029 + }, + { + "epoch": 0.985547389818393, + "grad_norm": 0.41485795378685, + "learning_rate": 5.426918158955574e-08, + "loss": 1.0112, + "step": 11030 + }, + { + "epoch": 0.9856367413496548, + "grad_norm": 0.45383763313293457, + "learning_rate": 5.359724399715549e-08, + "loss": 1.0, + "step": 11031 + }, + { + "epoch": 0.9857260928809167, + "grad_norm": 0.44838976860046387, + "learning_rate": 5.2929489948377876e-08, + "loss": 0.926, + "step": 11032 + }, + { + "epoch": 0.9858154444121786, + "grad_norm": 0.49654853343963623, + "learning_rate": 5.226591949915594e-08, + "loss": 0.9731, + "step": 11033 + }, + { + "epoch": 0.9859047959434405, + "grad_norm": 0.4356088936328888, + "learning_rate": 5.1606532705067436e-08, + "loss": 1.0049, + "step": 11034 + }, + { + "epoch": 0.9859941474747024, + "grad_norm": 0.5468194484710693, + "learning_rate": 5.0951329621340416e-08, + "loss": 0.9498, + "step": 11035 + }, + { + "epoch": 0.9860834990059643, + "grad_norm": 0.4790467321872711, + "learning_rate": 5.0300310302858754e-08, + "loss": 0.9816, + "step": 11036 + }, + { + "epoch": 0.986172850537226, + "grad_norm": 0.4556531012058258, + "learning_rate": 4.965347480415106e-08, + "loss": 0.9038, + "step": 11037 + }, + { + "epoch": 0.9862622020684879, + "grad_norm": 0.4872002601623535, + "learning_rate": 4.901082317940176e-08, + "loss": 0.9294, + "step": 11038 + }, + { + "epoch": 0.9863515535997498, + "grad_norm": 0.47440412640571594, + "learning_rate": 4.837235548242891e-08, + "loss": 0.9039, + "step": 11039 + }, + { + "epoch": 0.9864409051310117, + "grad_norm": 0.616805374622345, + "learning_rate": 4.773807176672307e-08, + "loss": 0.9557, + "step": 11040 + }, + { + "epoch": 0.9865302566622736, + "grad_norm": 0.6758015751838684, + "learning_rate": 4.710797208540285e-08, + "loss": 0.8747, + "step": 11041 + }, + { + "epoch": 0.9866196081935354, + "grad_norm": 0.5714487433433533, + "learning_rate": 4.648205649124826e-08, + "loss": 0.9604, + "step": 11042 + }, + { + "epoch": 0.9867089597247973, + "grad_norm": 0.5039824843406677, + "learning_rate": 4.586032503668958e-08, + "loss": 0.949, + "step": 11043 + }, + { + "epoch": 0.9867983112560591, + "grad_norm": 0.45489272475242615, + "learning_rate": 4.5242777773801816e-08, + "loss": 0.9297, + "step": 11044 + }, + { + "epoch": 0.986887662787321, + "grad_norm": 0.49669596552848816, + "learning_rate": 4.4629414754310264e-08, + "loss": 1.0207, + "step": 11045 + }, + { + "epoch": 0.9869770143185829, + "grad_norm": 0.4577721059322357, + "learning_rate": 4.402023602959604e-08, + "loss": 0.9492, + "step": 11046 + }, + { + "epoch": 0.9870663658498448, + "grad_norm": 0.4786894619464874, + "learning_rate": 4.34152416506739e-08, + "loss": 0.9994, + "step": 11047 + }, + { + "epoch": 0.9871557173811066, + "grad_norm": 0.49558448791503906, + "learning_rate": 4.281443166822552e-08, + "loss": 0.928, + "step": 11048 + }, + { + "epoch": 0.9872450689123685, + "grad_norm": 0.5031788945198059, + "learning_rate": 4.221780613257176e-08, + "loss": 0.9041, + "step": 11049 + }, + { + "epoch": 0.9873344204436304, + "grad_norm": 0.4300597310066223, + "learning_rate": 4.1625365093689306e-08, + "loss": 1.0044, + "step": 11050 + }, + { + "epoch": 0.9874237719748922, + "grad_norm": 0.5299180746078491, + "learning_rate": 4.103710860120513e-08, + "loss": 0.9538, + "step": 11051 + }, + { + "epoch": 0.9875131235061541, + "grad_norm": 0.4451320469379425, + "learning_rate": 4.045303670438538e-08, + "loss": 0.9107, + "step": 11052 + }, + { + "epoch": 0.9876024750374159, + "grad_norm": 0.4782625138759613, + "learning_rate": 3.987314945215204e-08, + "loss": 0.962, + "step": 11053 + }, + { + "epoch": 0.9876918265686778, + "grad_norm": 0.4866953194141388, + "learning_rate": 3.929744689307735e-08, + "loss": 0.9365, + "step": 11054 + }, + { + "epoch": 0.9877811780999397, + "grad_norm": 0.5117444396018982, + "learning_rate": 3.872592907538941e-08, + "loss": 0.8583, + "step": 11055 + }, + { + "epoch": 0.9878705296312016, + "grad_norm": 0.4728601574897766, + "learning_rate": 3.815859604694993e-08, + "loss": 0.8894, + "step": 11056 + }, + { + "epoch": 0.9879598811624635, + "grad_norm": 0.4523528814315796, + "learning_rate": 3.759544785528757e-08, + "loss": 1.0043, + "step": 11057 + }, + { + "epoch": 0.9880492326937252, + "grad_norm": 0.446228951215744, + "learning_rate": 3.7036484547564585e-08, + "loss": 0.9598, + "step": 11058 + }, + { + "epoch": 0.9881385842249871, + "grad_norm": 0.5366201996803284, + "learning_rate": 3.6481706170599094e-08, + "loss": 0.8954, + "step": 11059 + }, + { + "epoch": 0.988227935756249, + "grad_norm": 0.6366725564002991, + "learning_rate": 3.593111277086503e-08, + "loss": 0.9405, + "step": 11060 + }, + { + "epoch": 0.9883172872875109, + "grad_norm": 0.44401809573173523, + "learning_rate": 3.538470439448105e-08, + "loss": 0.9049, + "step": 11061 + }, + { + "epoch": 0.9884066388187728, + "grad_norm": 0.48204681277275085, + "learning_rate": 3.4842481087216104e-08, + "loss": 0.8971, + "step": 11062 + }, + { + "epoch": 0.9884959903500347, + "grad_norm": 0.4742887616157532, + "learning_rate": 3.4304442894478316e-08, + "loss": 0.9513, + "step": 11063 + }, + { + "epoch": 0.9885853418812965, + "grad_norm": 0.49685031175613403, + "learning_rate": 3.377058986134274e-08, + "loss": 0.9369, + "step": 11064 + }, + { + "epoch": 0.9886746934125583, + "grad_norm": 0.41819649934768677, + "learning_rate": 3.324092203251805e-08, + "loss": 0.9412, + "step": 11065 + }, + { + "epoch": 0.9887640449438202, + "grad_norm": 0.4875706434249878, + "learning_rate": 3.271543945237987e-08, + "loss": 0.8598, + "step": 11066 + }, + { + "epoch": 0.9888533964750821, + "grad_norm": 0.6076412200927734, + "learning_rate": 3.219414216493188e-08, + "loss": 0.9209, + "step": 11067 + }, + { + "epoch": 0.988942748006344, + "grad_norm": 0.5355422496795654, + "learning_rate": 3.167703021384471e-08, + "loss": 0.9449, + "step": 11068 + }, + { + "epoch": 0.9890320995376058, + "grad_norm": 0.48543617129325867, + "learning_rate": 3.1164103642428165e-08, + "loss": 0.9368, + "step": 11069 + }, + { + "epoch": 0.9891214510688677, + "grad_norm": 0.4472845494747162, + "learning_rate": 3.0655362493647865e-08, + "loss": 0.9731, + "step": 11070 + }, + { + "epoch": 0.9892108026001296, + "grad_norm": 0.4854414761066437, + "learning_rate": 3.015080681011972e-08, + "loss": 0.9416, + "step": 11071 + }, + { + "epoch": 0.9893001541313914, + "grad_norm": 0.5297408103942871, + "learning_rate": 2.965043663409883e-08, + "loss": 0.8901, + "step": 11072 + }, + { + "epoch": 0.9893895056626533, + "grad_norm": 0.5325205326080322, + "learning_rate": 2.9154252007496108e-08, + "loss": 0.8408, + "step": 11073 + }, + { + "epoch": 0.9894788571939152, + "grad_norm": 0.42406973242759705, + "learning_rate": 2.86622529718783e-08, + "loss": 0.9288, + "step": 11074 + }, + { + "epoch": 0.989568208725177, + "grad_norm": 0.49581971764564514, + "learning_rate": 2.817443956845689e-08, + "loss": 0.9434, + "step": 11075 + }, + { + "epoch": 0.9896575602564389, + "grad_norm": 0.5132964849472046, + "learning_rate": 2.769081183808253e-08, + "loss": 0.9745, + "step": 11076 + }, + { + "epoch": 0.9897469117877008, + "grad_norm": 0.4875660538673401, + "learning_rate": 2.7211369821272816e-08, + "loss": 0.9265, + "step": 11077 + }, + { + "epoch": 0.9898362633189627, + "grad_norm": 0.46364524960517883, + "learning_rate": 2.6736113558178954e-08, + "loss": 0.9082, + "step": 11078 + }, + { + "epoch": 0.9899256148502245, + "grad_norm": 0.4320114254951477, + "learning_rate": 2.626504308861355e-08, + "loss": 0.8796, + "step": 11079 + }, + { + "epoch": 0.9900149663814863, + "grad_norm": 0.471098393201828, + "learning_rate": 2.5798158452033927e-08, + "loss": 0.8859, + "step": 11080 + }, + { + "epoch": 0.9901043179127482, + "grad_norm": 0.5627569556236267, + "learning_rate": 2.533545968754214e-08, + "loss": 0.8719, + "step": 11081 + }, + { + "epoch": 0.9901936694440101, + "grad_norm": 0.4802547097206116, + "learning_rate": 2.4876946833901628e-08, + "loss": 0.949, + "step": 11082 + }, + { + "epoch": 0.990283020975272, + "grad_norm": 0.4991259276866913, + "learning_rate": 2.442261992950945e-08, + "loss": 0.8312, + "step": 11083 + }, + { + "epoch": 0.9903723725065339, + "grad_norm": 0.4693763256072998, + "learning_rate": 2.3972479012429605e-08, + "loss": 0.9392, + "step": 11084 + }, + { + "epoch": 0.9904617240377956, + "grad_norm": 0.6424983143806458, + "learning_rate": 2.3526524120354164e-08, + "loss": 0.8377, + "step": 11085 + }, + { + "epoch": 0.9905510755690575, + "grad_norm": 0.601347029209137, + "learning_rate": 2.3084755290647685e-08, + "loss": 0.8502, + "step": 11086 + }, + { + "epoch": 0.9906404271003194, + "grad_norm": 0.43913698196411133, + "learning_rate": 2.264717256030835e-08, + "loss": 0.9496, + "step": 11087 + }, + { + "epoch": 0.9907297786315813, + "grad_norm": 0.6155788898468018, + "learning_rate": 2.2213775965984617e-08, + "loss": 0.9325, + "step": 11088 + }, + { + "epoch": 0.9908191301628432, + "grad_norm": 0.48678359389305115, + "learning_rate": 2.1784565543986337e-08, + "loss": 0.9062, + "step": 11089 + }, + { + "epoch": 0.9909084816941051, + "grad_norm": 0.5303758978843689, + "learning_rate": 2.135954133025697e-08, + "loss": 0.8908, + "step": 11090 + }, + { + "epoch": 0.9909978332253669, + "grad_norm": 0.4228883385658264, + "learning_rate": 2.093870336040138e-08, + "loss": 0.9731, + "step": 11091 + }, + { + "epoch": 0.9910871847566287, + "grad_norm": 0.5669239163398743, + "learning_rate": 2.052205166966914e-08, + "loss": 0.9193, + "step": 11092 + }, + { + "epoch": 0.9911765362878906, + "grad_norm": 0.5962837338447571, + "learning_rate": 2.0109586292960116e-08, + "loss": 0.8922, + "step": 11093 + }, + { + "epoch": 0.9912658878191525, + "grad_norm": 0.47855356335639954, + "learning_rate": 1.9701307264818893e-08, + "loss": 0.8838, + "step": 11094 + }, + { + "epoch": 0.9913552393504144, + "grad_norm": 0.5598346590995789, + "learning_rate": 1.929721461944034e-08, + "loss": 0.8942, + "step": 11095 + }, + { + "epoch": 0.9914445908816762, + "grad_norm": 0.45872122049331665, + "learning_rate": 1.889730839068071e-08, + "loss": 1.0056, + "step": 11096 + }, + { + "epoch": 0.9915339424129381, + "grad_norm": 0.5087199807167053, + "learning_rate": 1.8501588612029887e-08, + "loss": 0.8756, + "step": 11097 + }, + { + "epoch": 0.9916232939442, + "grad_norm": 0.4826347529888153, + "learning_rate": 1.8110055316633567e-08, + "loss": 0.8969, + "step": 11098 + }, + { + "epoch": 0.9917126454754618, + "grad_norm": 0.4179272949695587, + "learning_rate": 1.7722708537293296e-08, + "loss": 0.9523, + "step": 11099 + }, + { + "epoch": 0.9918019970067237, + "grad_norm": 0.44711118936538696, + "learning_rate": 1.7339548306449794e-08, + "loss": 0.9084, + "step": 11100 + }, + { + "epoch": 0.9918913485379856, + "grad_norm": 0.4555767774581909, + "learning_rate": 1.69605746561885e-08, + "loss": 0.9573, + "step": 11101 + }, + { + "epoch": 0.9919807000692474, + "grad_norm": 0.5947062373161316, + "learning_rate": 1.6585787618267346e-08, + "loss": 0.8424, + "step": 11102 + }, + { + "epoch": 0.9920700516005093, + "grad_norm": 0.4358064532279968, + "learning_rate": 1.6215187224066787e-08, + "loss": 0.911, + "step": 11103 + }, + { + "epoch": 0.9921594031317712, + "grad_norm": 0.5996667742729187, + "learning_rate": 1.5848773504634207e-08, + "loss": 0.8718, + "step": 11104 + }, + { + "epoch": 0.9922487546630331, + "grad_norm": 0.4151008129119873, + "learning_rate": 1.5486546490661724e-08, + "loss": 0.9816, + "step": 11105 + }, + { + "epoch": 0.9923381061942949, + "grad_norm": 0.5637631416320801, + "learning_rate": 1.5128506212486183e-08, + "loss": 0.9081, + "step": 11106 + }, + { + "epoch": 0.9924274577255567, + "grad_norm": 0.46075546741485596, + "learning_rate": 1.4774652700100256e-08, + "loss": 0.8814, + "step": 11107 + }, + { + "epoch": 0.9925168092568186, + "grad_norm": 0.46446409821510315, + "learning_rate": 1.4424985983141348e-08, + "loss": 0.9606, + "step": 11108 + }, + { + "epoch": 0.9926061607880805, + "grad_norm": 0.4711821377277374, + "learning_rate": 1.4079506090891592e-08, + "loss": 0.932, + "step": 11109 + }, + { + "epoch": 0.9926955123193424, + "grad_norm": 0.5269238352775574, + "learning_rate": 1.3738213052300053e-08, + "loss": 0.8762, + "step": 11110 + }, + { + "epoch": 0.9927848638506043, + "grad_norm": 0.4487016499042511, + "learning_rate": 1.3401106895949422e-08, + "loss": 0.9347, + "step": 11111 + }, + { + "epoch": 0.9928742153818662, + "grad_norm": 0.6086486577987671, + "learning_rate": 1.3068187650072672e-08, + "loss": 0.8741, + "step": 11112 + }, + { + "epoch": 0.9929635669131279, + "grad_norm": 0.540021538734436, + "learning_rate": 1.2739455342558603e-08, + "loss": 0.8542, + "step": 11113 + }, + { + "epoch": 0.9930529184443898, + "grad_norm": 0.43035051226615906, + "learning_rate": 1.241491000094075e-08, + "loss": 0.8728, + "step": 11114 + }, + { + "epoch": 0.9931422699756517, + "grad_norm": 0.4739319980144501, + "learning_rate": 1.2094551652402919e-08, + "loss": 0.9627, + "step": 11115 + }, + { + "epoch": 0.9932316215069136, + "grad_norm": 0.42306095361709595, + "learning_rate": 1.1778380323779203e-08, + "loss": 0.9101, + "step": 11116 + }, + { + "epoch": 0.9933209730381755, + "grad_norm": 0.46345555782318115, + "learning_rate": 1.1466396041553973e-08, + "loss": 0.8924, + "step": 11117 + }, + { + "epoch": 0.9934103245694373, + "grad_norm": 0.4990481734275818, + "learning_rate": 1.1158598831856326e-08, + "loss": 0.9019, + "step": 11118 + }, + { + "epoch": 0.9934996761006992, + "grad_norm": 0.4848730266094208, + "learning_rate": 1.0854988720471193e-08, + "loss": 0.9512, + "step": 11119 + }, + { + "epoch": 0.993589027631961, + "grad_norm": 0.446789413690567, + "learning_rate": 1.0555565732822681e-08, + "loss": 0.951, + "step": 11120 + }, + { + "epoch": 0.9936783791632229, + "grad_norm": 0.5458630323410034, + "learning_rate": 1.0260329893996278e-08, + "loss": 0.9909, + "step": 11121 + }, + { + "epoch": 0.9937677306944848, + "grad_norm": 0.4132474660873413, + "learning_rate": 9.969281228722204e-09, + "loss": 0.9564, + "step": 11122 + }, + { + "epoch": 0.9938570822257466, + "grad_norm": 0.47372639179229736, + "learning_rate": 9.682419761369855e-09, + "loss": 0.9558, + "step": 11123 + }, + { + "epoch": 0.9939464337570085, + "grad_norm": 0.48265764117240906, + "learning_rate": 9.399745515981106e-09, + "loss": 0.94, + "step": 11124 + }, + { + "epoch": 0.9940357852882704, + "grad_norm": 0.4469437003135681, + "learning_rate": 9.121258516220366e-09, + "loss": 0.9356, + "step": 11125 + }, + { + "epoch": 0.9941251368195323, + "grad_norm": 0.553846538066864, + "learning_rate": 8.846958785418968e-09, + "loss": 0.941, + "step": 11126 + }, + { + "epoch": 0.9942144883507941, + "grad_norm": 0.46764692664146423, + "learning_rate": 8.57684634655298e-09, + "loss": 0.981, + "step": 11127 + }, + { + "epoch": 0.994303839882056, + "grad_norm": 0.5349944233894348, + "learning_rate": 8.310921222243195e-09, + "loss": 0.9157, + "step": 11128 + }, + { + "epoch": 0.9943931914133178, + "grad_norm": 0.47920531034469604, + "learning_rate": 8.04918343477179e-09, + "loss": 0.9246, + "step": 11129 + }, + { + "epoch": 0.9944825429445797, + "grad_norm": 0.4930896759033203, + "learning_rate": 7.791633006054567e-09, + "loss": 0.8867, + "step": 11130 + }, + { + "epoch": 0.9945718944758416, + "grad_norm": 0.5266461372375488, + "learning_rate": 7.538269957668709e-09, + "loss": 0.8601, + "step": 11131 + }, + { + "epoch": 0.9946612460071035, + "grad_norm": 0.5032413601875305, + "learning_rate": 7.2890943108305796e-09, + "loss": 0.9025, + "step": 11132 + }, + { + "epoch": 0.9947505975383654, + "grad_norm": 0.49191898107528687, + "learning_rate": 7.0441060864179235e-09, + "loss": 0.8893, + "step": 11133 + }, + { + "epoch": 0.9948399490696271, + "grad_norm": 0.46208131313323975, + "learning_rate": 6.80330530494766e-09, + "loss": 0.876, + "step": 11134 + }, + { + "epoch": 0.994929300600889, + "grad_norm": 0.5641262531280518, + "learning_rate": 6.566691986592543e-09, + "loss": 0.889, + "step": 11135 + }, + { + "epoch": 0.9950186521321509, + "grad_norm": 0.4556792080402374, + "learning_rate": 6.334266151164503e-09, + "loss": 0.933, + "step": 11136 + }, + { + "epoch": 0.9951080036634128, + "grad_norm": 0.439849317073822, + "learning_rate": 6.106027818136851e-09, + "loss": 0.9767, + "step": 11137 + }, + { + "epoch": 0.9951973551946747, + "grad_norm": 0.5620716214179993, + "learning_rate": 5.881977006622075e-09, + "loss": 0.8875, + "step": 11138 + }, + { + "epoch": 0.9952867067259366, + "grad_norm": 0.4618760943412781, + "learning_rate": 5.662113735394048e-09, + "loss": 0.9241, + "step": 11139 + }, + { + "epoch": 0.9953760582571984, + "grad_norm": 0.4364456236362457, + "learning_rate": 5.4464380228658184e-09, + "loss": 0.8809, + "step": 11140 + }, + { + "epoch": 0.9954654097884602, + "grad_norm": 0.5377599000930786, + "learning_rate": 5.234949887106266e-09, + "loss": 0.9302, + "step": 11141 + }, + { + "epoch": 0.9955547613197221, + "grad_norm": 0.4686380624771118, + "learning_rate": 5.0276493458178976e-09, + "loss": 0.9593, + "step": 11142 + }, + { + "epoch": 0.995644112850984, + "grad_norm": 0.4569147527217865, + "learning_rate": 4.8245364163757025e-09, + "loss": 0.8769, + "step": 11143 + }, + { + "epoch": 0.9957334643822459, + "grad_norm": 0.6316559910774231, + "learning_rate": 4.6256111157882976e-09, + "loss": 0.9268, + "step": 11144 + }, + { + "epoch": 0.9958228159135077, + "grad_norm": 0.6083618402481079, + "learning_rate": 4.430873460720131e-09, + "loss": 0.8524, + "step": 11145 + }, + { + "epoch": 0.9959121674447696, + "grad_norm": 0.42265886068344116, + "learning_rate": 4.2403234674803785e-09, + "loss": 0.9046, + "step": 11146 + }, + { + "epoch": 0.9960015189760314, + "grad_norm": 0.4713272750377655, + "learning_rate": 4.053961152028496e-09, + "loss": 0.9119, + "step": 11147 + }, + { + "epoch": 0.9960908705072933, + "grad_norm": 0.5426056385040283, + "learning_rate": 3.871786529974219e-09, + "loss": 0.901, + "step": 11148 + }, + { + "epoch": 0.9961802220385552, + "grad_norm": 0.46449586749076843, + "learning_rate": 3.6937996165831156e-09, + "loss": 0.9605, + "step": 11149 + }, + { + "epoch": 0.996269573569817, + "grad_norm": 0.4397094249725342, + "learning_rate": 3.5200004267543773e-09, + "loss": 0.9556, + "step": 11150 + }, + { + "epoch": 0.9963589251010789, + "grad_norm": 0.47000452876091003, + "learning_rate": 3.3503889750485796e-09, + "loss": 0.9636, + "step": 11151 + }, + { + "epoch": 0.9964482766323408, + "grad_norm": 0.48919039964675903, + "learning_rate": 3.184965275676577e-09, + "loss": 0.8975, + "step": 11152 + }, + { + "epoch": 0.9965376281636027, + "grad_norm": 0.46157464385032654, + "learning_rate": 3.0237293424939527e-09, + "loss": 0.8855, + "step": 11153 + }, + { + "epoch": 0.9966269796948645, + "grad_norm": 0.4459165036678314, + "learning_rate": 2.8666811890010193e-09, + "loss": 0.9519, + "step": 11154 + }, + { + "epoch": 0.9967163312261264, + "grad_norm": 0.5475977063179016, + "learning_rate": 2.713820828353919e-09, + "loss": 0.8692, + "step": 11155 + }, + { + "epoch": 0.9968056827573882, + "grad_norm": 0.40559154748916626, + "learning_rate": 2.5651482733535237e-09, + "loss": 0.9232, + "step": 11156 + }, + { + "epoch": 0.9968950342886501, + "grad_norm": 0.4425158202648163, + "learning_rate": 2.4206635364620867e-09, + "loss": 0.916, + "step": 11157 + }, + { + "epoch": 0.996984385819912, + "grad_norm": 0.4523138105869293, + "learning_rate": 2.2803666297754876e-09, + "loss": 0.8618, + "step": 11158 + }, + { + "epoch": 0.9970737373511739, + "grad_norm": 0.5861170291900635, + "learning_rate": 2.144257565045438e-09, + "loss": 0.8668, + "step": 11159 + }, + { + "epoch": 0.9971630888824358, + "grad_norm": 0.5498065948486328, + "learning_rate": 2.012336353668376e-09, + "loss": 0.8823, + "step": 11160 + }, + { + "epoch": 0.9972524404136975, + "grad_norm": 0.5954715013504028, + "learning_rate": 1.884603006702124e-09, + "loss": 0.8704, + "step": 11161 + }, + { + "epoch": 0.9973417919449594, + "grad_norm": 0.5423356294631958, + "learning_rate": 1.7610575348436798e-09, + "loss": 1.0066, + "step": 11162 + }, + { + "epoch": 0.9974311434762213, + "grad_norm": 0.51182621717453, + "learning_rate": 1.6416999484347716e-09, + "loss": 0.9438, + "step": 11163 + }, + { + "epoch": 0.9975204950074832, + "grad_norm": 0.5101816654205322, + "learning_rate": 1.5265302574785089e-09, + "loss": 1.0102, + "step": 11164 + }, + { + "epoch": 0.9976098465387451, + "grad_norm": 0.44059595465660095, + "learning_rate": 1.4155484716227296e-09, + "loss": 0.9013, + "step": 11165 + }, + { + "epoch": 0.997699198070007, + "grad_norm": 0.4549104869365692, + "learning_rate": 1.3087546001600004e-09, + "loss": 0.9969, + "step": 11166 + }, + { + "epoch": 0.9977885496012688, + "grad_norm": 0.6448225378990173, + "learning_rate": 1.2061486520387188e-09, + "loss": 0.8289, + "step": 11167 + }, + { + "epoch": 0.9978779011325306, + "grad_norm": 0.5743524432182312, + "learning_rate": 1.1077306358520112e-09, + "loss": 0.8468, + "step": 11168 + }, + { + "epoch": 0.9979672526637925, + "grad_norm": 0.5093139410018921, + "learning_rate": 1.0135005598432835e-09, + "loss": 0.9001, + "step": 11169 + }, + { + "epoch": 0.9980566041950544, + "grad_norm": 0.5424085855484009, + "learning_rate": 9.2345843190067e-10, + "loss": 0.8673, + "step": 11170 + }, + { + "epoch": 0.9981459557263163, + "grad_norm": 0.4926496744155884, + "learning_rate": 8.376042595736877e-10, + "loss": 0.9348, + "step": 11171 + }, + { + "epoch": 0.9982353072575781, + "grad_norm": 0.5678501129150391, + "learning_rate": 7.559380500454794e-10, + "loss": 0.9215, + "step": 11172 + }, + { + "epoch": 0.99832465878884, + "grad_norm": 0.5548151135444641, + "learning_rate": 6.784598101661211e-10, + "loss": 0.9721, + "step": 11173 + }, + { + "epoch": 0.9984140103201019, + "grad_norm": 0.5497487783432007, + "learning_rate": 6.051695464193152e-10, + "loss": 0.8651, + "step": 11174 + }, + { + "epoch": 0.9985033618513637, + "grad_norm": 0.6130139827728271, + "learning_rate": 5.360672649445952e-10, + "loss": 0.8772, + "step": 11175 + }, + { + "epoch": 0.9985927133826256, + "grad_norm": 0.6136727333068848, + "learning_rate": 4.711529715262231e-10, + "loss": 0.7813, + "step": 11176 + }, + { + "epoch": 0.9986820649138874, + "grad_norm": 0.5682715773582458, + "learning_rate": 4.104266716098426e-10, + "loss": 0.9652, + "step": 11177 + }, + { + "epoch": 0.9987714164451493, + "grad_norm": 0.4911440908908844, + "learning_rate": 3.538883702747242e-10, + "loss": 0.9506, + "step": 11178 + }, + { + "epoch": 0.9988607679764112, + "grad_norm": 0.48630785942077637, + "learning_rate": 3.015380722615202e-10, + "loss": 1.0029, + "step": 11179 + }, + { + "epoch": 0.9989501195076731, + "grad_norm": 0.4619069993495941, + "learning_rate": 2.533757819556115e-10, + "loss": 0.9795, + "step": 11180 + }, + { + "epoch": 0.999039471038935, + "grad_norm": 0.47902753949165344, + "learning_rate": 2.0940150338155662e-10, + "loss": 0.9322, + "step": 11181 + }, + { + "epoch": 0.9991288225701968, + "grad_norm": 0.4930683970451355, + "learning_rate": 1.6961524023639819e-10, + "loss": 0.9295, + "step": 11182 + }, + { + "epoch": 0.9992181741014586, + "grad_norm": 0.6377449035644531, + "learning_rate": 1.3401699583970306e-10, + "loss": 0.8337, + "step": 11183 + }, + { + "epoch": 0.9993075256327205, + "grad_norm": 0.47255298495292664, + "learning_rate": 1.026067731835223e-10, + "loss": 0.9339, + "step": 11184 + }, + { + "epoch": 0.9993968771639824, + "grad_norm": 0.5022866129875183, + "learning_rate": 7.538457489353334e-11, + "loss": 0.892, + "step": 11185 + }, + { + "epoch": 0.9994862286952443, + "grad_norm": 0.47073841094970703, + "learning_rate": 5.2350403251244516e-11, + "loss": 0.9153, + "step": 11186 + }, + { + "epoch": 0.9995755802265062, + "grad_norm": 0.5592496395111084, + "learning_rate": 3.3504260182892765e-11, + "loss": 0.9546, + "step": 11187 + }, + { + "epoch": 0.999664931757768, + "grad_norm": 0.41335591673851013, + "learning_rate": 1.8846147276097014e-11, + "loss": 0.9724, + "step": 11188 + }, + { + "epoch": 0.9997542832890298, + "grad_norm": 0.6354795694351196, + "learning_rate": 8.376065746551476e-12, + "loss": 0.8885, + "step": 11189 + }, + { + "epoch": 0.9998436348202917, + "grad_norm": 0.4729311466217041, + "learning_rate": 2.094016482434569e-12, + "loss": 0.897, + "step": 11190 + }, + { + "epoch": 0.9999329863515536, + "grad_norm": 0.5076917409896851, + "learning_rate": 0.0, + "loss": 0.9054, + "step": 11191 + } + ], + "logging_steps": 1.0, + "max_steps": 11191, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.597312254069965e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}