diff --git "a/checkpoint-1940/trainer_state.json" "b/checkpoint-1940/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1940/trainer_state.json" @@ -0,0 +1,13613 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998282082116475, + "eval_steps": 500, + "global_step": 1940, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005153753650575503, + "grad_norm": 7.126006126403809, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.5468, + "step": 1 + }, + { + "epoch": 0.0010307507301151005, + "grad_norm": 6.614906311035156, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.4946, + "step": 2 + }, + { + "epoch": 0.0015461260951726507, + "grad_norm": 6.964903831481934, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.5099, + "step": 3 + }, + { + "epoch": 0.002061501460230201, + "grad_norm": 6.724514484405518, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.4418, + "step": 4 + }, + { + "epoch": 0.0025768768252877514, + "grad_norm": 7.039041519165039, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.4028, + "step": 5 + }, + { + "epoch": 0.0030922521903453013, + "grad_norm": 7.229781627655029, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.4623, + "step": 6 + }, + { + "epoch": 0.0036076275554028517, + "grad_norm": 6.759846210479736, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.466, + "step": 7 + }, + { + "epoch": 0.004123002920460402, + "grad_norm": 6.571556091308594, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.3653, + "step": 8 + }, + { + "epoch": 0.0046383782855179525, + "grad_norm": 6.8509840965271, + "learning_rate": 4.5000000000000003e-07, + "loss": 2.5106, + "step": 9 + }, + { + "epoch": 0.005153753650575503, + "grad_norm": 6.830150604248047, + "learning_rate": 5.000000000000001e-07, + "loss": 2.5043, + "step": 10 + }, + { + "epoch": 0.005669129015633052, + "grad_norm": 6.322958469390869, + "learning_rate": 5.5e-07, + "loss": 2.4498, + "step": 11 + }, + { + "epoch": 0.006184504380690603, + "grad_norm": 6.736442565917969, + "learning_rate": 6.000000000000001e-07, + "loss": 2.5081, + "step": 12 + }, + { + "epoch": 0.006699879745748153, + "grad_norm": 7.156307697296143, + "learning_rate": 6.5e-07, + "loss": 2.4825, + "step": 13 + }, + { + "epoch": 0.0072152551108057034, + "grad_norm": 7.225257396697998, + "learning_rate": 7.000000000000001e-07, + "loss": 2.5138, + "step": 14 + }, + { + "epoch": 0.007730630475863254, + "grad_norm": 7.1376261711120605, + "learning_rate": 7.5e-07, + "loss": 2.5128, + "step": 15 + }, + { + "epoch": 0.008246005840920804, + "grad_norm": 6.821260929107666, + "learning_rate": 8.000000000000001e-07, + "loss": 2.4578, + "step": 16 + }, + { + "epoch": 0.008761381205978355, + "grad_norm": 6.766695976257324, + "learning_rate": 8.500000000000001e-07, + "loss": 2.3446, + "step": 17 + }, + { + "epoch": 0.009276756571035905, + "grad_norm": 6.862898349761963, + "learning_rate": 9.000000000000001e-07, + "loss": 2.3988, + "step": 18 + }, + { + "epoch": 0.009792131936093455, + "grad_norm": 6.1224517822265625, + "learning_rate": 9.500000000000001e-07, + "loss": 2.1776, + "step": 19 + }, + { + "epoch": 0.010307507301151006, + "grad_norm": 6.202511787414551, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.2496, + "step": 20 + }, + { + "epoch": 0.010822882666208556, + "grad_norm": 6.215514659881592, + "learning_rate": 1.0500000000000001e-06, + "loss": 2.2555, + "step": 21 + }, + { + "epoch": 0.011338258031266105, + "grad_norm": 5.5256524085998535, + "learning_rate": 1.1e-06, + "loss": 2.1216, + "step": 22 + }, + { + "epoch": 0.011853633396323655, + "grad_norm": 5.103215217590332, + "learning_rate": 1.1500000000000002e-06, + "loss": 2.1142, + "step": 23 + }, + { + "epoch": 0.012369008761381205, + "grad_norm": 4.585116863250732, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.1433, + "step": 24 + }, + { + "epoch": 0.012884384126438756, + "grad_norm": 4.1703972816467285, + "learning_rate": 1.25e-06, + "loss": 2.1236, + "step": 25 + }, + { + "epoch": 0.013399759491496306, + "grad_norm": 4.232865810394287, + "learning_rate": 1.3e-06, + "loss": 2.1689, + "step": 26 + }, + { + "epoch": 0.013915134856553856, + "grad_norm": 3.503655433654785, + "learning_rate": 1.3500000000000002e-06, + "loss": 2.0412, + "step": 27 + }, + { + "epoch": 0.014430510221611407, + "grad_norm": 3.439694404602051, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.0577, + "step": 28 + }, + { + "epoch": 0.014945885586668957, + "grad_norm": 3.234438419342041, + "learning_rate": 1.45e-06, + "loss": 1.9973, + "step": 29 + }, + { + "epoch": 0.015461260951726508, + "grad_norm": 3.018153667449951, + "learning_rate": 1.5e-06, + "loss": 1.9895, + "step": 30 + }, + { + "epoch": 0.015976636316784056, + "grad_norm": 2.669713258743286, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.9924, + "step": 31 + }, + { + "epoch": 0.01649201168184161, + "grad_norm": 2.5799427032470703, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.9199, + "step": 32 + }, + { + "epoch": 0.017007387046899157, + "grad_norm": 2.6986613273620605, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.9655, + "step": 33 + }, + { + "epoch": 0.01752276241195671, + "grad_norm": 2.537860870361328, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.9494, + "step": 34 + }, + { + "epoch": 0.018038137777014258, + "grad_norm": 2.5696487426757812, + "learning_rate": 1.75e-06, + "loss": 1.8842, + "step": 35 + }, + { + "epoch": 0.01855351314207181, + "grad_norm": 2.4279472827911377, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.8659, + "step": 36 + }, + { + "epoch": 0.01906888850712936, + "grad_norm": 2.5312445163726807, + "learning_rate": 1.85e-06, + "loss": 1.8631, + "step": 37 + }, + { + "epoch": 0.01958426387218691, + "grad_norm": 2.5473766326904297, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.8355, + "step": 38 + }, + { + "epoch": 0.02009963923724446, + "grad_norm": 2.475839138031006, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.8697, + "step": 39 + }, + { + "epoch": 0.02061501460230201, + "grad_norm": 2.154038190841675, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.7651, + "step": 40 + }, + { + "epoch": 0.02113038996735956, + "grad_norm": 2.04150390625, + "learning_rate": 2.05e-06, + "loss": 1.7075, + "step": 41 + }, + { + "epoch": 0.021645765332417112, + "grad_norm": 2.1084539890289307, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.7563, + "step": 42 + }, + { + "epoch": 0.02216114069747466, + "grad_norm": 2.042217493057251, + "learning_rate": 2.15e-06, + "loss": 1.6482, + "step": 43 + }, + { + "epoch": 0.02267651606253221, + "grad_norm": 1.9986730813980103, + "learning_rate": 2.2e-06, + "loss": 1.5513, + "step": 44 + }, + { + "epoch": 0.02319189142758976, + "grad_norm": 2.1617605686187744, + "learning_rate": 2.25e-06, + "loss": 1.6998, + "step": 45 + }, + { + "epoch": 0.02370726679264731, + "grad_norm": 2.08594012260437, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.5833, + "step": 46 + }, + { + "epoch": 0.024222642157704862, + "grad_norm": 2.058086633682251, + "learning_rate": 2.35e-06, + "loss": 1.593, + "step": 47 + }, + { + "epoch": 0.02473801752276241, + "grad_norm": 1.754408597946167, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.5644, + "step": 48 + }, + { + "epoch": 0.025253392887819963, + "grad_norm": 1.5493403673171997, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.5755, + "step": 49 + }, + { + "epoch": 0.02576876825287751, + "grad_norm": 1.4086428880691528, + "learning_rate": 2.5e-06, + "loss": 1.4707, + "step": 50 + }, + { + "epoch": 0.026284143617935064, + "grad_norm": 1.4488685131072998, + "learning_rate": 2.55e-06, + "loss": 1.4902, + "step": 51 + }, + { + "epoch": 0.026799518982992612, + "grad_norm": 1.5926673412322998, + "learning_rate": 2.6e-06, + "loss": 1.4902, + "step": 52 + }, + { + "epoch": 0.027314894348050164, + "grad_norm": 1.7660025358200073, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.5892, + "step": 53 + }, + { + "epoch": 0.027830269713107713, + "grad_norm": 1.5888456106185913, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.4604, + "step": 54 + }, + { + "epoch": 0.028345645078165265, + "grad_norm": 1.3883376121520996, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.4889, + "step": 55 + }, + { + "epoch": 0.028861020443222814, + "grad_norm": 1.291335105895996, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.4483, + "step": 56 + }, + { + "epoch": 0.029376395808280366, + "grad_norm": 1.3788918256759644, + "learning_rate": 2.85e-06, + "loss": 1.433, + "step": 57 + }, + { + "epoch": 0.029891771173337914, + "grad_norm": 1.839239478111267, + "learning_rate": 2.9e-06, + "loss": 1.5052, + "step": 58 + }, + { + "epoch": 0.030407146538395463, + "grad_norm": 1.4417849779129028, + "learning_rate": 2.95e-06, + "loss": 1.3936, + "step": 59 + }, + { + "epoch": 0.030922521903453015, + "grad_norm": 1.3167752027511597, + "learning_rate": 3e-06, + "loss": 1.4515, + "step": 60 + }, + { + "epoch": 0.03143789726851057, + "grad_norm": 1.2653827667236328, + "learning_rate": 3.05e-06, + "loss": 1.4057, + "step": 61 + }, + { + "epoch": 0.03195327263356811, + "grad_norm": 1.1869252920150757, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.4312, + "step": 62 + }, + { + "epoch": 0.032468647998625665, + "grad_norm": 1.275302767753601, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.3919, + "step": 63 + }, + { + "epoch": 0.03298402336368322, + "grad_norm": 1.2022939920425415, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.352, + "step": 64 + }, + { + "epoch": 0.03349939872874077, + "grad_norm": 1.1764057874679565, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.3249, + "step": 65 + }, + { + "epoch": 0.034014774093798314, + "grad_norm": 1.1948983669281006, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.3397, + "step": 66 + }, + { + "epoch": 0.034530149458855866, + "grad_norm": 1.1970183849334717, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.4166, + "step": 67 + }, + { + "epoch": 0.03504552482391342, + "grad_norm": 1.1630629301071167, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.3197, + "step": 68 + }, + { + "epoch": 0.03556090018897097, + "grad_norm": 1.1130441427230835, + "learning_rate": 3.45e-06, + "loss": 1.4192, + "step": 69 + }, + { + "epoch": 0.036076275554028515, + "grad_norm": 1.1333914995193481, + "learning_rate": 3.5e-06, + "loss": 1.402, + "step": 70 + }, + { + "epoch": 0.03659165091908607, + "grad_norm": 0.9886772036552429, + "learning_rate": 3.5500000000000003e-06, + "loss": 1.3807, + "step": 71 + }, + { + "epoch": 0.03710702628414362, + "grad_norm": 1.0266650915145874, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.3801, + "step": 72 + }, + { + "epoch": 0.037622401649201165, + "grad_norm": 1.1841363906860352, + "learning_rate": 3.65e-06, + "loss": 1.3611, + "step": 73 + }, + { + "epoch": 0.03813777701425872, + "grad_norm": 1.2051784992218018, + "learning_rate": 3.7e-06, + "loss": 1.3565, + "step": 74 + }, + { + "epoch": 0.03865315237931627, + "grad_norm": 1.070709228515625, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.3538, + "step": 75 + }, + { + "epoch": 0.03916852774437382, + "grad_norm": 0.9480461478233337, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.3386, + "step": 76 + }, + { + "epoch": 0.039683903109431366, + "grad_norm": 1.0606422424316406, + "learning_rate": 3.85e-06, + "loss": 1.3701, + "step": 77 + }, + { + "epoch": 0.04019927847448892, + "grad_norm": 1.029645323753357, + "learning_rate": 3.900000000000001e-06, + "loss": 1.3649, + "step": 78 + }, + { + "epoch": 0.04071465383954647, + "grad_norm": 1.0215805768966675, + "learning_rate": 3.95e-06, + "loss": 1.3358, + "step": 79 + }, + { + "epoch": 0.04123002920460402, + "grad_norm": 1.0088640451431274, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3529, + "step": 80 + }, + { + "epoch": 0.04174540456966157, + "grad_norm": 0.9737629890441895, + "learning_rate": 4.05e-06, + "loss": 1.3922, + "step": 81 + }, + { + "epoch": 0.04226077993471912, + "grad_norm": 0.9443690776824951, + "learning_rate": 4.1e-06, + "loss": 1.3181, + "step": 82 + }, + { + "epoch": 0.04277615529977667, + "grad_norm": 0.9460474252700806, + "learning_rate": 4.15e-06, + "loss": 1.364, + "step": 83 + }, + { + "epoch": 0.043291530664834224, + "grad_norm": 0.9993011355400085, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.3664, + "step": 84 + }, + { + "epoch": 0.04380690602989177, + "grad_norm": 0.9136483073234558, + "learning_rate": 4.25e-06, + "loss": 1.4122, + "step": 85 + }, + { + "epoch": 0.04432228139494932, + "grad_norm": 1.0764967203140259, + "learning_rate": 4.3e-06, + "loss": 1.3409, + "step": 86 + }, + { + "epoch": 0.04483765676000687, + "grad_norm": 0.9557569622993469, + "learning_rate": 4.350000000000001e-06, + "loss": 1.3843, + "step": 87 + }, + { + "epoch": 0.04535303212506442, + "grad_norm": 0.9669111371040344, + "learning_rate": 4.4e-06, + "loss": 1.3469, + "step": 88 + }, + { + "epoch": 0.04586840749012197, + "grad_norm": 0.9530505537986755, + "learning_rate": 4.450000000000001e-06, + "loss": 1.3295, + "step": 89 + }, + { + "epoch": 0.04638378285517952, + "grad_norm": 0.9201849102973938, + "learning_rate": 4.5e-06, + "loss": 1.3297, + "step": 90 + }, + { + "epoch": 0.046899158220237075, + "grad_norm": 0.9111409187316895, + "learning_rate": 4.5500000000000005e-06, + "loss": 1.3776, + "step": 91 + }, + { + "epoch": 0.04741453358529462, + "grad_norm": 1.0236597061157227, + "learning_rate": 4.600000000000001e-06, + "loss": 1.3248, + "step": 92 + }, + { + "epoch": 0.04792990895035217, + "grad_norm": 1.0763428211212158, + "learning_rate": 4.65e-06, + "loss": 1.3199, + "step": 93 + }, + { + "epoch": 0.048445284315409724, + "grad_norm": 1.1314724683761597, + "learning_rate": 4.7e-06, + "loss": 1.3956, + "step": 94 + }, + { + "epoch": 0.048960659680467276, + "grad_norm": 0.9608203768730164, + "learning_rate": 4.75e-06, + "loss": 1.3685, + "step": 95 + }, + { + "epoch": 0.04947603504552482, + "grad_norm": 0.9246897101402283, + "learning_rate": 4.800000000000001e-06, + "loss": 1.2863, + "step": 96 + }, + { + "epoch": 0.049991410410582374, + "grad_norm": 0.870741605758667, + "learning_rate": 4.85e-06, + "loss": 1.2298, + "step": 97 + }, + { + "epoch": 0.050506785775639926, + "grad_norm": 0.9093057513237, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.2509, + "step": 98 + }, + { + "epoch": 0.05102216114069748, + "grad_norm": 0.9211610555648804, + "learning_rate": 4.95e-06, + "loss": 1.3473, + "step": 99 + }, + { + "epoch": 0.05153753650575502, + "grad_norm": 0.9618102312088013, + "learning_rate": 5e-06, + "loss": 1.32, + "step": 100 + }, + { + "epoch": 0.052052911870812575, + "grad_norm": 0.9188870787620544, + "learning_rate": 4.9999999073600874e-06, + "loss": 1.2792, + "step": 101 + }, + { + "epoch": 0.05256828723587013, + "grad_norm": 0.9936563372612, + "learning_rate": 4.999999629440356e-06, + "loss": 1.285, + "step": 102 + }, + { + "epoch": 0.05308366260092767, + "grad_norm": 0.9464974999427795, + "learning_rate": 4.999999166240825e-06, + "loss": 1.3415, + "step": 103 + }, + { + "epoch": 0.053599037965985225, + "grad_norm": 0.9253501296043396, + "learning_rate": 4.999998517761531e-06, + "loss": 1.2913, + "step": 104 + }, + { + "epoch": 0.05411441333104278, + "grad_norm": 1.006068468093872, + "learning_rate": 4.999997684002521e-06, + "loss": 1.3463, + "step": 105 + }, + { + "epoch": 0.05462978869610033, + "grad_norm": 0.962740957736969, + "learning_rate": 4.999996664963857e-06, + "loss": 1.3093, + "step": 106 + }, + { + "epoch": 0.055145164061157874, + "grad_norm": 0.9607085585594177, + "learning_rate": 4.999995460645613e-06, + "loss": 1.2853, + "step": 107 + }, + { + "epoch": 0.055660539426215426, + "grad_norm": 0.9489694833755493, + "learning_rate": 4.999994071047881e-06, + "loss": 1.2854, + "step": 108 + }, + { + "epoch": 0.05617591479127298, + "grad_norm": 1.1459851264953613, + "learning_rate": 4.999992496170762e-06, + "loss": 1.2925, + "step": 109 + }, + { + "epoch": 0.05669129015633053, + "grad_norm": 0.9199038743972778, + "learning_rate": 4.999990736014374e-06, + "loss": 1.3164, + "step": 110 + }, + { + "epoch": 0.057206665521388075, + "grad_norm": 0.9531081914901733, + "learning_rate": 4.999988790578847e-06, + "loss": 1.3016, + "step": 111 + }, + { + "epoch": 0.05772204088644563, + "grad_norm": 0.9910569190979004, + "learning_rate": 4.999986659864325e-06, + "loss": 1.2889, + "step": 112 + }, + { + "epoch": 0.05823741625150318, + "grad_norm": 0.9495322704315186, + "learning_rate": 4.999984343870964e-06, + "loss": 1.2881, + "step": 113 + }, + { + "epoch": 0.05875279161656073, + "grad_norm": 0.9481725096702576, + "learning_rate": 4.999981842598939e-06, + "loss": 1.309, + "step": 114 + }, + { + "epoch": 0.05926816698161828, + "grad_norm": 0.9376801252365112, + "learning_rate": 4.999979156048433e-06, + "loss": 1.3022, + "step": 115 + }, + { + "epoch": 0.05978354234667583, + "grad_norm": 1.0330339670181274, + "learning_rate": 4.999976284219646e-06, + "loss": 1.3911, + "step": 116 + }, + { + "epoch": 0.06029891771173338, + "grad_norm": 1.1374934911727905, + "learning_rate": 4.999973227112792e-06, + "loss": 1.2454, + "step": 117 + }, + { + "epoch": 0.060814293076790926, + "grad_norm": 0.9146735668182373, + "learning_rate": 4.999969984728096e-06, + "loss": 1.2508, + "step": 118 + }, + { + "epoch": 0.06132966844184848, + "grad_norm": 0.9474186897277832, + "learning_rate": 4.9999665570657985e-06, + "loss": 1.2772, + "step": 119 + }, + { + "epoch": 0.06184504380690603, + "grad_norm": 1.0182085037231445, + "learning_rate": 4.9999629441261535e-06, + "loss": 1.2795, + "step": 120 + }, + { + "epoch": 0.06236041917196358, + "grad_norm": 1.0316343307495117, + "learning_rate": 4.999959145909429e-06, + "loss": 1.301, + "step": 121 + }, + { + "epoch": 0.06287579453702113, + "grad_norm": 0.9423215389251709, + "learning_rate": 4.999955162415906e-06, + "loss": 1.2808, + "step": 122 + }, + { + "epoch": 0.06339116990207869, + "grad_norm": 0.9410876631736755, + "learning_rate": 4.999950993645881e-06, + "loss": 1.2881, + "step": 123 + }, + { + "epoch": 0.06390654526713622, + "grad_norm": 0.96896892786026, + "learning_rate": 4.999946639599662e-06, + "loss": 1.2718, + "step": 124 + }, + { + "epoch": 0.06442192063219378, + "grad_norm": 1.081750512123108, + "learning_rate": 4.999942100277572e-06, + "loss": 1.2793, + "step": 125 + }, + { + "epoch": 0.06493729599725133, + "grad_norm": 0.9917174577713013, + "learning_rate": 4.999937375679946e-06, + "loss": 1.2755, + "step": 126 + }, + { + "epoch": 0.06545267136230888, + "grad_norm": 0.9916597604751587, + "learning_rate": 4.9999324658071365e-06, + "loss": 1.2558, + "step": 127 + }, + { + "epoch": 0.06596804672736643, + "grad_norm": 1.0110090970993042, + "learning_rate": 4.999927370659507e-06, + "loss": 1.3498, + "step": 128 + }, + { + "epoch": 0.06648342209242399, + "grad_norm": 0.9905858039855957, + "learning_rate": 4.999922090237433e-06, + "loss": 1.2436, + "step": 129 + }, + { + "epoch": 0.06699879745748154, + "grad_norm": 1.0680683851242065, + "learning_rate": 4.999916624541308e-06, + "loss": 1.2331, + "step": 130 + }, + { + "epoch": 0.06751417282253908, + "grad_norm": 0.9393003582954407, + "learning_rate": 4.9999109735715356e-06, + "loss": 1.2809, + "step": 131 + }, + { + "epoch": 0.06802954818759663, + "grad_norm": 1.0146749019622803, + "learning_rate": 4.999905137328537e-06, + "loss": 1.3052, + "step": 132 + }, + { + "epoch": 0.06854492355265418, + "grad_norm": 0.8913703560829163, + "learning_rate": 4.9998991158127404e-06, + "loss": 1.2852, + "step": 133 + }, + { + "epoch": 0.06906029891771173, + "grad_norm": 0.8886722326278687, + "learning_rate": 4.999892909024597e-06, + "loss": 1.2871, + "step": 134 + }, + { + "epoch": 0.06957567428276928, + "grad_norm": 0.9775382280349731, + "learning_rate": 4.999886516964564e-06, + "loss": 1.2081, + "step": 135 + }, + { + "epoch": 0.07009104964782684, + "grad_norm": 1.0126523971557617, + "learning_rate": 4.999879939633115e-06, + "loss": 1.2435, + "step": 136 + }, + { + "epoch": 0.07060642501288439, + "grad_norm": 0.9554210305213928, + "learning_rate": 4.999873177030739e-06, + "loss": 1.2399, + "step": 137 + }, + { + "epoch": 0.07112180037794194, + "grad_norm": 0.9512026906013489, + "learning_rate": 4.999866229157935e-06, + "loss": 1.1091, + "step": 138 + }, + { + "epoch": 0.07163717574299948, + "grad_norm": 0.9823938012123108, + "learning_rate": 4.9998590960152195e-06, + "loss": 1.2531, + "step": 139 + }, + { + "epoch": 0.07215255110805703, + "grad_norm": 0.998545229434967, + "learning_rate": 4.999851777603122e-06, + "loss": 1.258, + "step": 140 + }, + { + "epoch": 0.07266792647311458, + "grad_norm": 1.0376691818237305, + "learning_rate": 4.999844273922183e-06, + "loss": 1.2007, + "step": 141 + }, + { + "epoch": 0.07318330183817214, + "grad_norm": 0.9358575344085693, + "learning_rate": 4.999836584972959e-06, + "loss": 1.2966, + "step": 142 + }, + { + "epoch": 0.07369867720322969, + "grad_norm": 1.057644248008728, + "learning_rate": 4.99982871075602e-06, + "loss": 1.2255, + "step": 143 + }, + { + "epoch": 0.07421405256828724, + "grad_norm": 1.049521565437317, + "learning_rate": 4.999820651271951e-06, + "loss": 1.2814, + "step": 144 + }, + { + "epoch": 0.07472942793334479, + "grad_norm": 0.9946297407150269, + "learning_rate": 4.999812406521347e-06, + "loss": 1.259, + "step": 145 + }, + { + "epoch": 0.07524480329840233, + "grad_norm": 0.9518601894378662, + "learning_rate": 4.9998039765048215e-06, + "loss": 1.2347, + "step": 146 + }, + { + "epoch": 0.07576017866345988, + "grad_norm": 0.9874811768531799, + "learning_rate": 4.999795361222996e-06, + "loss": 1.2508, + "step": 147 + }, + { + "epoch": 0.07627555402851743, + "grad_norm": 0.9873331189155579, + "learning_rate": 4.999786560676512e-06, + "loss": 1.2307, + "step": 148 + }, + { + "epoch": 0.07679092939357499, + "grad_norm": 0.9384865760803223, + "learning_rate": 4.99977757486602e-06, + "loss": 1.2487, + "step": 149 + }, + { + "epoch": 0.07730630475863254, + "grad_norm": 1.0104529857635498, + "learning_rate": 4.999768403792188e-06, + "loss": 1.2755, + "step": 150 + }, + { + "epoch": 0.07782168012369009, + "grad_norm": 0.9755220413208008, + "learning_rate": 4.999759047455692e-06, + "loss": 1.257, + "step": 151 + }, + { + "epoch": 0.07833705548874764, + "grad_norm": 0.9542570114135742, + "learning_rate": 4.99974950585723e-06, + "loss": 1.2804, + "step": 152 + }, + { + "epoch": 0.0788524308538052, + "grad_norm": 1.006955862045288, + "learning_rate": 4.999739778997505e-06, + "loss": 1.3559, + "step": 153 + }, + { + "epoch": 0.07936780621886273, + "grad_norm": 0.9349725842475891, + "learning_rate": 4.99972986687724e-06, + "loss": 1.2523, + "step": 154 + }, + { + "epoch": 0.07988318158392028, + "grad_norm": 1.0557712316513062, + "learning_rate": 4.99971976949717e-06, + "loss": 1.2769, + "step": 155 + }, + { + "epoch": 0.08039855694897784, + "grad_norm": 0.9416980743408203, + "learning_rate": 4.999709486858042e-06, + "loss": 1.2152, + "step": 156 + }, + { + "epoch": 0.08091393231403539, + "grad_norm": 0.9875814318656921, + "learning_rate": 4.999699018960619e-06, + "loss": 1.2225, + "step": 157 + }, + { + "epoch": 0.08142930767909294, + "grad_norm": 1.0240778923034668, + "learning_rate": 4.999688365805676e-06, + "loss": 1.224, + "step": 158 + }, + { + "epoch": 0.0819446830441505, + "grad_norm": 1.0420036315917969, + "learning_rate": 4.999677527394003e-06, + "loss": 1.2509, + "step": 159 + }, + { + "epoch": 0.08246005840920805, + "grad_norm": 1.004077434539795, + "learning_rate": 4.999666503726404e-06, + "loss": 1.2131, + "step": 160 + }, + { + "epoch": 0.08297543377426558, + "grad_norm": 1.1523828506469727, + "learning_rate": 4.999655294803695e-06, + "loss": 1.2318, + "step": 161 + }, + { + "epoch": 0.08349080913932314, + "grad_norm": 0.9839702248573303, + "learning_rate": 4.999643900626707e-06, + "loss": 1.2845, + "step": 162 + }, + { + "epoch": 0.08400618450438069, + "grad_norm": 0.9726237058639526, + "learning_rate": 4.999632321196284e-06, + "loss": 1.2949, + "step": 163 + }, + { + "epoch": 0.08452155986943824, + "grad_norm": 0.994744598865509, + "learning_rate": 4.999620556513285e-06, + "loss": 1.2553, + "step": 164 + }, + { + "epoch": 0.08503693523449579, + "grad_norm": 1.0818318128585815, + "learning_rate": 4.999608606578581e-06, + "loss": 1.2687, + "step": 165 + }, + { + "epoch": 0.08555231059955334, + "grad_norm": 1.1112850904464722, + "learning_rate": 4.999596471393059e-06, + "loss": 1.3266, + "step": 166 + }, + { + "epoch": 0.0860676859646109, + "grad_norm": 0.9371221661567688, + "learning_rate": 4.999584150957616e-06, + "loss": 1.2056, + "step": 167 + }, + { + "epoch": 0.08658306132966845, + "grad_norm": 0.9769211411476135, + "learning_rate": 4.999571645273168e-06, + "loss": 1.2046, + "step": 168 + }, + { + "epoch": 0.08709843669472599, + "grad_norm": 1.0210055112838745, + "learning_rate": 4.99955895434064e-06, + "loss": 1.318, + "step": 169 + }, + { + "epoch": 0.08761381205978354, + "grad_norm": 1.0373449325561523, + "learning_rate": 4.9995460781609725e-06, + "loss": 1.2416, + "step": 170 + }, + { + "epoch": 0.08812918742484109, + "grad_norm": 0.9221280813217163, + "learning_rate": 4.999533016735121e-06, + "loss": 1.1639, + "step": 171 + }, + { + "epoch": 0.08864456278989864, + "grad_norm": 0.9534924626350403, + "learning_rate": 4.9995197700640515e-06, + "loss": 1.2645, + "step": 172 + }, + { + "epoch": 0.0891599381549562, + "grad_norm": 1.0295299291610718, + "learning_rate": 4.999506338148748e-06, + "loss": 1.2781, + "step": 173 + }, + { + "epoch": 0.08967531352001375, + "grad_norm": 0.9931419491767883, + "learning_rate": 4.999492720990204e-06, + "loss": 1.2215, + "step": 174 + }, + { + "epoch": 0.0901906888850713, + "grad_norm": 0.9851301908493042, + "learning_rate": 4.999478918589431e-06, + "loss": 1.2605, + "step": 175 + }, + { + "epoch": 0.09070606425012884, + "grad_norm": 0.9377535581588745, + "learning_rate": 4.999464930947449e-06, + "loss": 1.1951, + "step": 176 + }, + { + "epoch": 0.09122143961518639, + "grad_norm": 0.968189001083374, + "learning_rate": 4.999450758065297e-06, + "loss": 1.2141, + "step": 177 + }, + { + "epoch": 0.09173681498024394, + "grad_norm": 1.0255149602890015, + "learning_rate": 4.9994363999440245e-06, + "loss": 1.2684, + "step": 178 + }, + { + "epoch": 0.0922521903453015, + "grad_norm": 1.0423952341079712, + "learning_rate": 4.999421856584696e-06, + "loss": 1.2221, + "step": 179 + }, + { + "epoch": 0.09276756571035905, + "grad_norm": 1.1341553926467896, + "learning_rate": 4.999407127988388e-06, + "loss": 1.2467, + "step": 180 + }, + { + "epoch": 0.0932829410754166, + "grad_norm": 0.9786616563796997, + "learning_rate": 4.9993922141561944e-06, + "loss": 1.177, + "step": 181 + }, + { + "epoch": 0.09379831644047415, + "grad_norm": 1.032413363456726, + "learning_rate": 4.999377115089219e-06, + "loss": 1.2769, + "step": 182 + }, + { + "epoch": 0.0943136918055317, + "grad_norm": 1.0089246034622192, + "learning_rate": 4.999361830788581e-06, + "loss": 1.2701, + "step": 183 + }, + { + "epoch": 0.09482906717058924, + "grad_norm": 1.0086696147918701, + "learning_rate": 4.999346361255412e-06, + "loss": 1.1965, + "step": 184 + }, + { + "epoch": 0.09534444253564679, + "grad_norm": 1.0027146339416504, + "learning_rate": 4.999330706490861e-06, + "loss": 1.2585, + "step": 185 + }, + { + "epoch": 0.09585981790070434, + "grad_norm": 1.0136017799377441, + "learning_rate": 4.999314866496085e-06, + "loss": 1.2137, + "step": 186 + }, + { + "epoch": 0.0963751932657619, + "grad_norm": 1.0555775165557861, + "learning_rate": 4.999298841272262e-06, + "loss": 1.2377, + "step": 187 + }, + { + "epoch": 0.09689056863081945, + "grad_norm": 1.1131389141082764, + "learning_rate": 4.999282630820576e-06, + "loss": 1.2059, + "step": 188 + }, + { + "epoch": 0.097405943995877, + "grad_norm": 1.0880460739135742, + "learning_rate": 4.9992662351422305e-06, + "loss": 1.2721, + "step": 189 + }, + { + "epoch": 0.09792131936093455, + "grad_norm": 1.0187134742736816, + "learning_rate": 4.999249654238439e-06, + "loss": 1.3137, + "step": 190 + }, + { + "epoch": 0.09843669472599209, + "grad_norm": 1.0277626514434814, + "learning_rate": 4.999232888110432e-06, + "loss": 1.2624, + "step": 191 + }, + { + "epoch": 0.09895207009104964, + "grad_norm": 1.046287178993225, + "learning_rate": 4.999215936759452e-06, + "loss": 1.1828, + "step": 192 + }, + { + "epoch": 0.0994674454561072, + "grad_norm": 1.0114896297454834, + "learning_rate": 4.999198800186753e-06, + "loss": 1.2784, + "step": 193 + }, + { + "epoch": 0.09998282082116475, + "grad_norm": 1.0122891664505005, + "learning_rate": 4.999181478393608e-06, + "loss": 1.2461, + "step": 194 + }, + { + "epoch": 0.1004981961862223, + "grad_norm": 0.9655713438987732, + "learning_rate": 4.999163971381299e-06, + "loss": 1.1965, + "step": 195 + }, + { + "epoch": 0.10101357155127985, + "grad_norm": 1.0382893085479736, + "learning_rate": 4.999146279151123e-06, + "loss": 1.1827, + "step": 196 + }, + { + "epoch": 0.1015289469163374, + "grad_norm": 1.0155049562454224, + "learning_rate": 4.999128401704394e-06, + "loss": 1.2268, + "step": 197 + }, + { + "epoch": 0.10204432228139496, + "grad_norm": 1.0959563255310059, + "learning_rate": 4.9991103390424334e-06, + "loss": 1.2438, + "step": 198 + }, + { + "epoch": 0.1025596976464525, + "grad_norm": 1.1797666549682617, + "learning_rate": 4.999092091166582e-06, + "loss": 1.2539, + "step": 199 + }, + { + "epoch": 0.10307507301151005, + "grad_norm": 1.0557608604431152, + "learning_rate": 4.999073658078191e-06, + "loss": 1.1386, + "step": 200 + }, + { + "epoch": 0.1035904483765676, + "grad_norm": 0.9590182304382324, + "learning_rate": 4.9990550397786285e-06, + "loss": 1.1971, + "step": 201 + }, + { + "epoch": 0.10410582374162515, + "grad_norm": 1.0338568687438965, + "learning_rate": 4.999036236269272e-06, + "loss": 1.2171, + "step": 202 + }, + { + "epoch": 0.1046211991066827, + "grad_norm": 0.966436505317688, + "learning_rate": 4.999017247551516e-06, + "loss": 1.2225, + "step": 203 + }, + { + "epoch": 0.10513657447174025, + "grad_norm": 0.985903263092041, + "learning_rate": 4.998998073626769e-06, + "loss": 1.2416, + "step": 204 + }, + { + "epoch": 0.1056519498367978, + "grad_norm": 1.057100772857666, + "learning_rate": 4.99897871449645e-06, + "loss": 1.2662, + "step": 205 + }, + { + "epoch": 0.10616732520185534, + "grad_norm": 0.9916481971740723, + "learning_rate": 4.998959170161996e-06, + "loss": 1.1879, + "step": 206 + }, + { + "epoch": 0.1066827005669129, + "grad_norm": 1.0549089908599854, + "learning_rate": 4.9989394406248524e-06, + "loss": 1.2343, + "step": 207 + }, + { + "epoch": 0.10719807593197045, + "grad_norm": 1.0757921934127808, + "learning_rate": 4.998919525886483e-06, + "loss": 1.2122, + "step": 208 + }, + { + "epoch": 0.107713451297028, + "grad_norm": 1.0240768194198608, + "learning_rate": 4.998899425948365e-06, + "loss": 1.2599, + "step": 209 + }, + { + "epoch": 0.10822882666208555, + "grad_norm": 0.9299185872077942, + "learning_rate": 4.998879140811986e-06, + "loss": 1.2077, + "step": 210 + }, + { + "epoch": 0.1087442020271431, + "grad_norm": 0.9661532640457153, + "learning_rate": 4.998858670478851e-06, + "loss": 1.1937, + "step": 211 + }, + { + "epoch": 0.10925957739220066, + "grad_norm": 1.0395828485488892, + "learning_rate": 4.9988380149504755e-06, + "loss": 1.2129, + "step": 212 + }, + { + "epoch": 0.10977495275725821, + "grad_norm": 1.0316669940948486, + "learning_rate": 4.9988171742283916e-06, + "loss": 1.2469, + "step": 213 + }, + { + "epoch": 0.11029032812231575, + "grad_norm": 1.0263491868972778, + "learning_rate": 4.998796148314144e-06, + "loss": 1.2414, + "step": 214 + }, + { + "epoch": 0.1108057034873733, + "grad_norm": 0.9928942918777466, + "learning_rate": 4.998774937209289e-06, + "loss": 1.2504, + "step": 215 + }, + { + "epoch": 0.11132107885243085, + "grad_norm": 1.1089606285095215, + "learning_rate": 4.998753540915401e-06, + "loss": 1.2508, + "step": 216 + }, + { + "epoch": 0.1118364542174884, + "grad_norm": 1.0831267833709717, + "learning_rate": 4.998731959434064e-06, + "loss": 1.275, + "step": 217 + }, + { + "epoch": 0.11235182958254596, + "grad_norm": 1.089594841003418, + "learning_rate": 4.998710192766879e-06, + "loss": 1.2942, + "step": 218 + }, + { + "epoch": 0.11286720494760351, + "grad_norm": 1.04873788356781, + "learning_rate": 4.998688240915457e-06, + "loss": 1.2425, + "step": 219 + }, + { + "epoch": 0.11338258031266106, + "grad_norm": 1.0630252361297607, + "learning_rate": 4.998666103881427e-06, + "loss": 1.2628, + "step": 220 + }, + { + "epoch": 0.1138979556777186, + "grad_norm": 1.0307626724243164, + "learning_rate": 4.998643781666429e-06, + "loss": 1.2594, + "step": 221 + }, + { + "epoch": 0.11441333104277615, + "grad_norm": 0.9258455634117126, + "learning_rate": 4.998621274272117e-06, + "loss": 1.1952, + "step": 222 + }, + { + "epoch": 0.1149287064078337, + "grad_norm": 0.9914063215255737, + "learning_rate": 4.998598581700159e-06, + "loss": 1.2426, + "step": 223 + }, + { + "epoch": 0.11544408177289125, + "grad_norm": 1.0159790515899658, + "learning_rate": 4.998575703952237e-06, + "loss": 1.232, + "step": 224 + }, + { + "epoch": 0.11595945713794881, + "grad_norm": 1.0941535234451294, + "learning_rate": 4.998552641030046e-06, + "loss": 1.2329, + "step": 225 + }, + { + "epoch": 0.11647483250300636, + "grad_norm": 1.0479755401611328, + "learning_rate": 4.9985293929352976e-06, + "loss": 1.2104, + "step": 226 + }, + { + "epoch": 0.11699020786806391, + "grad_norm": 1.0361521244049072, + "learning_rate": 4.9985059596697115e-06, + "loss": 1.256, + "step": 227 + }, + { + "epoch": 0.11750558323312146, + "grad_norm": 1.0784052610397339, + "learning_rate": 4.998482341235026e-06, + "loss": 1.2458, + "step": 228 + }, + { + "epoch": 0.118020958598179, + "grad_norm": 0.9795831441879272, + "learning_rate": 4.9984585376329915e-06, + "loss": 1.2419, + "step": 229 + }, + { + "epoch": 0.11853633396323655, + "grad_norm": 1.059186577796936, + "learning_rate": 4.998434548865372e-06, + "loss": 1.1864, + "step": 230 + }, + { + "epoch": 0.1190517093282941, + "grad_norm": 1.0938690900802612, + "learning_rate": 4.998410374933945e-06, + "loss": 1.2137, + "step": 231 + }, + { + "epoch": 0.11956708469335166, + "grad_norm": 1.0159286260604858, + "learning_rate": 4.998386015840502e-06, + "loss": 1.2016, + "step": 232 + }, + { + "epoch": 0.12008246005840921, + "grad_norm": 1.1006368398666382, + "learning_rate": 4.99836147158685e-06, + "loss": 1.1593, + "step": 233 + }, + { + "epoch": 0.12059783542346676, + "grad_norm": 0.9765269756317139, + "learning_rate": 4.998336742174805e-06, + "loss": 1.186, + "step": 234 + }, + { + "epoch": 0.12111321078852431, + "grad_norm": 1.060289740562439, + "learning_rate": 4.998311827606203e-06, + "loss": 1.208, + "step": 235 + }, + { + "epoch": 0.12162858615358185, + "grad_norm": 1.0146334171295166, + "learning_rate": 4.998286727882888e-06, + "loss": 1.2077, + "step": 236 + }, + { + "epoch": 0.1221439615186394, + "grad_norm": 1.0203336477279663, + "learning_rate": 4.998261443006721e-06, + "loss": 1.1844, + "step": 237 + }, + { + "epoch": 0.12265933688369696, + "grad_norm": 1.0171735286712646, + "learning_rate": 4.998235972979576e-06, + "loss": 1.2218, + "step": 238 + }, + { + "epoch": 0.12317471224875451, + "grad_norm": 1.0859501361846924, + "learning_rate": 4.998210317803341e-06, + "loss": 1.1748, + "step": 239 + }, + { + "epoch": 0.12369008761381206, + "grad_norm": 1.0058231353759766, + "learning_rate": 4.998184477479918e-06, + "loss": 1.1903, + "step": 240 + }, + { + "epoch": 0.12420546297886961, + "grad_norm": 1.0329095125198364, + "learning_rate": 4.998158452011219e-06, + "loss": 1.1815, + "step": 241 + }, + { + "epoch": 0.12472083834392717, + "grad_norm": 1.0615721940994263, + "learning_rate": 4.998132241399176e-06, + "loss": 1.2004, + "step": 242 + }, + { + "epoch": 0.12523621370898472, + "grad_norm": 1.030116081237793, + "learning_rate": 4.998105845645731e-06, + "loss": 1.1981, + "step": 243 + }, + { + "epoch": 0.12575158907404227, + "grad_norm": 1.0303820371627808, + "learning_rate": 4.9980792647528384e-06, + "loss": 1.2333, + "step": 244 + }, + { + "epoch": 0.12626696443909982, + "grad_norm": 1.1086900234222412, + "learning_rate": 4.99805249872247e-06, + "loss": 1.1616, + "step": 245 + }, + { + "epoch": 0.12678233980415737, + "grad_norm": 1.0880281925201416, + "learning_rate": 4.998025547556609e-06, + "loss": 1.235, + "step": 246 + }, + { + "epoch": 0.1272977151692149, + "grad_norm": 1.0249994993209839, + "learning_rate": 4.997998411257252e-06, + "loss": 1.2356, + "step": 247 + }, + { + "epoch": 0.12781309053427245, + "grad_norm": 1.1765490770339966, + "learning_rate": 4.997971089826412e-06, + "loss": 1.2925, + "step": 248 + }, + { + "epoch": 0.12832846589933, + "grad_norm": 1.064995527267456, + "learning_rate": 4.997943583266112e-06, + "loss": 1.2629, + "step": 249 + }, + { + "epoch": 0.12884384126438755, + "grad_norm": 1.0707592964172363, + "learning_rate": 4.997915891578391e-06, + "loss": 1.2272, + "step": 250 + }, + { + "epoch": 0.1293592166294451, + "grad_norm": 1.0647047758102417, + "learning_rate": 4.997888014765301e-06, + "loss": 1.1205, + "step": 251 + }, + { + "epoch": 0.12987459199450266, + "grad_norm": 1.0850799083709717, + "learning_rate": 4.9978599528289094e-06, + "loss": 1.1708, + "step": 252 + }, + { + "epoch": 0.1303899673595602, + "grad_norm": 1.105841040611267, + "learning_rate": 4.997831705771295e-06, + "loss": 1.2088, + "step": 253 + }, + { + "epoch": 0.13090534272461776, + "grad_norm": 1.0339291095733643, + "learning_rate": 4.997803273594551e-06, + "loss": 1.1912, + "step": 254 + }, + { + "epoch": 0.13142071808967531, + "grad_norm": 0.9818682074546814, + "learning_rate": 4.997774656300784e-06, + "loss": 1.1551, + "step": 255 + }, + { + "epoch": 0.13193609345473287, + "grad_norm": 1.044981598854065, + "learning_rate": 4.997745853892118e-06, + "loss": 1.1997, + "step": 256 + }, + { + "epoch": 0.13245146881979042, + "grad_norm": 1.0077413320541382, + "learning_rate": 4.997716866370683e-06, + "loss": 1.2071, + "step": 257 + }, + { + "epoch": 0.13296684418484797, + "grad_norm": 1.0919009447097778, + "learning_rate": 4.99768769373863e-06, + "loss": 1.1637, + "step": 258 + }, + { + "epoch": 0.13348221954990552, + "grad_norm": 1.122795581817627, + "learning_rate": 4.997658335998121e-06, + "loss": 1.2381, + "step": 259 + }, + { + "epoch": 0.13399759491496308, + "grad_norm": 1.10574471950531, + "learning_rate": 4.9976287931513326e-06, + "loss": 1.2633, + "step": 260 + }, + { + "epoch": 0.13451297028002063, + "grad_norm": 1.1048368215560913, + "learning_rate": 4.997599065200452e-06, + "loss": 1.1864, + "step": 261 + }, + { + "epoch": 0.13502834564507815, + "grad_norm": 0.9917008876800537, + "learning_rate": 4.997569152147683e-06, + "loss": 1.2127, + "step": 262 + }, + { + "epoch": 0.1355437210101357, + "grad_norm": 1.0532373189926147, + "learning_rate": 4.997539053995244e-06, + "loss": 1.2023, + "step": 263 + }, + { + "epoch": 0.13605909637519326, + "grad_norm": 1.0858818292617798, + "learning_rate": 4.997508770745364e-06, + "loss": 1.2196, + "step": 264 + }, + { + "epoch": 0.1365744717402508, + "grad_norm": 1.035705327987671, + "learning_rate": 4.9974783024002886e-06, + "loss": 1.2244, + "step": 265 + }, + { + "epoch": 0.13708984710530836, + "grad_norm": 1.0526639223098755, + "learning_rate": 4.997447648962275e-06, + "loss": 1.1519, + "step": 266 + }, + { + "epoch": 0.1376052224703659, + "grad_norm": 0.9649120569229126, + "learning_rate": 4.997416810433595e-06, + "loss": 1.1471, + "step": 267 + }, + { + "epoch": 0.13812059783542346, + "grad_norm": 1.0110032558441162, + "learning_rate": 4.997385786816534e-06, + "loss": 1.1555, + "step": 268 + }, + { + "epoch": 0.13863597320048102, + "grad_norm": 1.08561110496521, + "learning_rate": 4.997354578113392e-06, + "loss": 1.2526, + "step": 269 + }, + { + "epoch": 0.13915134856553857, + "grad_norm": 1.045015811920166, + "learning_rate": 4.997323184326483e-06, + "loss": 1.21, + "step": 270 + }, + { + "epoch": 0.13966672393059612, + "grad_norm": 1.0490946769714355, + "learning_rate": 4.99729160545813e-06, + "loss": 1.1412, + "step": 271 + }, + { + "epoch": 0.14018209929565367, + "grad_norm": 1.0669465065002441, + "learning_rate": 4.997259841510677e-06, + "loss": 1.185, + "step": 272 + }, + { + "epoch": 0.14069747466071122, + "grad_norm": 1.0484760999679565, + "learning_rate": 4.997227892486476e-06, + "loss": 1.2348, + "step": 273 + }, + { + "epoch": 0.14121285002576878, + "grad_norm": 1.1375056505203247, + "learning_rate": 4.997195758387895e-06, + "loss": 1.1896, + "step": 274 + }, + { + "epoch": 0.14172822539082633, + "grad_norm": 1.055327296257019, + "learning_rate": 4.997163439217316e-06, + "loss": 1.1287, + "step": 275 + }, + { + "epoch": 0.14224360075588388, + "grad_norm": 1.1165558099746704, + "learning_rate": 4.997130934977134e-06, + "loss": 1.2059, + "step": 276 + }, + { + "epoch": 0.1427589761209414, + "grad_norm": 1.208341121673584, + "learning_rate": 4.997098245669759e-06, + "loss": 1.2388, + "step": 277 + }, + { + "epoch": 0.14327435148599896, + "grad_norm": 1.0463298559188843, + "learning_rate": 4.997065371297613e-06, + "loss": 1.1952, + "step": 278 + }, + { + "epoch": 0.1437897268510565, + "grad_norm": 1.0449258089065552, + "learning_rate": 4.997032311863131e-06, + "loss": 1.172, + "step": 279 + }, + { + "epoch": 0.14430510221611406, + "grad_norm": 1.1663317680358887, + "learning_rate": 4.996999067368766e-06, + "loss": 1.1271, + "step": 280 + }, + { + "epoch": 0.1448204775811716, + "grad_norm": 1.086437702178955, + "learning_rate": 4.996965637816979e-06, + "loss": 1.1885, + "step": 281 + }, + { + "epoch": 0.14533585294622917, + "grad_norm": 1.1258007287979126, + "learning_rate": 4.996932023210249e-06, + "loss": 1.2649, + "step": 282 + }, + { + "epoch": 0.14585122831128672, + "grad_norm": 1.0517827272415161, + "learning_rate": 4.9968982235510665e-06, + "loss": 1.1854, + "step": 283 + }, + { + "epoch": 0.14636660367634427, + "grad_norm": 1.1258128881454468, + "learning_rate": 4.996864238841936e-06, + "loss": 1.196, + "step": 284 + }, + { + "epoch": 0.14688197904140182, + "grad_norm": 1.2804971933364868, + "learning_rate": 4.996830069085379e-06, + "loss": 1.2104, + "step": 285 + }, + { + "epoch": 0.14739735440645937, + "grad_norm": 1.0372135639190674, + "learning_rate": 4.996795714283924e-06, + "loss": 1.2311, + "step": 286 + }, + { + "epoch": 0.14791272977151693, + "grad_norm": 1.1211464405059814, + "learning_rate": 4.99676117444012e-06, + "loss": 1.1966, + "step": 287 + }, + { + "epoch": 0.14842810513657448, + "grad_norm": 1.0660741329193115, + "learning_rate": 4.996726449556526e-06, + "loss": 1.2235, + "step": 288 + }, + { + "epoch": 0.14894348050163203, + "grad_norm": 1.0334893465042114, + "learning_rate": 4.996691539635715e-06, + "loss": 1.222, + "step": 289 + }, + { + "epoch": 0.14945885586668958, + "grad_norm": 1.1706844568252563, + "learning_rate": 4.996656444680275e-06, + "loss": 1.2978, + "step": 290 + }, + { + "epoch": 0.14997423123174713, + "grad_norm": 1.1230109930038452, + "learning_rate": 4.996621164692805e-06, + "loss": 1.1557, + "step": 291 + }, + { + "epoch": 0.15048960659680466, + "grad_norm": 1.1593081951141357, + "learning_rate": 4.9965856996759234e-06, + "loss": 1.214, + "step": 292 + }, + { + "epoch": 0.1510049819618622, + "grad_norm": 1.097525954246521, + "learning_rate": 4.996550049632255e-06, + "loss": 1.0969, + "step": 293 + }, + { + "epoch": 0.15152035732691976, + "grad_norm": 1.0412386655807495, + "learning_rate": 4.996514214564443e-06, + "loss": 1.2315, + "step": 294 + }, + { + "epoch": 0.15203573269197732, + "grad_norm": 1.103298306465149, + "learning_rate": 4.996478194475145e-06, + "loss": 1.1963, + "step": 295 + }, + { + "epoch": 0.15255110805703487, + "grad_norm": 1.147619366645813, + "learning_rate": 4.996441989367028e-06, + "loss": 1.2171, + "step": 296 + }, + { + "epoch": 0.15306648342209242, + "grad_norm": 1.0953634977340698, + "learning_rate": 4.9964055992427755e-06, + "loss": 1.1773, + "step": 297 + }, + { + "epoch": 0.15358185878714997, + "grad_norm": 1.1153252124786377, + "learning_rate": 4.996369024105086e-06, + "loss": 1.257, + "step": 298 + }, + { + "epoch": 0.15409723415220752, + "grad_norm": 1.1131457090377808, + "learning_rate": 4.996332263956669e-06, + "loss": 1.2331, + "step": 299 + }, + { + "epoch": 0.15461260951726508, + "grad_norm": 1.1843032836914062, + "learning_rate": 4.99629531880025e-06, + "loss": 1.2694, + "step": 300 + }, + { + "epoch": 0.15512798488232263, + "grad_norm": 1.1492340564727783, + "learning_rate": 4.996258188638565e-06, + "loss": 1.247, + "step": 301 + }, + { + "epoch": 0.15564336024738018, + "grad_norm": 1.1046638488769531, + "learning_rate": 4.9962208734743684e-06, + "loss": 1.1701, + "step": 302 + }, + { + "epoch": 0.15615873561243773, + "grad_norm": 1.108048677444458, + "learning_rate": 4.9961833733104234e-06, + "loss": 1.1667, + "step": 303 + }, + { + "epoch": 0.15667411097749528, + "grad_norm": 1.096451759338379, + "learning_rate": 4.996145688149511e-06, + "loss": 1.221, + "step": 304 + }, + { + "epoch": 0.15718948634255284, + "grad_norm": 1.0377289056777954, + "learning_rate": 4.996107817994422e-06, + "loss": 1.1303, + "step": 305 + }, + { + "epoch": 0.1577048617076104, + "grad_norm": 1.0976608991622925, + "learning_rate": 4.996069762847966e-06, + "loss": 1.2131, + "step": 306 + }, + { + "epoch": 0.1582202370726679, + "grad_norm": 1.0804593563079834, + "learning_rate": 4.996031522712961e-06, + "loss": 1.1413, + "step": 307 + }, + { + "epoch": 0.15873561243772547, + "grad_norm": 1.0988240242004395, + "learning_rate": 4.995993097592241e-06, + "loss": 1.1787, + "step": 308 + }, + { + "epoch": 0.15925098780278302, + "grad_norm": 1.1158981323242188, + "learning_rate": 4.995954487488655e-06, + "loss": 1.1956, + "step": 309 + }, + { + "epoch": 0.15976636316784057, + "grad_norm": 1.06276535987854, + "learning_rate": 4.9959156924050635e-06, + "loss": 1.1488, + "step": 310 + }, + { + "epoch": 0.16028173853289812, + "grad_norm": 1.118338942527771, + "learning_rate": 4.995876712344343e-06, + "loss": 1.2046, + "step": 311 + }, + { + "epoch": 0.16079711389795567, + "grad_norm": 1.1164604425430298, + "learning_rate": 4.995837547309381e-06, + "loss": 1.1962, + "step": 312 + }, + { + "epoch": 0.16131248926301323, + "grad_norm": 1.0719722509384155, + "learning_rate": 4.99579819730308e-06, + "loss": 1.1474, + "step": 313 + }, + { + "epoch": 0.16182786462807078, + "grad_norm": 1.0934160947799683, + "learning_rate": 4.995758662328358e-06, + "loss": 1.2062, + "step": 314 + }, + { + "epoch": 0.16234323999312833, + "grad_norm": 1.1083345413208008, + "learning_rate": 4.995718942388143e-06, + "loss": 1.181, + "step": 315 + }, + { + "epoch": 0.16285861535818588, + "grad_norm": 1.188507080078125, + "learning_rate": 4.99567903748538e-06, + "loss": 1.2613, + "step": 316 + }, + { + "epoch": 0.16337399072324343, + "grad_norm": 1.0850340127944946, + "learning_rate": 4.995638947623026e-06, + "loss": 1.1736, + "step": 317 + }, + { + "epoch": 0.163889366088301, + "grad_norm": 1.1532150506973267, + "learning_rate": 4.995598672804053e-06, + "loss": 1.2298, + "step": 318 + }, + { + "epoch": 0.16440474145335854, + "grad_norm": 1.1475430727005005, + "learning_rate": 4.9955582130314436e-06, + "loss": 1.2495, + "step": 319 + }, + { + "epoch": 0.1649201168184161, + "grad_norm": 1.221464991569519, + "learning_rate": 4.995517568308199e-06, + "loss": 1.3148, + "step": 320 + }, + { + "epoch": 0.16543549218347364, + "grad_norm": 1.0542255640029907, + "learning_rate": 4.99547673863733e-06, + "loss": 1.1144, + "step": 321 + }, + { + "epoch": 0.16595086754853117, + "grad_norm": 1.0566575527191162, + "learning_rate": 4.995435724021862e-06, + "loss": 1.1591, + "step": 322 + }, + { + "epoch": 0.16646624291358872, + "grad_norm": 1.0220612287521362, + "learning_rate": 4.995394524464836e-06, + "loss": 1.1624, + "step": 323 + }, + { + "epoch": 0.16698161827864627, + "grad_norm": 1.1840770244598389, + "learning_rate": 4.9953531399693056e-06, + "loss": 1.189, + "step": 324 + }, + { + "epoch": 0.16749699364370382, + "grad_norm": 1.1456857919692993, + "learning_rate": 4.995311570538336e-06, + "loss": 1.2096, + "step": 325 + }, + { + "epoch": 0.16801236900876138, + "grad_norm": 1.1925227642059326, + "learning_rate": 4.99526981617501e-06, + "loss": 1.2296, + "step": 326 + }, + { + "epoch": 0.16852774437381893, + "grad_norm": 1.1686952114105225, + "learning_rate": 4.995227876882421e-06, + "loss": 1.2267, + "step": 327 + }, + { + "epoch": 0.16904311973887648, + "grad_norm": 1.1459681987762451, + "learning_rate": 4.9951857526636775e-06, + "loss": 1.1841, + "step": 328 + }, + { + "epoch": 0.16955849510393403, + "grad_norm": 1.1983642578125, + "learning_rate": 4.995143443521902e-06, + "loss": 1.1892, + "step": 329 + }, + { + "epoch": 0.17007387046899158, + "grad_norm": 1.1032333374023438, + "learning_rate": 4.9951009494602275e-06, + "loss": 1.1855, + "step": 330 + }, + { + "epoch": 0.17058924583404914, + "grad_norm": 1.1817179918289185, + "learning_rate": 4.995058270481807e-06, + "loss": 1.1391, + "step": 331 + }, + { + "epoch": 0.1711046211991067, + "grad_norm": 1.1281492710113525, + "learning_rate": 4.9950154065898015e-06, + "loss": 1.2523, + "step": 332 + }, + { + "epoch": 0.17161999656416424, + "grad_norm": 1.1411288976669312, + "learning_rate": 4.9949723577873875e-06, + "loss": 1.2221, + "step": 333 + }, + { + "epoch": 0.1721353719292218, + "grad_norm": 1.2243231534957886, + "learning_rate": 4.994929124077756e-06, + "loss": 1.2469, + "step": 334 + }, + { + "epoch": 0.17265074729427934, + "grad_norm": 1.1771996021270752, + "learning_rate": 4.994885705464112e-06, + "loss": 1.1737, + "step": 335 + }, + { + "epoch": 0.1731661226593369, + "grad_norm": 1.1274346113204956, + "learning_rate": 4.9948421019496715e-06, + "loss": 1.2228, + "step": 336 + }, + { + "epoch": 0.17368149802439442, + "grad_norm": 1.0713800191879272, + "learning_rate": 4.994798313537668e-06, + "loss": 1.2221, + "step": 337 + }, + { + "epoch": 0.17419687338945197, + "grad_norm": 1.2393444776535034, + "learning_rate": 4.994754340231345e-06, + "loss": 1.1559, + "step": 338 + }, + { + "epoch": 0.17471224875450952, + "grad_norm": 1.173123836517334, + "learning_rate": 4.9947101820339615e-06, + "loss": 1.1904, + "step": 339 + }, + { + "epoch": 0.17522762411956708, + "grad_norm": 1.181182861328125, + "learning_rate": 4.994665838948792e-06, + "loss": 1.1822, + "step": 340 + }, + { + "epoch": 0.17574299948462463, + "grad_norm": 1.076783537864685, + "learning_rate": 4.994621310979121e-06, + "loss": 1.2435, + "step": 341 + }, + { + "epoch": 0.17625837484968218, + "grad_norm": 1.1784260272979736, + "learning_rate": 4.99457659812825e-06, + "loss": 1.2318, + "step": 342 + }, + { + "epoch": 0.17677375021473973, + "grad_norm": 1.105627179145813, + "learning_rate": 4.994531700399491e-06, + "loss": 1.238, + "step": 343 + }, + { + "epoch": 0.17728912557979729, + "grad_norm": 1.1091389656066895, + "learning_rate": 4.994486617796172e-06, + "loss": 1.1879, + "step": 344 + }, + { + "epoch": 0.17780450094485484, + "grad_norm": 1.0326907634735107, + "learning_rate": 4.994441350321636e-06, + "loss": 1.1607, + "step": 345 + }, + { + "epoch": 0.1783198763099124, + "grad_norm": 1.0360324382781982, + "learning_rate": 4.994395897979236e-06, + "loss": 1.1719, + "step": 346 + }, + { + "epoch": 0.17883525167496994, + "grad_norm": 1.1248440742492676, + "learning_rate": 4.994350260772341e-06, + "loss": 1.1273, + "step": 347 + }, + { + "epoch": 0.1793506270400275, + "grad_norm": 1.0810964107513428, + "learning_rate": 4.9943044387043324e-06, + "loss": 1.2178, + "step": 348 + }, + { + "epoch": 0.17986600240508505, + "grad_norm": 1.1901923418045044, + "learning_rate": 4.994258431778608e-06, + "loss": 1.2045, + "step": 349 + }, + { + "epoch": 0.1803813777701426, + "grad_norm": 1.3648993968963623, + "learning_rate": 4.994212239998577e-06, + "loss": 1.2434, + "step": 350 + }, + { + "epoch": 0.18089675313520015, + "grad_norm": 1.2584412097930908, + "learning_rate": 4.994165863367661e-06, + "loss": 1.2653, + "step": 351 + }, + { + "epoch": 0.18141212850025767, + "grad_norm": 1.1656162738800049, + "learning_rate": 4.994119301889299e-06, + "loss": 1.2114, + "step": 352 + }, + { + "epoch": 0.18192750386531523, + "grad_norm": 1.3355895280838013, + "learning_rate": 4.994072555566941e-06, + "loss": 1.2333, + "step": 353 + }, + { + "epoch": 0.18244287923037278, + "grad_norm": 1.1254146099090576, + "learning_rate": 4.994025624404052e-06, + "loss": 1.1686, + "step": 354 + }, + { + "epoch": 0.18295825459543033, + "grad_norm": 1.1978436708450317, + "learning_rate": 4.993978508404109e-06, + "loss": 1.121, + "step": 355 + }, + { + "epoch": 0.18347362996048788, + "grad_norm": 1.1566013097763062, + "learning_rate": 4.9939312075706046e-06, + "loss": 1.1487, + "step": 356 + }, + { + "epoch": 0.18398900532554543, + "grad_norm": 1.1110261678695679, + "learning_rate": 4.993883721907045e-06, + "loss": 1.1865, + "step": 357 + }, + { + "epoch": 0.184504380690603, + "grad_norm": 1.1373953819274902, + "learning_rate": 4.993836051416948e-06, + "loss": 1.1603, + "step": 358 + }, + { + "epoch": 0.18501975605566054, + "grad_norm": 1.1255364418029785, + "learning_rate": 4.993788196103847e-06, + "loss": 1.2125, + "step": 359 + }, + { + "epoch": 0.1855351314207181, + "grad_norm": 1.1953908205032349, + "learning_rate": 4.9937401559712895e-06, + "loss": 1.1718, + "step": 360 + }, + { + "epoch": 0.18605050678577564, + "grad_norm": 1.1173852682113647, + "learning_rate": 4.993691931022836e-06, + "loss": 1.1882, + "step": 361 + }, + { + "epoch": 0.1865658821508332, + "grad_norm": 1.155800461769104, + "learning_rate": 4.993643521262059e-06, + "loss": 1.2037, + "step": 362 + }, + { + "epoch": 0.18708125751589075, + "grad_norm": 1.1004916429519653, + "learning_rate": 4.993594926692547e-06, + "loss": 1.2128, + "step": 363 + }, + { + "epoch": 0.1875966328809483, + "grad_norm": 1.2067153453826904, + "learning_rate": 4.993546147317902e-06, + "loss": 1.2047, + "step": 364 + }, + { + "epoch": 0.18811200824600585, + "grad_norm": 1.1071127653121948, + "learning_rate": 4.993497183141739e-06, + "loss": 1.1837, + "step": 365 + }, + { + "epoch": 0.1886273836110634, + "grad_norm": 1.2650222778320312, + "learning_rate": 4.993448034167685e-06, + "loss": 1.1978, + "step": 366 + }, + { + "epoch": 0.18914275897612093, + "grad_norm": 1.251857042312622, + "learning_rate": 4.993398700399386e-06, + "loss": 1.194, + "step": 367 + }, + { + "epoch": 0.18965813434117848, + "grad_norm": 1.1337946653366089, + "learning_rate": 4.993349181840495e-06, + "loss": 1.1191, + "step": 368 + }, + { + "epoch": 0.19017350970623603, + "grad_norm": 1.1532639265060425, + "learning_rate": 4.9932994784946835e-06, + "loss": 1.1845, + "step": 369 + }, + { + "epoch": 0.19068888507129358, + "grad_norm": 1.1313118934631348, + "learning_rate": 4.993249590365635e-06, + "loss": 1.1588, + "step": 370 + }, + { + "epoch": 0.19120426043635114, + "grad_norm": 1.1309375762939453, + "learning_rate": 4.993199517457046e-06, + "loss": 1.1107, + "step": 371 + }, + { + "epoch": 0.1917196358014087, + "grad_norm": 1.2140612602233887, + "learning_rate": 4.9931492597726285e-06, + "loss": 1.1418, + "step": 372 + }, + { + "epoch": 0.19223501116646624, + "grad_norm": 1.2233984470367432, + "learning_rate": 4.993098817316107e-06, + "loss": 1.1627, + "step": 373 + }, + { + "epoch": 0.1927503865315238, + "grad_norm": 1.1959316730499268, + "learning_rate": 4.993048190091219e-06, + "loss": 1.2024, + "step": 374 + }, + { + "epoch": 0.19326576189658135, + "grad_norm": 1.1958948373794556, + "learning_rate": 4.9929973781017175e-06, + "loss": 1.2381, + "step": 375 + }, + { + "epoch": 0.1937811372616389, + "grad_norm": 1.071413516998291, + "learning_rate": 4.9929463813513676e-06, + "loss": 1.1692, + "step": 376 + }, + { + "epoch": 0.19429651262669645, + "grad_norm": 1.1295154094696045, + "learning_rate": 4.99289519984395e-06, + "loss": 1.2469, + "step": 377 + }, + { + "epoch": 0.194811887991754, + "grad_norm": 1.059316635131836, + "learning_rate": 4.9928438335832575e-06, + "loss": 1.1579, + "step": 378 + }, + { + "epoch": 0.19532726335681155, + "grad_norm": 1.0744855403900146, + "learning_rate": 4.992792282573095e-06, + "loss": 1.1765, + "step": 379 + }, + { + "epoch": 0.1958426387218691, + "grad_norm": 1.1298083066940308, + "learning_rate": 4.992740546817286e-06, + "loss": 1.2546, + "step": 380 + }, + { + "epoch": 0.19635801408692666, + "grad_norm": 1.0984526872634888, + "learning_rate": 4.992688626319662e-06, + "loss": 1.1716, + "step": 381 + }, + { + "epoch": 0.19687338945198418, + "grad_norm": 1.1361417770385742, + "learning_rate": 4.992636521084073e-06, + "loss": 1.1532, + "step": 382 + }, + { + "epoch": 0.19738876481704173, + "grad_norm": 1.1759017705917358, + "learning_rate": 4.9925842311143794e-06, + "loss": 1.2112, + "step": 383 + }, + { + "epoch": 0.1979041401820993, + "grad_norm": 1.1265839338302612, + "learning_rate": 4.992531756414457e-06, + "loss": 1.1864, + "step": 384 + }, + { + "epoch": 0.19841951554715684, + "grad_norm": 1.319730281829834, + "learning_rate": 4.992479096988196e-06, + "loss": 1.1995, + "step": 385 + }, + { + "epoch": 0.1989348909122144, + "grad_norm": 1.1547197103500366, + "learning_rate": 4.992426252839496e-06, + "loss": 1.2146, + "step": 386 + }, + { + "epoch": 0.19945026627727194, + "grad_norm": 1.1917409896850586, + "learning_rate": 4.992373223972276e-06, + "loss": 1.1627, + "step": 387 + }, + { + "epoch": 0.1999656416423295, + "grad_norm": 1.0941272974014282, + "learning_rate": 4.992320010390465e-06, + "loss": 1.1789, + "step": 388 + }, + { + "epoch": 0.20048101700738705, + "grad_norm": 1.236716389656067, + "learning_rate": 4.992266612098008e-06, + "loss": 1.2297, + "step": 389 + }, + { + "epoch": 0.2009963923724446, + "grad_norm": 1.0751968622207642, + "learning_rate": 4.9922130290988606e-06, + "loss": 1.1377, + "step": 390 + }, + { + "epoch": 0.20151176773750215, + "grad_norm": 1.168311595916748, + "learning_rate": 4.992159261396995e-06, + "loss": 1.2123, + "step": 391 + }, + { + "epoch": 0.2020271431025597, + "grad_norm": 1.1629900932312012, + "learning_rate": 4.9921053089963956e-06, + "loss": 1.1618, + "step": 392 + }, + { + "epoch": 0.20254251846761726, + "grad_norm": 1.1627777814865112, + "learning_rate": 4.992051171901062e-06, + "loss": 1.2022, + "step": 393 + }, + { + "epoch": 0.2030578938326748, + "grad_norm": 1.0521548986434937, + "learning_rate": 4.991996850115005e-06, + "loss": 1.1376, + "step": 394 + }, + { + "epoch": 0.20357326919773236, + "grad_norm": 1.0964367389678955, + "learning_rate": 4.991942343642251e-06, + "loss": 1.151, + "step": 395 + }, + { + "epoch": 0.2040886445627899, + "grad_norm": 1.103646993637085, + "learning_rate": 4.99188765248684e-06, + "loss": 1.1088, + "step": 396 + }, + { + "epoch": 0.20460401992784744, + "grad_norm": 1.2664456367492676, + "learning_rate": 4.991832776652824e-06, + "loss": 1.1976, + "step": 397 + }, + { + "epoch": 0.205119395292905, + "grad_norm": 1.0794459581375122, + "learning_rate": 4.991777716144272e-06, + "loss": 1.1382, + "step": 398 + }, + { + "epoch": 0.20563477065796254, + "grad_norm": 1.1090879440307617, + "learning_rate": 4.991722470965263e-06, + "loss": 1.1332, + "step": 399 + }, + { + "epoch": 0.2061501460230201, + "grad_norm": 1.1466739177703857, + "learning_rate": 4.991667041119892e-06, + "loss": 1.1976, + "step": 400 + }, + { + "epoch": 0.20666552138807764, + "grad_norm": 1.1100345849990845, + "learning_rate": 4.991611426612267e-06, + "loss": 1.2142, + "step": 401 + }, + { + "epoch": 0.2071808967531352, + "grad_norm": 1.1167898178100586, + "learning_rate": 4.991555627446509e-06, + "loss": 1.1701, + "step": 402 + }, + { + "epoch": 0.20769627211819275, + "grad_norm": 1.124314546585083, + "learning_rate": 4.9914996436267545e-06, + "loss": 1.1326, + "step": 403 + }, + { + "epoch": 0.2082116474832503, + "grad_norm": 1.0986205339431763, + "learning_rate": 4.991443475157152e-06, + "loss": 1.1814, + "step": 404 + }, + { + "epoch": 0.20872702284830785, + "grad_norm": 1.1348336935043335, + "learning_rate": 4.9913871220418645e-06, + "loss": 1.2108, + "step": 405 + }, + { + "epoch": 0.2092423982133654, + "grad_norm": 1.117326259613037, + "learning_rate": 4.991330584285068e-06, + "loss": 1.1665, + "step": 406 + }, + { + "epoch": 0.20975777357842296, + "grad_norm": 1.251592755317688, + "learning_rate": 4.991273861890952e-06, + "loss": 1.1327, + "step": 407 + }, + { + "epoch": 0.2102731489434805, + "grad_norm": 1.2220373153686523, + "learning_rate": 4.991216954863722e-06, + "loss": 1.181, + "step": 408 + }, + { + "epoch": 0.21078852430853806, + "grad_norm": 1.265209436416626, + "learning_rate": 4.991159863207595e-06, + "loss": 1.1855, + "step": 409 + }, + { + "epoch": 0.2113038996735956, + "grad_norm": 1.1213948726654053, + "learning_rate": 4.991102586926801e-06, + "loss": 1.1839, + "step": 410 + }, + { + "epoch": 0.21181927503865317, + "grad_norm": 1.1888283491134644, + "learning_rate": 4.991045126025585e-06, + "loss": 1.1649, + "step": 411 + }, + { + "epoch": 0.2123346504037107, + "grad_norm": 1.208165168762207, + "learning_rate": 4.990987480508208e-06, + "loss": 1.1751, + "step": 412 + }, + { + "epoch": 0.21285002576876824, + "grad_norm": 1.254124641418457, + "learning_rate": 4.990929650378939e-06, + "loss": 1.1697, + "step": 413 + }, + { + "epoch": 0.2133654011338258, + "grad_norm": 1.1125601530075073, + "learning_rate": 4.990871635642066e-06, + "loss": 1.1212, + "step": 414 + }, + { + "epoch": 0.21388077649888335, + "grad_norm": 1.1498005390167236, + "learning_rate": 4.990813436301888e-06, + "loss": 1.1929, + "step": 415 + }, + { + "epoch": 0.2143961518639409, + "grad_norm": 1.1422375440597534, + "learning_rate": 4.990755052362718e-06, + "loss": 1.1379, + "step": 416 + }, + { + "epoch": 0.21491152722899845, + "grad_norm": 1.214056134223938, + "learning_rate": 4.990696483828883e-06, + "loss": 1.1666, + "step": 417 + }, + { + "epoch": 0.215426902594056, + "grad_norm": 1.1769565343856812, + "learning_rate": 4.990637730704723e-06, + "loss": 1.1579, + "step": 418 + }, + { + "epoch": 0.21594227795911355, + "grad_norm": 1.1861079931259155, + "learning_rate": 4.990578792994594e-06, + "loss": 1.162, + "step": 419 + }, + { + "epoch": 0.2164576533241711, + "grad_norm": 1.1193722486495972, + "learning_rate": 4.990519670702862e-06, + "loss": 1.201, + "step": 420 + }, + { + "epoch": 0.21697302868922866, + "grad_norm": 1.1538455486297607, + "learning_rate": 4.990460363833911e-06, + "loss": 1.1193, + "step": 421 + }, + { + "epoch": 0.2174884040542862, + "grad_norm": 1.1257574558258057, + "learning_rate": 4.990400872392135e-06, + "loss": 1.1047, + "step": 422 + }, + { + "epoch": 0.21800377941934376, + "grad_norm": 1.1589558124542236, + "learning_rate": 4.990341196381943e-06, + "loss": 1.203, + "step": 423 + }, + { + "epoch": 0.21851915478440131, + "grad_norm": 1.1535980701446533, + "learning_rate": 4.990281335807757e-06, + "loss": 1.0999, + "step": 424 + }, + { + "epoch": 0.21903453014945887, + "grad_norm": 1.198763132095337, + "learning_rate": 4.990221290674015e-06, + "loss": 1.1161, + "step": 425 + }, + { + "epoch": 0.21954990551451642, + "grad_norm": 1.258095383644104, + "learning_rate": 4.990161060985164e-06, + "loss": 1.2035, + "step": 426 + }, + { + "epoch": 0.22006528087957394, + "grad_norm": 1.1130949258804321, + "learning_rate": 4.990100646745673e-06, + "loss": 1.1441, + "step": 427 + }, + { + "epoch": 0.2205806562446315, + "grad_norm": 1.224503517150879, + "learning_rate": 4.990040047960015e-06, + "loss": 1.1787, + "step": 428 + }, + { + "epoch": 0.22109603160968905, + "grad_norm": 1.2775068283081055, + "learning_rate": 4.989979264632683e-06, + "loss": 1.1818, + "step": 429 + }, + { + "epoch": 0.2216114069747466, + "grad_norm": 1.1601276397705078, + "learning_rate": 4.989918296768181e-06, + "loss": 1.1887, + "step": 430 + }, + { + "epoch": 0.22212678233980415, + "grad_norm": 1.1515289545059204, + "learning_rate": 4.9898571443710265e-06, + "loss": 1.1445, + "step": 431 + }, + { + "epoch": 0.2226421577048617, + "grad_norm": 1.141028881072998, + "learning_rate": 4.989795807445753e-06, + "loss": 1.1901, + "step": 432 + }, + { + "epoch": 0.22315753306991926, + "grad_norm": 1.192578673362732, + "learning_rate": 4.989734285996907e-06, + "loss": 1.1934, + "step": 433 + }, + { + "epoch": 0.2236729084349768, + "grad_norm": 1.2101963758468628, + "learning_rate": 4.989672580029047e-06, + "loss": 1.1493, + "step": 434 + }, + { + "epoch": 0.22418828380003436, + "grad_norm": 1.150976538658142, + "learning_rate": 4.989610689546747e-06, + "loss": 1.2027, + "step": 435 + }, + { + "epoch": 0.2247036591650919, + "grad_norm": 1.1535725593566895, + "learning_rate": 4.989548614554592e-06, + "loss": 1.1637, + "step": 436 + }, + { + "epoch": 0.22521903453014946, + "grad_norm": 1.12608802318573, + "learning_rate": 4.989486355057184e-06, + "loss": 1.1192, + "step": 437 + }, + { + "epoch": 0.22573440989520702, + "grad_norm": 1.143556833267212, + "learning_rate": 4.989423911059136e-06, + "loss": 1.1314, + "step": 438 + }, + { + "epoch": 0.22624978526026457, + "grad_norm": 1.2795722484588623, + "learning_rate": 4.989361282565077e-06, + "loss": 1.1525, + "step": 439 + }, + { + "epoch": 0.22676516062532212, + "grad_norm": 1.1248250007629395, + "learning_rate": 4.9892984695796484e-06, + "loss": 1.1635, + "step": 440 + }, + { + "epoch": 0.22728053599037967, + "grad_norm": 1.1849747896194458, + "learning_rate": 4.989235472107504e-06, + "loss": 1.1905, + "step": 441 + }, + { + "epoch": 0.2277959113554372, + "grad_norm": 1.243175983428955, + "learning_rate": 4.989172290153314e-06, + "loss": 1.1504, + "step": 442 + }, + { + "epoch": 0.22831128672049475, + "grad_norm": 1.2722020149230957, + "learning_rate": 4.989108923721762e-06, + "loss": 1.2047, + "step": 443 + }, + { + "epoch": 0.2288266620855523, + "grad_norm": 1.1677055358886719, + "learning_rate": 4.989045372817542e-06, + "loss": 1.1834, + "step": 444 + }, + { + "epoch": 0.22934203745060985, + "grad_norm": 1.2984954118728638, + "learning_rate": 4.988981637445365e-06, + "loss": 1.2295, + "step": 445 + }, + { + "epoch": 0.2298574128156674, + "grad_norm": 1.3156894445419312, + "learning_rate": 4.988917717609954e-06, + "loss": 1.1091, + "step": 446 + }, + { + "epoch": 0.23037278818072496, + "grad_norm": 1.2146261930465698, + "learning_rate": 4.988853613316047e-06, + "loss": 1.1843, + "step": 447 + }, + { + "epoch": 0.2308881635457825, + "grad_norm": 1.1358487606048584, + "learning_rate": 4.988789324568394e-06, + "loss": 1.1165, + "step": 448 + }, + { + "epoch": 0.23140353891084006, + "grad_norm": 1.2033812999725342, + "learning_rate": 4.98872485137176e-06, + "loss": 1.1391, + "step": 449 + }, + { + "epoch": 0.23191891427589761, + "grad_norm": 1.194493293762207, + "learning_rate": 4.9886601937309235e-06, + "loss": 1.1541, + "step": 450 + }, + { + "epoch": 0.23243428964095517, + "grad_norm": 1.121980905532837, + "learning_rate": 4.988595351650676e-06, + "loss": 1.1393, + "step": 451 + }, + { + "epoch": 0.23294966500601272, + "grad_norm": 1.1627380847930908, + "learning_rate": 4.988530325135823e-06, + "loss": 1.1332, + "step": 452 + }, + { + "epoch": 0.23346504037107027, + "grad_norm": 1.3269914388656616, + "learning_rate": 4.988465114191183e-06, + "loss": 1.1415, + "step": 453 + }, + { + "epoch": 0.23398041573612782, + "grad_norm": 1.1122335195541382, + "learning_rate": 4.988399718821592e-06, + "loss": 1.1451, + "step": 454 + }, + { + "epoch": 0.23449579110118537, + "grad_norm": 1.2022422552108765, + "learning_rate": 4.988334139031893e-06, + "loss": 1.1097, + "step": 455 + }, + { + "epoch": 0.23501116646624293, + "grad_norm": 1.1904706954956055, + "learning_rate": 4.988268374826948e-06, + "loss": 1.1822, + "step": 456 + }, + { + "epoch": 0.23552654183130045, + "grad_norm": 1.1483618021011353, + "learning_rate": 4.98820242621163e-06, + "loss": 1.138, + "step": 457 + }, + { + "epoch": 0.236041917196358, + "grad_norm": 1.140466570854187, + "learning_rate": 4.988136293190828e-06, + "loss": 1.1507, + "step": 458 + }, + { + "epoch": 0.23655729256141556, + "grad_norm": 1.12582528591156, + "learning_rate": 4.988069975769442e-06, + "loss": 1.1049, + "step": 459 + }, + { + "epoch": 0.2370726679264731, + "grad_norm": 1.2952275276184082, + "learning_rate": 4.9880034739523875e-06, + "loss": 1.1716, + "step": 460 + }, + { + "epoch": 0.23758804329153066, + "grad_norm": 1.1505956649780273, + "learning_rate": 4.9879367877445935e-06, + "loss": 1.1647, + "step": 461 + }, + { + "epoch": 0.2381034186565882, + "grad_norm": 1.237804889678955, + "learning_rate": 4.987869917151001e-06, + "loss": 1.2249, + "step": 462 + }, + { + "epoch": 0.23861879402164576, + "grad_norm": 1.2095214128494263, + "learning_rate": 4.987802862176566e-06, + "loss": 1.163, + "step": 463 + }, + { + "epoch": 0.23913416938670332, + "grad_norm": 1.2811049222946167, + "learning_rate": 4.98773562282626e-06, + "loss": 1.1861, + "step": 464 + }, + { + "epoch": 0.23964954475176087, + "grad_norm": 1.2019784450531006, + "learning_rate": 4.9876681991050635e-06, + "loss": 1.1322, + "step": 465 + }, + { + "epoch": 0.24016492011681842, + "grad_norm": 1.2116162776947021, + "learning_rate": 4.987600591017975e-06, + "loss": 1.2162, + "step": 466 + }, + { + "epoch": 0.24068029548187597, + "grad_norm": 1.1459062099456787, + "learning_rate": 4.987532798570005e-06, + "loss": 1.1615, + "step": 467 + }, + { + "epoch": 0.24119567084693352, + "grad_norm": 1.2083719968795776, + "learning_rate": 4.987464821766178e-06, + "loss": 1.1726, + "step": 468 + }, + { + "epoch": 0.24171104621199108, + "grad_norm": 1.2292635440826416, + "learning_rate": 4.987396660611531e-06, + "loss": 1.1212, + "step": 469 + }, + { + "epoch": 0.24222642157704863, + "grad_norm": 1.0754214525222778, + "learning_rate": 4.987328315111116e-06, + "loss": 1.1372, + "step": 470 + }, + { + "epoch": 0.24274179694210618, + "grad_norm": 1.205851674079895, + "learning_rate": 4.987259785269999e-06, + "loss": 1.1463, + "step": 471 + }, + { + "epoch": 0.2432571723071637, + "grad_norm": 1.2169773578643799, + "learning_rate": 4.987191071093257e-06, + "loss": 1.1705, + "step": 472 + }, + { + "epoch": 0.24377254767222126, + "grad_norm": 1.1939457654953003, + "learning_rate": 4.987122172585984e-06, + "loss": 1.1503, + "step": 473 + }, + { + "epoch": 0.2442879230372788, + "grad_norm": 1.179269552230835, + "learning_rate": 4.987053089753286e-06, + "loss": 1.1693, + "step": 474 + }, + { + "epoch": 0.24480329840233636, + "grad_norm": 1.1368995904922485, + "learning_rate": 4.986983822600283e-06, + "loss": 1.1204, + "step": 475 + }, + { + "epoch": 0.2453186737673939, + "grad_norm": 1.1833655834197998, + "learning_rate": 4.986914371132108e-06, + "loss": 1.1512, + "step": 476 + }, + { + "epoch": 0.24583404913245147, + "grad_norm": 1.1153130531311035, + "learning_rate": 4.986844735353908e-06, + "loss": 1.1726, + "step": 477 + }, + { + "epoch": 0.24634942449750902, + "grad_norm": 1.1726752519607544, + "learning_rate": 4.986774915270845e-06, + "loss": 1.1657, + "step": 478 + }, + { + "epoch": 0.24686479986256657, + "grad_norm": 1.2259719371795654, + "learning_rate": 4.9867049108880915e-06, + "loss": 1.1545, + "step": 479 + }, + { + "epoch": 0.24738017522762412, + "grad_norm": 1.2755433320999146, + "learning_rate": 4.986634722210838e-06, + "loss": 1.2373, + "step": 480 + }, + { + "epoch": 0.24789555059268167, + "grad_norm": 1.139876365661621, + "learning_rate": 4.986564349244285e-06, + "loss": 1.0958, + "step": 481 + }, + { + "epoch": 0.24841092595773923, + "grad_norm": 1.1974653005599976, + "learning_rate": 4.986493791993648e-06, + "loss": 1.1678, + "step": 482 + }, + { + "epoch": 0.24892630132279678, + "grad_norm": 1.124848484992981, + "learning_rate": 4.986423050464156e-06, + "loss": 1.1129, + "step": 483 + }, + { + "epoch": 0.24944167668785433, + "grad_norm": 1.2005858421325684, + "learning_rate": 4.986352124661053e-06, + "loss": 1.1665, + "step": 484 + }, + { + "epoch": 0.24995705205291188, + "grad_norm": 1.133514642715454, + "learning_rate": 4.986281014589594e-06, + "loss": 1.0938, + "step": 485 + }, + { + "epoch": 0.25047242741796943, + "grad_norm": 1.2003309726715088, + "learning_rate": 4.9862097202550496e-06, + "loss": 1.1792, + "step": 486 + }, + { + "epoch": 0.25098780278302696, + "grad_norm": 1.1737325191497803, + "learning_rate": 4.986138241662705e-06, + "loss": 1.1225, + "step": 487 + }, + { + "epoch": 0.25150317814808454, + "grad_norm": 1.1730244159698486, + "learning_rate": 4.9860665788178545e-06, + "loss": 1.1572, + "step": 488 + }, + { + "epoch": 0.25201855351314206, + "grad_norm": 1.206847906112671, + "learning_rate": 4.985994731725811e-06, + "loss": 1.1763, + "step": 489 + }, + { + "epoch": 0.25253392887819964, + "grad_norm": 1.1796056032180786, + "learning_rate": 4.985922700391901e-06, + "loss": 1.1572, + "step": 490 + }, + { + "epoch": 0.25304930424325717, + "grad_norm": 1.1734232902526855, + "learning_rate": 4.98585048482146e-06, + "loss": 1.1403, + "step": 491 + }, + { + "epoch": 0.25356467960831475, + "grad_norm": 1.1931697130203247, + "learning_rate": 4.98577808501984e-06, + "loss": 1.1966, + "step": 492 + }, + { + "epoch": 0.25408005497337227, + "grad_norm": 1.2577358484268188, + "learning_rate": 4.985705500992408e-06, + "loss": 1.1307, + "step": 493 + }, + { + "epoch": 0.2545954303384298, + "grad_norm": 1.2262202501296997, + "learning_rate": 4.985632732744544e-06, + "loss": 1.1662, + "step": 494 + }, + { + "epoch": 0.2551108057034874, + "grad_norm": 1.2096452713012695, + "learning_rate": 4.9855597802816405e-06, + "loss": 1.1953, + "step": 495 + }, + { + "epoch": 0.2556261810685449, + "grad_norm": 1.161454200744629, + "learning_rate": 4.985486643609103e-06, + "loss": 1.1444, + "step": 496 + }, + { + "epoch": 0.2561415564336025, + "grad_norm": 1.1236664056777954, + "learning_rate": 4.985413322732353e-06, + "loss": 1.157, + "step": 497 + }, + { + "epoch": 0.25665693179866, + "grad_norm": 1.314698338508606, + "learning_rate": 4.9853398176568235e-06, + "loss": 1.1376, + "step": 498 + }, + { + "epoch": 0.2571723071637176, + "grad_norm": 1.1444615125656128, + "learning_rate": 4.985266128387963e-06, + "loss": 1.126, + "step": 499 + }, + { + "epoch": 0.2576876825287751, + "grad_norm": 1.2295600175857544, + "learning_rate": 4.985192254931232e-06, + "loss": 1.15, + "step": 500 + }, + { + "epoch": 0.2582030578938327, + "grad_norm": 1.197485327720642, + "learning_rate": 4.985118197292106e-06, + "loss": 1.2171, + "step": 501 + }, + { + "epoch": 0.2587184332588902, + "grad_norm": 1.2253940105438232, + "learning_rate": 4.985043955476072e-06, + "loss": 1.1144, + "step": 502 + }, + { + "epoch": 0.2592338086239478, + "grad_norm": 1.2434132099151611, + "learning_rate": 4.984969529488636e-06, + "loss": 1.1776, + "step": 503 + }, + { + "epoch": 0.2597491839890053, + "grad_norm": 1.1865484714508057, + "learning_rate": 4.98489491933531e-06, + "loss": 1.1125, + "step": 504 + }, + { + "epoch": 0.2602645593540629, + "grad_norm": 1.2330963611602783, + "learning_rate": 4.984820125021624e-06, + "loss": 1.1663, + "step": 505 + }, + { + "epoch": 0.2607799347191204, + "grad_norm": 1.1894609928131104, + "learning_rate": 4.9847451465531235e-06, + "loss": 1.1503, + "step": 506 + }, + { + "epoch": 0.261295310084178, + "grad_norm": 1.209188461303711, + "learning_rate": 4.984669983935363e-06, + "loss": 1.1668, + "step": 507 + }, + { + "epoch": 0.2618106854492355, + "grad_norm": 1.1585817337036133, + "learning_rate": 4.984594637173915e-06, + "loss": 1.1212, + "step": 508 + }, + { + "epoch": 0.26232606081429305, + "grad_norm": 1.091931700706482, + "learning_rate": 4.9845191062743615e-06, + "loss": 1.1311, + "step": 509 + }, + { + "epoch": 0.26284143617935063, + "grad_norm": 1.207065463066101, + "learning_rate": 4.984443391242301e-06, + "loss": 1.18, + "step": 510 + }, + { + "epoch": 0.26335681154440815, + "grad_norm": 1.2607285976409912, + "learning_rate": 4.984367492083345e-06, + "loss": 1.1685, + "step": 511 + }, + { + "epoch": 0.26387218690946573, + "grad_norm": 1.1094540357589722, + "learning_rate": 4.984291408803118e-06, + "loss": 1.0477, + "step": 512 + }, + { + "epoch": 0.26438756227452326, + "grad_norm": 1.123847246170044, + "learning_rate": 4.9842151414072605e-06, + "loss": 1.1365, + "step": 513 + }, + { + "epoch": 0.26490293763958084, + "grad_norm": 1.189071536064148, + "learning_rate": 4.984138689901423e-06, + "loss": 1.1528, + "step": 514 + }, + { + "epoch": 0.26541831300463836, + "grad_norm": 1.227555274963379, + "learning_rate": 4.984062054291272e-06, + "loss": 1.0919, + "step": 515 + }, + { + "epoch": 0.26593368836969594, + "grad_norm": 1.148440957069397, + "learning_rate": 4.983985234582487e-06, + "loss": 1.1267, + "step": 516 + }, + { + "epoch": 0.26644906373475347, + "grad_norm": 1.227849006652832, + "learning_rate": 4.9839082307807605e-06, + "loss": 1.1306, + "step": 517 + }, + { + "epoch": 0.26696443909981105, + "grad_norm": 1.255359172821045, + "learning_rate": 4.983831042891801e-06, + "loss": 1.2317, + "step": 518 + }, + { + "epoch": 0.26747981446486857, + "grad_norm": 1.2154213190078735, + "learning_rate": 4.983753670921329e-06, + "loss": 1.1966, + "step": 519 + }, + { + "epoch": 0.26799518982992615, + "grad_norm": 1.2825920581817627, + "learning_rate": 4.983676114875078e-06, + "loss": 1.2206, + "step": 520 + }, + { + "epoch": 0.2685105651949837, + "grad_norm": 1.1783586740493774, + "learning_rate": 4.983598374758794e-06, + "loss": 1.1799, + "step": 521 + }, + { + "epoch": 0.26902594056004125, + "grad_norm": 1.1926697492599487, + "learning_rate": 4.983520450578242e-06, + "loss": 1.1976, + "step": 522 + }, + { + "epoch": 0.2695413159250988, + "grad_norm": 1.291904091835022, + "learning_rate": 4.983442342339195e-06, + "loss": 1.147, + "step": 523 + }, + { + "epoch": 0.2700566912901563, + "grad_norm": 1.2051136493682861, + "learning_rate": 4.9833640500474426e-06, + "loss": 1.1981, + "step": 524 + }, + { + "epoch": 0.2705720666552139, + "grad_norm": 1.1611932516098022, + "learning_rate": 4.983285573708786e-06, + "loss": 1.1444, + "step": 525 + }, + { + "epoch": 0.2710874420202714, + "grad_norm": 1.1475062370300293, + "learning_rate": 4.983206913329043e-06, + "loss": 1.1773, + "step": 526 + }, + { + "epoch": 0.271602817385329, + "grad_norm": 1.2453409433364868, + "learning_rate": 4.983128068914041e-06, + "loss": 1.1212, + "step": 527 + }, + { + "epoch": 0.2721181927503865, + "grad_norm": 1.224652886390686, + "learning_rate": 4.983049040469625e-06, + "loss": 1.13, + "step": 528 + }, + { + "epoch": 0.2726335681154441, + "grad_norm": 1.2433110475540161, + "learning_rate": 4.982969828001652e-06, + "loss": 1.1605, + "step": 529 + }, + { + "epoch": 0.2731489434805016, + "grad_norm": 1.1937365531921387, + "learning_rate": 4.982890431515992e-06, + "loss": 1.1063, + "step": 530 + }, + { + "epoch": 0.2736643188455592, + "grad_norm": 1.1492749452590942, + "learning_rate": 4.982810851018529e-06, + "loss": 1.1967, + "step": 531 + }, + { + "epoch": 0.2741796942106167, + "grad_norm": 1.1265840530395508, + "learning_rate": 4.982731086515161e-06, + "loss": 1.1016, + "step": 532 + }, + { + "epoch": 0.2746950695756743, + "grad_norm": 1.262799620628357, + "learning_rate": 4.9826511380118e-06, + "loss": 1.1779, + "step": 533 + }, + { + "epoch": 0.2752104449407318, + "grad_norm": 1.199607491493225, + "learning_rate": 4.982571005514371e-06, + "loss": 1.0855, + "step": 534 + }, + { + "epoch": 0.2757258203057894, + "grad_norm": 1.227246642112732, + "learning_rate": 4.982490689028812e-06, + "loss": 1.1195, + "step": 535 + }, + { + "epoch": 0.27624119567084693, + "grad_norm": 1.30074942111969, + "learning_rate": 4.982410188561077e-06, + "loss": 1.1773, + "step": 536 + }, + { + "epoch": 0.2767565710359045, + "grad_norm": 1.219909906387329, + "learning_rate": 4.982329504117131e-06, + "loss": 1.1434, + "step": 537 + }, + { + "epoch": 0.27727194640096203, + "grad_norm": 1.3017019033432007, + "learning_rate": 4.982248635702953e-06, + "loss": 1.2061, + "step": 538 + }, + { + "epoch": 0.27778732176601956, + "grad_norm": 1.2198671102523804, + "learning_rate": 4.9821675833245375e-06, + "loss": 1.1297, + "step": 539 + }, + { + "epoch": 0.27830269713107714, + "grad_norm": 1.2024649381637573, + "learning_rate": 4.982086346987891e-06, + "loss": 1.188, + "step": 540 + }, + { + "epoch": 0.27881807249613466, + "grad_norm": 1.2666974067687988, + "learning_rate": 4.9820049266990345e-06, + "loss": 1.2017, + "step": 541 + }, + { + "epoch": 0.27933344786119224, + "grad_norm": 1.197114109992981, + "learning_rate": 4.981923322464001e-06, + "loss": 1.1401, + "step": 542 + }, + { + "epoch": 0.27984882322624977, + "grad_norm": 1.2055046558380127, + "learning_rate": 4.9818415342888395e-06, + "loss": 1.1649, + "step": 543 + }, + { + "epoch": 0.28036419859130735, + "grad_norm": 1.1736465692520142, + "learning_rate": 4.981759562179611e-06, + "loss": 1.1378, + "step": 544 + }, + { + "epoch": 0.28087957395636487, + "grad_norm": 1.273497462272644, + "learning_rate": 4.981677406142391e-06, + "loss": 1.1159, + "step": 545 + }, + { + "epoch": 0.28139494932142245, + "grad_norm": 1.2971827983856201, + "learning_rate": 4.981595066183268e-06, + "loss": 1.1851, + "step": 546 + }, + { + "epoch": 0.28191032468648, + "grad_norm": 1.309173822402954, + "learning_rate": 4.981512542308344e-06, + "loss": 1.1604, + "step": 547 + }, + { + "epoch": 0.28242570005153755, + "grad_norm": 1.2924010753631592, + "learning_rate": 4.9814298345237364e-06, + "loss": 1.1017, + "step": 548 + }, + { + "epoch": 0.2829410754165951, + "grad_norm": 1.2045514583587646, + "learning_rate": 4.981346942835573e-06, + "loss": 1.1712, + "step": 549 + }, + { + "epoch": 0.28345645078165266, + "grad_norm": 1.2449471950531006, + "learning_rate": 4.981263867249998e-06, + "loss": 1.1483, + "step": 550 + }, + { + "epoch": 0.2839718261467102, + "grad_norm": 1.2177159786224365, + "learning_rate": 4.981180607773167e-06, + "loss": 1.1609, + "step": 551 + }, + { + "epoch": 0.28448720151176776, + "grad_norm": 1.238996982574463, + "learning_rate": 4.981097164411254e-06, + "loss": 1.1247, + "step": 552 + }, + { + "epoch": 0.2850025768768253, + "grad_norm": 1.233088731765747, + "learning_rate": 4.98101353717044e-06, + "loss": 1.1483, + "step": 553 + }, + { + "epoch": 0.2855179522418828, + "grad_norm": 1.1626131534576416, + "learning_rate": 4.980929726056922e-06, + "loss": 1.1444, + "step": 554 + }, + { + "epoch": 0.2860333276069404, + "grad_norm": 1.193416714668274, + "learning_rate": 4.980845731076915e-06, + "loss": 1.1958, + "step": 555 + }, + { + "epoch": 0.2865487029719979, + "grad_norm": 1.2389411926269531, + "learning_rate": 4.980761552236641e-06, + "loss": 1.1193, + "step": 556 + }, + { + "epoch": 0.2870640783370555, + "grad_norm": 1.2497589588165283, + "learning_rate": 4.98067718954234e-06, + "loss": 1.1505, + "step": 557 + }, + { + "epoch": 0.287579453702113, + "grad_norm": 1.3262685537338257, + "learning_rate": 4.980592643000264e-06, + "loss": 1.1793, + "step": 558 + }, + { + "epoch": 0.2880948290671706, + "grad_norm": 1.2589205503463745, + "learning_rate": 4.980507912616678e-06, + "loss": 1.1485, + "step": 559 + }, + { + "epoch": 0.2886102044322281, + "grad_norm": 1.1957213878631592, + "learning_rate": 4.980422998397864e-06, + "loss": 1.1295, + "step": 560 + }, + { + "epoch": 0.2891255797972857, + "grad_norm": 1.1982243061065674, + "learning_rate": 4.980337900350112e-06, + "loss": 1.1534, + "step": 561 + }, + { + "epoch": 0.2896409551623432, + "grad_norm": 1.2490700483322144, + "learning_rate": 4.980252618479731e-06, + "loss": 1.1792, + "step": 562 + }, + { + "epoch": 0.2901563305274008, + "grad_norm": 1.2109037637710571, + "learning_rate": 4.980167152793041e-06, + "loss": 1.0959, + "step": 563 + }, + { + "epoch": 0.29067170589245833, + "grad_norm": 1.177827000617981, + "learning_rate": 4.9800815032963755e-06, + "loss": 1.1518, + "step": 564 + }, + { + "epoch": 0.2911870812575159, + "grad_norm": 1.2427681684494019, + "learning_rate": 4.979995669996083e-06, + "loss": 1.1479, + "step": 565 + }, + { + "epoch": 0.29170245662257344, + "grad_norm": 1.17928147315979, + "learning_rate": 4.9799096528985235e-06, + "loss": 1.1189, + "step": 566 + }, + { + "epoch": 0.292217831987631, + "grad_norm": 1.31520676612854, + "learning_rate": 4.979823452010073e-06, + "loss": 1.1245, + "step": 567 + }, + { + "epoch": 0.29273320735268854, + "grad_norm": 1.3166396617889404, + "learning_rate": 4.979737067337119e-06, + "loss": 1.1499, + "step": 568 + }, + { + "epoch": 0.29324858271774606, + "grad_norm": 1.226168155670166, + "learning_rate": 4.979650498886065e-06, + "loss": 1.1441, + "step": 569 + }, + { + "epoch": 0.29376395808280364, + "grad_norm": 1.2373491525650024, + "learning_rate": 4.979563746663326e-06, + "loss": 1.1841, + "step": 570 + }, + { + "epoch": 0.29427933344786117, + "grad_norm": 1.2169537544250488, + "learning_rate": 4.979476810675331e-06, + "loss": 1.1627, + "step": 571 + }, + { + "epoch": 0.29479470881291875, + "grad_norm": 1.20741605758667, + "learning_rate": 4.979389690928524e-06, + "loss": 1.1586, + "step": 572 + }, + { + "epoch": 0.2953100841779763, + "grad_norm": 1.2474502325057983, + "learning_rate": 4.97930238742936e-06, + "loss": 1.1497, + "step": 573 + }, + { + "epoch": 0.29582545954303385, + "grad_norm": 1.2915810346603394, + "learning_rate": 4.979214900184311e-06, + "loss": 1.1545, + "step": 574 + }, + { + "epoch": 0.2963408349080914, + "grad_norm": 1.136528491973877, + "learning_rate": 4.97912722919986e-06, + "loss": 1.1371, + "step": 575 + }, + { + "epoch": 0.29685621027314896, + "grad_norm": 1.2052608728408813, + "learning_rate": 4.979039374482504e-06, + "loss": 1.1598, + "step": 576 + }, + { + "epoch": 0.2973715856382065, + "grad_norm": 1.2963969707489014, + "learning_rate": 4.978951336038756e-06, + "loss": 1.1278, + "step": 577 + }, + { + "epoch": 0.29788696100326406, + "grad_norm": 1.3299596309661865, + "learning_rate": 4.978863113875138e-06, + "loss": 1.1455, + "step": 578 + }, + { + "epoch": 0.2984023363683216, + "grad_norm": 1.178487777709961, + "learning_rate": 4.978774707998189e-06, + "loss": 1.1572, + "step": 579 + }, + { + "epoch": 0.29891771173337917, + "grad_norm": 1.4150831699371338, + "learning_rate": 4.978686118414463e-06, + "loss": 1.2016, + "step": 580 + }, + { + "epoch": 0.2994330870984367, + "grad_norm": 1.1844868659973145, + "learning_rate": 4.978597345130523e-06, + "loss": 1.1767, + "step": 581 + }, + { + "epoch": 0.29994846246349427, + "grad_norm": 1.2888526916503906, + "learning_rate": 4.97850838815295e-06, + "loss": 1.1512, + "step": 582 + }, + { + "epoch": 0.3004638378285518, + "grad_norm": 1.3352088928222656, + "learning_rate": 4.978419247488335e-06, + "loss": 1.1527, + "step": 583 + }, + { + "epoch": 0.3009792131936093, + "grad_norm": 1.26283860206604, + "learning_rate": 4.978329923143287e-06, + "loss": 1.1724, + "step": 584 + }, + { + "epoch": 0.3014945885586669, + "grad_norm": 1.182499885559082, + "learning_rate": 4.978240415124424e-06, + "loss": 1.0651, + "step": 585 + }, + { + "epoch": 0.3020099639237244, + "grad_norm": 1.3811864852905273, + "learning_rate": 4.9781507234383795e-06, + "loss": 1.0996, + "step": 586 + }, + { + "epoch": 0.302525339288782, + "grad_norm": 1.2513461112976074, + "learning_rate": 4.9780608480918015e-06, + "loss": 1.222, + "step": 587 + }, + { + "epoch": 0.3030407146538395, + "grad_norm": 1.345369815826416, + "learning_rate": 4.977970789091349e-06, + "loss": 1.1866, + "step": 588 + }, + { + "epoch": 0.3035560900188971, + "grad_norm": 1.1981523036956787, + "learning_rate": 4.977880546443701e-06, + "loss": 1.0942, + "step": 589 + }, + { + "epoch": 0.30407146538395463, + "grad_norm": 1.2694369554519653, + "learning_rate": 4.97779012015554e-06, + "loss": 1.1449, + "step": 590 + }, + { + "epoch": 0.3045868407490122, + "grad_norm": 1.249166488647461, + "learning_rate": 4.977699510233573e-06, + "loss": 1.1193, + "step": 591 + }, + { + "epoch": 0.30510221611406974, + "grad_norm": 1.222267746925354, + "learning_rate": 4.97760871668451e-06, + "loss": 1.1391, + "step": 592 + }, + { + "epoch": 0.3056175914791273, + "grad_norm": 1.3113726377487183, + "learning_rate": 4.977517739515083e-06, + "loss": 1.1619, + "step": 593 + }, + { + "epoch": 0.30613296684418484, + "grad_norm": 1.276493787765503, + "learning_rate": 4.977426578732035e-06, + "loss": 1.1728, + "step": 594 + }, + { + "epoch": 0.3066483422092424, + "grad_norm": 1.23202383518219, + "learning_rate": 4.97733523434212e-06, + "loss": 1.0695, + "step": 595 + }, + { + "epoch": 0.30716371757429994, + "grad_norm": 1.2622519731521606, + "learning_rate": 4.977243706352109e-06, + "loss": 1.1807, + "step": 596 + }, + { + "epoch": 0.3076790929393575, + "grad_norm": 1.2572592496871948, + "learning_rate": 4.977151994768785e-06, + "loss": 1.1816, + "step": 597 + }, + { + "epoch": 0.30819446830441505, + "grad_norm": 1.232765555381775, + "learning_rate": 4.977060099598945e-06, + "loss": 1.1402, + "step": 598 + }, + { + "epoch": 0.3087098436694726, + "grad_norm": 1.2640727758407593, + "learning_rate": 4.9769680208494e-06, + "loss": 1.1249, + "step": 599 + }, + { + "epoch": 0.30922521903453015, + "grad_norm": 1.138247013092041, + "learning_rate": 4.976875758526973e-06, + "loss": 1.1192, + "step": 600 + }, + { + "epoch": 0.3097405943995877, + "grad_norm": 1.3037362098693848, + "learning_rate": 4.976783312638503e-06, + "loss": 1.2068, + "step": 601 + }, + { + "epoch": 0.31025596976464526, + "grad_norm": 1.187371850013733, + "learning_rate": 4.976690683190839e-06, + "loss": 1.1223, + "step": 602 + }, + { + "epoch": 0.3107713451297028, + "grad_norm": 1.3030028343200684, + "learning_rate": 4.976597870190849e-06, + "loss": 1.1401, + "step": 603 + }, + { + "epoch": 0.31128672049476036, + "grad_norm": 1.1680561304092407, + "learning_rate": 4.976504873645411e-06, + "loss": 1.1077, + "step": 604 + }, + { + "epoch": 0.3118020958598179, + "grad_norm": 1.2486811876296997, + "learning_rate": 4.9764116935614146e-06, + "loss": 1.1878, + "step": 605 + }, + { + "epoch": 0.31231747122487546, + "grad_norm": 1.2635382413864136, + "learning_rate": 4.976318329945768e-06, + "loss": 1.1504, + "step": 606 + }, + { + "epoch": 0.312832846589933, + "grad_norm": 1.1843292713165283, + "learning_rate": 4.976224782805391e-06, + "loss": 1.2165, + "step": 607 + }, + { + "epoch": 0.31334822195499057, + "grad_norm": 1.2353806495666504, + "learning_rate": 4.9761310521472136e-06, + "loss": 1.0973, + "step": 608 + }, + { + "epoch": 0.3138635973200481, + "grad_norm": 1.2707849740982056, + "learning_rate": 4.976037137978186e-06, + "loss": 1.2043, + "step": 609 + }, + { + "epoch": 0.3143789726851057, + "grad_norm": 1.1960488557815552, + "learning_rate": 4.975943040305266e-06, + "loss": 1.1002, + "step": 610 + }, + { + "epoch": 0.3148943480501632, + "grad_norm": 1.2896122932434082, + "learning_rate": 4.975848759135427e-06, + "loss": 1.1557, + "step": 611 + }, + { + "epoch": 0.3154097234152208, + "grad_norm": 1.3175156116485596, + "learning_rate": 4.9757542944756576e-06, + "loss": 1.1808, + "step": 612 + }, + { + "epoch": 0.3159250987802783, + "grad_norm": 1.284606695175171, + "learning_rate": 4.9756596463329585e-06, + "loss": 1.1451, + "step": 613 + }, + { + "epoch": 0.3164404741453358, + "grad_norm": 1.1585352420806885, + "learning_rate": 4.975564814714345e-06, + "loss": 1.1862, + "step": 614 + }, + { + "epoch": 0.3169558495103934, + "grad_norm": 1.1436694860458374, + "learning_rate": 4.975469799626844e-06, + "loss": 1.0628, + "step": 615 + }, + { + "epoch": 0.31747122487545093, + "grad_norm": 1.1853829622268677, + "learning_rate": 4.975374601077497e-06, + "loss": 1.1256, + "step": 616 + }, + { + "epoch": 0.3179866002405085, + "grad_norm": 1.2337476015090942, + "learning_rate": 4.97527921907336e-06, + "loss": 1.1067, + "step": 617 + }, + { + "epoch": 0.31850197560556603, + "grad_norm": 1.2059632539749146, + "learning_rate": 4.975183653621503e-06, + "loss": 1.1379, + "step": 618 + }, + { + "epoch": 0.3190173509706236, + "grad_norm": 1.193901777267456, + "learning_rate": 4.975087904729007e-06, + "loss": 1.1349, + "step": 619 + }, + { + "epoch": 0.31953272633568114, + "grad_norm": 1.285905361175537, + "learning_rate": 4.974991972402969e-06, + "loss": 1.1368, + "step": 620 + }, + { + "epoch": 0.3200481017007387, + "grad_norm": 1.2497121095657349, + "learning_rate": 4.974895856650498e-06, + "loss": 1.1165, + "step": 621 + }, + { + "epoch": 0.32056347706579624, + "grad_norm": 1.208838939666748, + "learning_rate": 4.974799557478718e-06, + "loss": 1.1446, + "step": 622 + }, + { + "epoch": 0.3210788524308538, + "grad_norm": 1.3200178146362305, + "learning_rate": 4.974703074894766e-06, + "loss": 1.1881, + "step": 623 + }, + { + "epoch": 0.32159422779591135, + "grad_norm": 1.2912616729736328, + "learning_rate": 4.974606408905791e-06, + "loss": 1.1516, + "step": 624 + }, + { + "epoch": 0.3221096031609689, + "grad_norm": 1.2733863592147827, + "learning_rate": 4.9745095595189585e-06, + "loss": 1.1073, + "step": 625 + }, + { + "epoch": 0.32262497852602645, + "grad_norm": 1.2568449974060059, + "learning_rate": 4.974412526741447e-06, + "loss": 1.1035, + "step": 626 + }, + { + "epoch": 0.32314035389108403, + "grad_norm": 1.3438385725021362, + "learning_rate": 4.974315310580445e-06, + "loss": 1.1347, + "step": 627 + }, + { + "epoch": 0.32365572925614156, + "grad_norm": 1.2689323425292969, + "learning_rate": 4.974217911043161e-06, + "loss": 1.16, + "step": 628 + }, + { + "epoch": 0.3241711046211991, + "grad_norm": 1.2674888372421265, + "learning_rate": 4.97412032813681e-06, + "loss": 1.1229, + "step": 629 + }, + { + "epoch": 0.32468647998625666, + "grad_norm": 1.2539578676223755, + "learning_rate": 4.9740225618686265e-06, + "loss": 1.1325, + "step": 630 + }, + { + "epoch": 0.3252018553513142, + "grad_norm": 1.2101805210113525, + "learning_rate": 4.973924612245855e-06, + "loss": 1.126, + "step": 631 + }, + { + "epoch": 0.32571723071637176, + "grad_norm": 1.2297779321670532, + "learning_rate": 4.9738264792757555e-06, + "loss": 1.1918, + "step": 632 + }, + { + "epoch": 0.3262326060814293, + "grad_norm": 1.3601030111312866, + "learning_rate": 4.9737281629655996e-06, + "loss": 1.1677, + "step": 633 + }, + { + "epoch": 0.32674798144648687, + "grad_norm": 1.2321043014526367, + "learning_rate": 4.973629663322675e-06, + "loss": 1.1329, + "step": 634 + }, + { + "epoch": 0.3272633568115444, + "grad_norm": 1.2176779508590698, + "learning_rate": 4.973530980354281e-06, + "loss": 1.1136, + "step": 635 + }, + { + "epoch": 0.327778732176602, + "grad_norm": 1.2674428224563599, + "learning_rate": 4.973432114067731e-06, + "loss": 1.126, + "step": 636 + }, + { + "epoch": 0.3282941075416595, + "grad_norm": 1.1574589014053345, + "learning_rate": 4.973333064470352e-06, + "loss": 1.1402, + "step": 637 + }, + { + "epoch": 0.3288094829067171, + "grad_norm": 1.230744481086731, + "learning_rate": 4.973233831569487e-06, + "loss": 1.1292, + "step": 638 + }, + { + "epoch": 0.3293248582717746, + "grad_norm": 1.3396669626235962, + "learning_rate": 4.973134415372488e-06, + "loss": 1.1737, + "step": 639 + }, + { + "epoch": 0.3298402336368322, + "grad_norm": 1.2492640018463135, + "learning_rate": 4.973034815886723e-06, + "loss": 1.0871, + "step": 640 + }, + { + "epoch": 0.3303556090018897, + "grad_norm": 1.348250150680542, + "learning_rate": 4.9729350331195746e-06, + "loss": 1.1464, + "step": 641 + }, + { + "epoch": 0.3308709843669473, + "grad_norm": 1.226974606513977, + "learning_rate": 4.972835067078436e-06, + "loss": 1.1298, + "step": 642 + }, + { + "epoch": 0.3313863597320048, + "grad_norm": 1.2675280570983887, + "learning_rate": 4.972734917770719e-06, + "loss": 1.0708, + "step": 643 + }, + { + "epoch": 0.33190173509706233, + "grad_norm": 1.2955893278121948, + "learning_rate": 4.972634585203843e-06, + "loss": 1.0924, + "step": 644 + }, + { + "epoch": 0.3324171104621199, + "grad_norm": 1.290199875831604, + "learning_rate": 4.972534069385245e-06, + "loss": 1.1479, + "step": 645 + }, + { + "epoch": 0.33293248582717744, + "grad_norm": 1.3217016458511353, + "learning_rate": 4.9724333703223746e-06, + "loss": 1.1328, + "step": 646 + }, + { + "epoch": 0.333447861192235, + "grad_norm": 1.355234980583191, + "learning_rate": 4.9723324880226945e-06, + "loss": 1.1553, + "step": 647 + }, + { + "epoch": 0.33396323655729254, + "grad_norm": 1.3599210977554321, + "learning_rate": 4.972231422493681e-06, + "loss": 1.1859, + "step": 648 + }, + { + "epoch": 0.3344786119223501, + "grad_norm": 1.2493696212768555, + "learning_rate": 4.972130173742825e-06, + "loss": 1.1226, + "step": 649 + }, + { + "epoch": 0.33499398728740765, + "grad_norm": 1.2313398122787476, + "learning_rate": 4.9720287417776296e-06, + "loss": 1.1063, + "step": 650 + }, + { + "epoch": 0.3355093626524652, + "grad_norm": 1.299560546875, + "learning_rate": 4.971927126605613e-06, + "loss": 1.1341, + "step": 651 + }, + { + "epoch": 0.33602473801752275, + "grad_norm": 1.2997859716415405, + "learning_rate": 4.9718253282343045e-06, + "loss": 1.1745, + "step": 652 + }, + { + "epoch": 0.33654011338258033, + "grad_norm": 1.2912580966949463, + "learning_rate": 4.971723346671249e-06, + "loss": 1.1192, + "step": 653 + }, + { + "epoch": 0.33705548874763785, + "grad_norm": 1.2521673440933228, + "learning_rate": 4.971621181924006e-06, + "loss": 1.1352, + "step": 654 + }, + { + "epoch": 0.33757086411269543, + "grad_norm": 1.1946815252304077, + "learning_rate": 4.971518834000147e-06, + "loss": 1.1158, + "step": 655 + }, + { + "epoch": 0.33808623947775296, + "grad_norm": 1.3462064266204834, + "learning_rate": 4.971416302907255e-06, + "loss": 1.1664, + "step": 656 + }, + { + "epoch": 0.33860161484281054, + "grad_norm": 1.317471981048584, + "learning_rate": 4.97131358865293e-06, + "loss": 1.1294, + "step": 657 + }, + { + "epoch": 0.33911699020786806, + "grad_norm": 1.2769732475280762, + "learning_rate": 4.971210691244786e-06, + "loss": 1.1233, + "step": 658 + }, + { + "epoch": 0.3396323655729256, + "grad_norm": 1.2474626302719116, + "learning_rate": 4.971107610690445e-06, + "loss": 1.1829, + "step": 659 + }, + { + "epoch": 0.34014774093798317, + "grad_norm": 1.2211552858352661, + "learning_rate": 4.971004346997551e-06, + "loss": 1.135, + "step": 660 + }, + { + "epoch": 0.3406631163030407, + "grad_norm": 1.2152830362319946, + "learning_rate": 4.970900900173755e-06, + "loss": 1.1382, + "step": 661 + }, + { + "epoch": 0.34117849166809827, + "grad_norm": 1.2274240255355835, + "learning_rate": 4.970797270226722e-06, + "loss": 1.1046, + "step": 662 + }, + { + "epoch": 0.3416938670331558, + "grad_norm": 1.194387435913086, + "learning_rate": 4.9706934571641355e-06, + "loss": 1.0675, + "step": 663 + }, + { + "epoch": 0.3422092423982134, + "grad_norm": 1.2856234312057495, + "learning_rate": 4.970589460993687e-06, + "loss": 1.1626, + "step": 664 + }, + { + "epoch": 0.3427246177632709, + "grad_norm": 1.327237606048584, + "learning_rate": 4.970485281723085e-06, + "loss": 1.1075, + "step": 665 + }, + { + "epoch": 0.3432399931283285, + "grad_norm": 1.2260476350784302, + "learning_rate": 4.97038091936005e-06, + "loss": 1.1481, + "step": 666 + }, + { + "epoch": 0.343755368493386, + "grad_norm": 1.3910338878631592, + "learning_rate": 4.970276373912316e-06, + "loss": 1.183, + "step": 667 + }, + { + "epoch": 0.3442707438584436, + "grad_norm": 1.2876691818237305, + "learning_rate": 4.970171645387632e-06, + "loss": 1.1146, + "step": 668 + }, + { + "epoch": 0.3447861192235011, + "grad_norm": 1.2745333909988403, + "learning_rate": 4.970066733793758e-06, + "loss": 1.2011, + "step": 669 + }, + { + "epoch": 0.3453014945885587, + "grad_norm": 1.3226712942123413, + "learning_rate": 4.9699616391384705e-06, + "loss": 1.1331, + "step": 670 + }, + { + "epoch": 0.3458168699536162, + "grad_norm": 1.2185267210006714, + "learning_rate": 4.969856361429559e-06, + "loss": 1.168, + "step": 671 + }, + { + "epoch": 0.3463322453186738, + "grad_norm": 1.243064045906067, + "learning_rate": 4.969750900674825e-06, + "loss": 1.126, + "step": 672 + }, + { + "epoch": 0.3468476206837313, + "grad_norm": 1.2364131212234497, + "learning_rate": 4.969645256882083e-06, + "loss": 1.119, + "step": 673 + }, + { + "epoch": 0.34736299604878884, + "grad_norm": 1.3395003080368042, + "learning_rate": 4.969539430059165e-06, + "loss": 1.0998, + "step": 674 + }, + { + "epoch": 0.3478783714138464, + "grad_norm": 1.3111714124679565, + "learning_rate": 4.969433420213912e-06, + "loss": 1.1397, + "step": 675 + }, + { + "epoch": 0.34839374677890395, + "grad_norm": 1.2934197187423706, + "learning_rate": 4.969327227354182e-06, + "loss": 1.1013, + "step": 676 + }, + { + "epoch": 0.3489091221439615, + "grad_norm": 1.1807507276535034, + "learning_rate": 4.9692208514878445e-06, + "loss": 1.0706, + "step": 677 + }, + { + "epoch": 0.34942449750901905, + "grad_norm": 1.1788406372070312, + "learning_rate": 4.969114292622783e-06, + "loss": 1.1055, + "step": 678 + }, + { + "epoch": 0.34993987287407663, + "grad_norm": 1.3066070079803467, + "learning_rate": 4.969007550766896e-06, + "loss": 1.174, + "step": 679 + }, + { + "epoch": 0.35045524823913415, + "grad_norm": 1.3041435480117798, + "learning_rate": 4.968900625928092e-06, + "loss": 1.1639, + "step": 680 + }, + { + "epoch": 0.35097062360419173, + "grad_norm": 1.1880706548690796, + "learning_rate": 4.968793518114299e-06, + "loss": 1.1309, + "step": 681 + }, + { + "epoch": 0.35148599896924926, + "grad_norm": 1.2611826658248901, + "learning_rate": 4.968686227333451e-06, + "loss": 1.1353, + "step": 682 + }, + { + "epoch": 0.35200137433430684, + "grad_norm": 1.2815132141113281, + "learning_rate": 4.968578753593501e-06, + "loss": 1.1029, + "step": 683 + }, + { + "epoch": 0.35251674969936436, + "grad_norm": 1.3178620338439941, + "learning_rate": 4.9684710969024155e-06, + "loss": 1.1045, + "step": 684 + }, + { + "epoch": 0.35303212506442194, + "grad_norm": 1.3577544689178467, + "learning_rate": 4.968363257268172e-06, + "loss": 1.1487, + "step": 685 + }, + { + "epoch": 0.35354750042947947, + "grad_norm": 1.3682572841644287, + "learning_rate": 4.968255234698762e-06, + "loss": 1.1279, + "step": 686 + }, + { + "epoch": 0.35406287579453705, + "grad_norm": 1.2696278095245361, + "learning_rate": 4.968147029202193e-06, + "loss": 1.1658, + "step": 687 + }, + { + "epoch": 0.35457825115959457, + "grad_norm": 1.285667896270752, + "learning_rate": 4.968038640786484e-06, + "loss": 1.1614, + "step": 688 + }, + { + "epoch": 0.3550936265246521, + "grad_norm": 1.2449527978897095, + "learning_rate": 4.967930069459665e-06, + "loss": 1.1607, + "step": 689 + }, + { + "epoch": 0.3556090018897097, + "grad_norm": 1.3253920078277588, + "learning_rate": 4.967821315229786e-06, + "loss": 1.1755, + "step": 690 + }, + { + "epoch": 0.3561243772547672, + "grad_norm": 1.2488442659378052, + "learning_rate": 4.9677123781049056e-06, + "loss": 1.1352, + "step": 691 + }, + { + "epoch": 0.3566397526198248, + "grad_norm": 1.1993600130081177, + "learning_rate": 4.967603258093097e-06, + "loss": 1.1264, + "step": 692 + }, + { + "epoch": 0.3571551279848823, + "grad_norm": 1.3461339473724365, + "learning_rate": 4.967493955202448e-06, + "loss": 1.1764, + "step": 693 + }, + { + "epoch": 0.3576705033499399, + "grad_norm": 1.3022563457489014, + "learning_rate": 4.967384469441059e-06, + "loss": 1.1225, + "step": 694 + }, + { + "epoch": 0.3581858787149974, + "grad_norm": 1.2400002479553223, + "learning_rate": 4.967274800817044e-06, + "loss": 1.1201, + "step": 695 + }, + { + "epoch": 0.358701254080055, + "grad_norm": 1.3111238479614258, + "learning_rate": 4.9671649493385306e-06, + "loss": 1.1748, + "step": 696 + }, + { + "epoch": 0.3592166294451125, + "grad_norm": 1.2304924726486206, + "learning_rate": 4.96705491501366e-06, + "loss": 1.1069, + "step": 697 + }, + { + "epoch": 0.3597320048101701, + "grad_norm": 1.2275941371917725, + "learning_rate": 4.9669446978505884e-06, + "loss": 1.1006, + "step": 698 + }, + { + "epoch": 0.3602473801752276, + "grad_norm": 1.2485218048095703, + "learning_rate": 4.966834297857483e-06, + "loss": 1.2003, + "step": 699 + }, + { + "epoch": 0.3607627555402852, + "grad_norm": 1.2599111795425415, + "learning_rate": 4.966723715042526e-06, + "loss": 1.1177, + "step": 700 + }, + { + "epoch": 0.3612781309053427, + "grad_norm": 1.2331079244613647, + "learning_rate": 4.966612949413912e-06, + "loss": 1.1089, + "step": 701 + }, + { + "epoch": 0.3617935062704003, + "grad_norm": 1.2413543462753296, + "learning_rate": 4.966502000979852e-06, + "loss": 1.1406, + "step": 702 + }, + { + "epoch": 0.3623088816354578, + "grad_norm": 1.155291199684143, + "learning_rate": 4.966390869748567e-06, + "loss": 1.0574, + "step": 703 + }, + { + "epoch": 0.36282425700051535, + "grad_norm": 1.1831265687942505, + "learning_rate": 4.966279555728295e-06, + "loss": 1.1267, + "step": 704 + }, + { + "epoch": 0.36333963236557293, + "grad_norm": 1.2939788103103638, + "learning_rate": 4.966168058927283e-06, + "loss": 1.0658, + "step": 705 + }, + { + "epoch": 0.36385500773063045, + "grad_norm": 1.2909181118011475, + "learning_rate": 4.966056379353796e-06, + "loss": 1.1176, + "step": 706 + }, + { + "epoch": 0.36437038309568803, + "grad_norm": 1.2729825973510742, + "learning_rate": 4.96594451701611e-06, + "loss": 1.1081, + "step": 707 + }, + { + "epoch": 0.36488575846074556, + "grad_norm": 1.2541344165802002, + "learning_rate": 4.965832471922517e-06, + "loss": 1.1312, + "step": 708 + }, + { + "epoch": 0.36540113382580314, + "grad_norm": 1.3798422813415527, + "learning_rate": 4.965720244081319e-06, + "loss": 1.2037, + "step": 709 + }, + { + "epoch": 0.36591650919086066, + "grad_norm": 1.2447426319122314, + "learning_rate": 4.965607833500834e-06, + "loss": 1.1166, + "step": 710 + }, + { + "epoch": 0.36643188455591824, + "grad_norm": 1.215897560119629, + "learning_rate": 4.965495240189393e-06, + "loss": 1.1064, + "step": 711 + }, + { + "epoch": 0.36694725992097577, + "grad_norm": 1.2096561193466187, + "learning_rate": 4.965382464155341e-06, + "loss": 1.0328, + "step": 712 + }, + { + "epoch": 0.36746263528603335, + "grad_norm": 1.3168424367904663, + "learning_rate": 4.9652695054070344e-06, + "loss": 1.12, + "step": 713 + }, + { + "epoch": 0.36797801065109087, + "grad_norm": 1.1280862092971802, + "learning_rate": 4.9651563639528465e-06, + "loss": 1.0804, + "step": 714 + }, + { + "epoch": 0.36849338601614845, + "grad_norm": 1.3654303550720215, + "learning_rate": 4.9650430398011626e-06, + "loss": 1.1334, + "step": 715 + }, + { + "epoch": 0.369008761381206, + "grad_norm": 1.2694456577301025, + "learning_rate": 4.96492953296038e-06, + "loss": 1.1135, + "step": 716 + }, + { + "epoch": 0.36952413674626355, + "grad_norm": 1.2896363735198975, + "learning_rate": 4.9648158434389124e-06, + "loss": 1.152, + "step": 717 + }, + { + "epoch": 0.3700395121113211, + "grad_norm": 1.276693344116211, + "learning_rate": 4.9647019712451836e-06, + "loss": 1.1311, + "step": 718 + }, + { + "epoch": 0.3705548874763786, + "grad_norm": 1.407243013381958, + "learning_rate": 4.964587916387635e-06, + "loss": 1.1585, + "step": 719 + }, + { + "epoch": 0.3710702628414362, + "grad_norm": 1.2954100370407104, + "learning_rate": 4.964473678874718e-06, + "loss": 1.1195, + "step": 720 + }, + { + "epoch": 0.3715856382064937, + "grad_norm": 1.2955098152160645, + "learning_rate": 4.9643592587149005e-06, + "loss": 1.016, + "step": 721 + }, + { + "epoch": 0.3721010135715513, + "grad_norm": 1.2117847204208374, + "learning_rate": 4.9642446559166605e-06, + "loss": 1.1082, + "step": 722 + }, + { + "epoch": 0.3726163889366088, + "grad_norm": 1.2178875207901, + "learning_rate": 4.964129870488491e-06, + "loss": 1.1686, + "step": 723 + }, + { + "epoch": 0.3731317643016664, + "grad_norm": 1.27935791015625, + "learning_rate": 4.9640149024389014e-06, + "loss": 1.1193, + "step": 724 + }, + { + "epoch": 0.3736471396667239, + "grad_norm": 1.3828037977218628, + "learning_rate": 4.963899751776411e-06, + "loss": 1.1167, + "step": 725 + }, + { + "epoch": 0.3741625150317815, + "grad_norm": 1.3130462169647217, + "learning_rate": 4.9637844185095545e-06, + "loss": 1.1457, + "step": 726 + }, + { + "epoch": 0.374677890396839, + "grad_norm": 1.2902321815490723, + "learning_rate": 4.963668902646879e-06, + "loss": 1.2095, + "step": 727 + }, + { + "epoch": 0.3751932657618966, + "grad_norm": 1.273491382598877, + "learning_rate": 4.963553204196944e-06, + "loss": 1.1037, + "step": 728 + }, + { + "epoch": 0.3757086411269541, + "grad_norm": 1.422223687171936, + "learning_rate": 4.963437323168326e-06, + "loss": 1.2536, + "step": 729 + }, + { + "epoch": 0.3762240164920117, + "grad_norm": 1.3016620874404907, + "learning_rate": 4.963321259569613e-06, + "loss": 1.2464, + "step": 730 + }, + { + "epoch": 0.37673939185706923, + "grad_norm": 1.3318872451782227, + "learning_rate": 4.963205013409407e-06, + "loss": 1.1456, + "step": 731 + }, + { + "epoch": 0.3772547672221268, + "grad_norm": 1.3237916231155396, + "learning_rate": 4.963088584696322e-06, + "loss": 1.0936, + "step": 732 + }, + { + "epoch": 0.37777014258718433, + "grad_norm": 1.2826303243637085, + "learning_rate": 4.962971973438988e-06, + "loss": 1.1412, + "step": 733 + }, + { + "epoch": 0.37828551795224186, + "grad_norm": 1.254653811454773, + "learning_rate": 4.962855179646046e-06, + "loss": 1.1696, + "step": 734 + }, + { + "epoch": 0.37880089331729944, + "grad_norm": 1.294478416442871, + "learning_rate": 4.962738203326153e-06, + "loss": 1.1704, + "step": 735 + }, + { + "epoch": 0.37931626868235696, + "grad_norm": 1.2368558645248413, + "learning_rate": 4.962621044487978e-06, + "loss": 1.1011, + "step": 736 + }, + { + "epoch": 0.37983164404741454, + "grad_norm": 1.2399348020553589, + "learning_rate": 4.962503703140203e-06, + "loss": 1.1748, + "step": 737 + }, + { + "epoch": 0.38034701941247206, + "grad_norm": 1.3496227264404297, + "learning_rate": 4.962386179291526e-06, + "loss": 1.1774, + "step": 738 + }, + { + "epoch": 0.38086239477752964, + "grad_norm": 1.2116947174072266, + "learning_rate": 4.9622684729506555e-06, + "loss": 1.13, + "step": 739 + }, + { + "epoch": 0.38137777014258717, + "grad_norm": 1.3135943412780762, + "learning_rate": 4.9621505841263155e-06, + "loss": 1.172, + "step": 740 + }, + { + "epoch": 0.38189314550764475, + "grad_norm": 1.275046467781067, + "learning_rate": 4.962032512827243e-06, + "loss": 1.1179, + "step": 741 + }, + { + "epoch": 0.3824085208727023, + "grad_norm": 1.3962445259094238, + "learning_rate": 4.961914259062189e-06, + "loss": 1.1301, + "step": 742 + }, + { + "epoch": 0.38292389623775985, + "grad_norm": 1.3349716663360596, + "learning_rate": 4.961795822839917e-06, + "loss": 1.1139, + "step": 743 + }, + { + "epoch": 0.3834392716028174, + "grad_norm": 1.3125500679016113, + "learning_rate": 4.9616772041692035e-06, + "loss": 1.1209, + "step": 744 + }, + { + "epoch": 0.38395464696787496, + "grad_norm": 1.3276808261871338, + "learning_rate": 4.961558403058841e-06, + "loss": 1.115, + "step": 745 + }, + { + "epoch": 0.3844700223329325, + "grad_norm": 1.3487229347229004, + "learning_rate": 4.961439419517634e-06, + "loss": 1.1177, + "step": 746 + }, + { + "epoch": 0.38498539769799006, + "grad_norm": 1.262861967086792, + "learning_rate": 4.9613202535544e-06, + "loss": 1.1196, + "step": 747 + }, + { + "epoch": 0.3855007730630476, + "grad_norm": 1.347224473953247, + "learning_rate": 4.961200905177971e-06, + "loss": 1.1486, + "step": 748 + }, + { + "epoch": 0.3860161484281051, + "grad_norm": 1.3654824495315552, + "learning_rate": 4.961081374397192e-06, + "loss": 1.1499, + "step": 749 + }, + { + "epoch": 0.3865315237931627, + "grad_norm": 1.2931691408157349, + "learning_rate": 4.960961661220921e-06, + "loss": 1.1048, + "step": 750 + }, + { + "epoch": 0.3870468991582202, + "grad_norm": 1.3167107105255127, + "learning_rate": 4.960841765658032e-06, + "loss": 1.1344, + "step": 751 + }, + { + "epoch": 0.3875622745232778, + "grad_norm": 1.2615445852279663, + "learning_rate": 4.960721687717408e-06, + "loss": 1.0626, + "step": 752 + }, + { + "epoch": 0.3880776498883353, + "grad_norm": 1.3376338481903076, + "learning_rate": 4.960601427407952e-06, + "loss": 1.1224, + "step": 753 + }, + { + "epoch": 0.3885930252533929, + "grad_norm": 1.3246850967407227, + "learning_rate": 4.960480984738573e-06, + "loss": 1.1603, + "step": 754 + }, + { + "epoch": 0.3891084006184504, + "grad_norm": 1.2683501243591309, + "learning_rate": 4.960360359718198e-06, + "loss": 1.0908, + "step": 755 + }, + { + "epoch": 0.389623775983508, + "grad_norm": 1.259310007095337, + "learning_rate": 4.960239552355769e-06, + "loss": 1.141, + "step": 756 + }, + { + "epoch": 0.3901391513485655, + "grad_norm": 1.2660895586013794, + "learning_rate": 4.960118562660236e-06, + "loss": 1.1557, + "step": 757 + }, + { + "epoch": 0.3906545267136231, + "grad_norm": 1.3115184307098389, + "learning_rate": 4.959997390640569e-06, + "loss": 1.1707, + "step": 758 + }, + { + "epoch": 0.39116990207868063, + "grad_norm": 1.3106032609939575, + "learning_rate": 4.959876036305746e-06, + "loss": 1.1481, + "step": 759 + }, + { + "epoch": 0.3916852774437382, + "grad_norm": 1.3089869022369385, + "learning_rate": 4.959754499664762e-06, + "loss": 1.1248, + "step": 760 + }, + { + "epoch": 0.39220065280879574, + "grad_norm": 1.2966090440750122, + "learning_rate": 4.9596327807266245e-06, + "loss": 1.1252, + "step": 761 + }, + { + "epoch": 0.3927160281738533, + "grad_norm": 1.178955316543579, + "learning_rate": 4.959510879500352e-06, + "loss": 1.1103, + "step": 762 + }, + { + "epoch": 0.39323140353891084, + "grad_norm": 1.309419870376587, + "learning_rate": 4.959388795994982e-06, + "loss": 1.1676, + "step": 763 + }, + { + "epoch": 0.39374677890396836, + "grad_norm": 1.2507402896881104, + "learning_rate": 4.959266530219561e-06, + "loss": 1.1411, + "step": 764 + }, + { + "epoch": 0.39426215426902594, + "grad_norm": 1.3529011011123657, + "learning_rate": 4.959144082183149e-06, + "loss": 1.1412, + "step": 765 + }, + { + "epoch": 0.39477752963408347, + "grad_norm": 1.3554461002349854, + "learning_rate": 4.959021451894823e-06, + "loss": 1.0517, + "step": 766 + }, + { + "epoch": 0.39529290499914105, + "grad_norm": 1.2238177061080933, + "learning_rate": 4.958898639363671e-06, + "loss": 1.1235, + "step": 767 + }, + { + "epoch": 0.3958082803641986, + "grad_norm": 1.3070563077926636, + "learning_rate": 4.958775644598793e-06, + "loss": 1.1145, + "step": 768 + }, + { + "epoch": 0.39632365572925615, + "grad_norm": 1.3427777290344238, + "learning_rate": 4.958652467609307e-06, + "loss": 1.1272, + "step": 769 + }, + { + "epoch": 0.3968390310943137, + "grad_norm": 1.2721115350723267, + "learning_rate": 4.95852910840434e-06, + "loss": 1.1605, + "step": 770 + }, + { + "epoch": 0.39735440645937126, + "grad_norm": 1.2291545867919922, + "learning_rate": 4.958405566993035e-06, + "loss": 1.1183, + "step": 771 + }, + { + "epoch": 0.3978697818244288, + "grad_norm": 1.2380768060684204, + "learning_rate": 4.958281843384549e-06, + "loss": 1.1272, + "step": 772 + }, + { + "epoch": 0.39838515718948636, + "grad_norm": 1.2982110977172852, + "learning_rate": 4.958157937588049e-06, + "loss": 1.106, + "step": 773 + }, + { + "epoch": 0.3989005325545439, + "grad_norm": 1.3379300832748413, + "learning_rate": 4.958033849612719e-06, + "loss": 1.155, + "step": 774 + }, + { + "epoch": 0.39941590791960147, + "grad_norm": 1.3295038938522339, + "learning_rate": 4.957909579467757e-06, + "loss": 1.1161, + "step": 775 + }, + { + "epoch": 0.399931283284659, + "grad_norm": 1.2976667881011963, + "learning_rate": 4.9577851271623704e-06, + "loss": 1.1668, + "step": 776 + }, + { + "epoch": 0.40044665864971657, + "grad_norm": 1.2517850399017334, + "learning_rate": 4.9576604927057845e-06, + "loss": 1.0963, + "step": 777 + }, + { + "epoch": 0.4009620340147741, + "grad_norm": 1.291367530822754, + "learning_rate": 4.957535676107235e-06, + "loss": 1.1271, + "step": 778 + }, + { + "epoch": 0.4014774093798316, + "grad_norm": 1.269709587097168, + "learning_rate": 4.957410677375972e-06, + "loss": 1.0947, + "step": 779 + }, + { + "epoch": 0.4019927847448892, + "grad_norm": 1.3467859029769897, + "learning_rate": 4.957285496521261e-06, + "loss": 1.1048, + "step": 780 + }, + { + "epoch": 0.4025081601099467, + "grad_norm": 1.2900623083114624, + "learning_rate": 4.957160133552378e-06, + "loss": 1.1626, + "step": 781 + }, + { + "epoch": 0.4030235354750043, + "grad_norm": 1.2303043603897095, + "learning_rate": 4.957034588478615e-06, + "loss": 1.1012, + "step": 782 + }, + { + "epoch": 0.4035389108400618, + "grad_norm": 1.327517032623291, + "learning_rate": 4.956908861309275e-06, + "loss": 1.1346, + "step": 783 + }, + { + "epoch": 0.4040542862051194, + "grad_norm": 1.3691375255584717, + "learning_rate": 4.956782952053678e-06, + "loss": 1.1497, + "step": 784 + }, + { + "epoch": 0.40456966157017693, + "grad_norm": 1.3795244693756104, + "learning_rate": 4.956656860721153e-06, + "loss": 1.1014, + "step": 785 + }, + { + "epoch": 0.4050850369352345, + "grad_norm": 1.3545103073120117, + "learning_rate": 4.956530587321046e-06, + "loss": 1.1772, + "step": 786 + }, + { + "epoch": 0.40560041230029203, + "grad_norm": 1.2767109870910645, + "learning_rate": 4.956404131862715e-06, + "loss": 1.1162, + "step": 787 + }, + { + "epoch": 0.4061157876653496, + "grad_norm": 1.332641363143921, + "learning_rate": 4.956277494355532e-06, + "loss": 1.0951, + "step": 788 + }, + { + "epoch": 0.40663116303040714, + "grad_norm": 1.45758056640625, + "learning_rate": 4.956150674808883e-06, + "loss": 1.1645, + "step": 789 + }, + { + "epoch": 0.4071465383954647, + "grad_norm": 1.4294954538345337, + "learning_rate": 4.956023673232166e-06, + "loss": 1.1215, + "step": 790 + }, + { + "epoch": 0.40766191376052224, + "grad_norm": 1.3346449136734009, + "learning_rate": 4.955896489634795e-06, + "loss": 1.0929, + "step": 791 + }, + { + "epoch": 0.4081772891255798, + "grad_norm": 1.3155488967895508, + "learning_rate": 4.955769124026193e-06, + "loss": 1.1635, + "step": 792 + }, + { + "epoch": 0.40869266449063735, + "grad_norm": 1.362632155418396, + "learning_rate": 4.955641576415801e-06, + "loss": 1.1359, + "step": 793 + }, + { + "epoch": 0.40920803985569487, + "grad_norm": 1.3555985689163208, + "learning_rate": 4.955513846813072e-06, + "loss": 1.1666, + "step": 794 + }, + { + "epoch": 0.40972341522075245, + "grad_norm": 1.2590464353561401, + "learning_rate": 4.955385935227471e-06, + "loss": 1.1201, + "step": 795 + }, + { + "epoch": 0.41023879058581, + "grad_norm": 1.331552267074585, + "learning_rate": 4.955257841668481e-06, + "loss": 1.1116, + "step": 796 + }, + { + "epoch": 0.41075416595086756, + "grad_norm": 1.262492299079895, + "learning_rate": 4.95512956614559e-06, + "loss": 1.0981, + "step": 797 + }, + { + "epoch": 0.4112695413159251, + "grad_norm": 1.4217736721038818, + "learning_rate": 4.955001108668309e-06, + "loss": 1.1947, + "step": 798 + }, + { + "epoch": 0.41178491668098266, + "grad_norm": 1.3397902250289917, + "learning_rate": 4.954872469246157e-06, + "loss": 1.1093, + "step": 799 + }, + { + "epoch": 0.4123002920460402, + "grad_norm": 1.4462045431137085, + "learning_rate": 4.954743647888667e-06, + "loss": 1.1438, + "step": 800 + }, + { + "epoch": 0.41281566741109776, + "grad_norm": 1.3599969148635864, + "learning_rate": 4.954614644605388e-06, + "loss": 1.1726, + "step": 801 + }, + { + "epoch": 0.4133310427761553, + "grad_norm": 1.3505985736846924, + "learning_rate": 4.954485459405878e-06, + "loss": 1.0917, + "step": 802 + }, + { + "epoch": 0.41384641814121287, + "grad_norm": 1.262619972229004, + "learning_rate": 4.954356092299713e-06, + "loss": 1.1617, + "step": 803 + }, + { + "epoch": 0.4143617935062704, + "grad_norm": 1.3853216171264648, + "learning_rate": 4.954226543296481e-06, + "loss": 1.1576, + "step": 804 + }, + { + "epoch": 0.414877168871328, + "grad_norm": 1.4354254007339478, + "learning_rate": 4.9540968124057826e-06, + "loss": 1.1984, + "step": 805 + }, + { + "epoch": 0.4153925442363855, + "grad_norm": 1.2844696044921875, + "learning_rate": 4.953966899637231e-06, + "loss": 1.1313, + "step": 806 + }, + { + "epoch": 0.4159079196014431, + "grad_norm": 1.290461778640747, + "learning_rate": 4.953836805000457e-06, + "loss": 1.1077, + "step": 807 + }, + { + "epoch": 0.4164232949665006, + "grad_norm": 1.2542216777801514, + "learning_rate": 4.9537065285051e-06, + "loss": 1.1277, + "step": 808 + }, + { + "epoch": 0.4169386703315581, + "grad_norm": 1.2707545757293701, + "learning_rate": 4.953576070160815e-06, + "loss": 1.0971, + "step": 809 + }, + { + "epoch": 0.4174540456966157, + "grad_norm": 1.3372422456741333, + "learning_rate": 4.953445429977272e-06, + "loss": 1.1273, + "step": 810 + }, + { + "epoch": 0.41796942106167323, + "grad_norm": 1.370647668838501, + "learning_rate": 4.953314607964153e-06, + "loss": 1.1665, + "step": 811 + }, + { + "epoch": 0.4184847964267308, + "grad_norm": 1.2144851684570312, + "learning_rate": 4.953183604131151e-06, + "loss": 1.0726, + "step": 812 + }, + { + "epoch": 0.41900017179178833, + "grad_norm": 1.3138102293014526, + "learning_rate": 4.953052418487978e-06, + "loss": 1.2001, + "step": 813 + }, + { + "epoch": 0.4195155471568459, + "grad_norm": 1.2491734027862549, + "learning_rate": 4.952921051044354e-06, + "loss": 1.0989, + "step": 814 + }, + { + "epoch": 0.42003092252190344, + "grad_norm": 1.2666431665420532, + "learning_rate": 4.952789501810017e-06, + "loss": 1.1124, + "step": 815 + }, + { + "epoch": 0.420546297886961, + "grad_norm": 1.3701388835906982, + "learning_rate": 4.952657770794715e-06, + "loss": 1.1148, + "step": 816 + }, + { + "epoch": 0.42106167325201854, + "grad_norm": 1.3455960750579834, + "learning_rate": 4.952525858008211e-06, + "loss": 1.1049, + "step": 817 + }, + { + "epoch": 0.4215770486170761, + "grad_norm": 1.3758878707885742, + "learning_rate": 4.952393763460282e-06, + "loss": 1.1845, + "step": 818 + }, + { + "epoch": 0.42209242398213365, + "grad_norm": 1.3101781606674194, + "learning_rate": 4.952261487160717e-06, + "loss": 1.1154, + "step": 819 + }, + { + "epoch": 0.4226077993471912, + "grad_norm": 1.3576375246047974, + "learning_rate": 4.9521290291193196e-06, + "loss": 1.1373, + "step": 820 + }, + { + "epoch": 0.42312317471224875, + "grad_norm": 1.429225206375122, + "learning_rate": 4.951996389345906e-06, + "loss": 1.1445, + "step": 821 + }, + { + "epoch": 0.42363855007730633, + "grad_norm": 1.511832594871521, + "learning_rate": 4.951863567850307e-06, + "loss": 1.1471, + "step": 822 + }, + { + "epoch": 0.42415392544236385, + "grad_norm": 1.3234866857528687, + "learning_rate": 4.951730564642366e-06, + "loss": 1.1579, + "step": 823 + }, + { + "epoch": 0.4246693008074214, + "grad_norm": 1.3025860786437988, + "learning_rate": 4.951597379731941e-06, + "loss": 1.1451, + "step": 824 + }, + { + "epoch": 0.42518467617247896, + "grad_norm": 1.3261936902999878, + "learning_rate": 4.9514640131289e-06, + "loss": 1.0935, + "step": 825 + }, + { + "epoch": 0.4257000515375365, + "grad_norm": 1.3229318857192993, + "learning_rate": 4.9513304648431306e-06, + "loss": 1.1324, + "step": 826 + }, + { + "epoch": 0.42621542690259406, + "grad_norm": 1.2966307401657104, + "learning_rate": 4.951196734884528e-06, + "loss": 1.1034, + "step": 827 + }, + { + "epoch": 0.4267308022676516, + "grad_norm": 1.2589709758758545, + "learning_rate": 4.951062823263004e-06, + "loss": 1.1125, + "step": 828 + }, + { + "epoch": 0.42724617763270917, + "grad_norm": 1.2551952600479126, + "learning_rate": 4.950928729988483e-06, + "loss": 1.134, + "step": 829 + }, + { + "epoch": 0.4277615529977667, + "grad_norm": 1.4582545757293701, + "learning_rate": 4.950794455070903e-06, + "loss": 1.1099, + "step": 830 + }, + { + "epoch": 0.42827692836282427, + "grad_norm": 1.2585415840148926, + "learning_rate": 4.9506599985202144e-06, + "loss": 1.133, + "step": 831 + }, + { + "epoch": 0.4287923037278818, + "grad_norm": 1.3387125730514526, + "learning_rate": 4.950525360346382e-06, + "loss": 1.0834, + "step": 832 + }, + { + "epoch": 0.4293076790929394, + "grad_norm": 1.356967806816101, + "learning_rate": 4.9503905405593865e-06, + "loss": 1.1399, + "step": 833 + }, + { + "epoch": 0.4298230544579969, + "grad_norm": 1.212633728981018, + "learning_rate": 4.950255539169217e-06, + "loss": 1.1188, + "step": 834 + }, + { + "epoch": 0.4303384298230545, + "grad_norm": 1.3586931228637695, + "learning_rate": 4.9501203561858805e-06, + "loss": 1.1225, + "step": 835 + }, + { + "epoch": 0.430853805188112, + "grad_norm": 1.323394775390625, + "learning_rate": 4.949984991619394e-06, + "loss": 1.1006, + "step": 836 + }, + { + "epoch": 0.4313691805531696, + "grad_norm": 1.360657811164856, + "learning_rate": 4.949849445479791e-06, + "loss": 1.1395, + "step": 837 + }, + { + "epoch": 0.4318845559182271, + "grad_norm": 1.2871806621551514, + "learning_rate": 4.949713717777116e-06, + "loss": 1.1298, + "step": 838 + }, + { + "epoch": 0.43239993128328463, + "grad_norm": 1.318222999572754, + "learning_rate": 4.949577808521429e-06, + "loss": 1.0854, + "step": 839 + }, + { + "epoch": 0.4329153066483422, + "grad_norm": 1.4249770641326904, + "learning_rate": 4.949441717722803e-06, + "loss": 1.1626, + "step": 840 + }, + { + "epoch": 0.43343068201339974, + "grad_norm": 1.3032678365707397, + "learning_rate": 4.949305445391321e-06, + "loss": 1.0744, + "step": 841 + }, + { + "epoch": 0.4339460573784573, + "grad_norm": 1.4344840049743652, + "learning_rate": 4.9491689915370865e-06, + "loss": 1.1501, + "step": 842 + }, + { + "epoch": 0.43446143274351484, + "grad_norm": 1.396562933921814, + "learning_rate": 4.94903235617021e-06, + "loss": 1.0927, + "step": 843 + }, + { + "epoch": 0.4349768081085724, + "grad_norm": 1.4599568843841553, + "learning_rate": 4.948895539300817e-06, + "loss": 1.1006, + "step": 844 + }, + { + "epoch": 0.43549218347362995, + "grad_norm": 1.3650449514389038, + "learning_rate": 4.9487585409390494e-06, + "loss": 1.063, + "step": 845 + }, + { + "epoch": 0.4360075588386875, + "grad_norm": 1.3364917039871216, + "learning_rate": 4.948621361095059e-06, + "loss": 1.1678, + "step": 846 + }, + { + "epoch": 0.43652293420374505, + "grad_norm": 1.3469786643981934, + "learning_rate": 4.948483999779014e-06, + "loss": 1.1068, + "step": 847 + }, + { + "epoch": 0.43703830956880263, + "grad_norm": 1.3044260740280151, + "learning_rate": 4.948346457001092e-06, + "loss": 1.1267, + "step": 848 + }, + { + "epoch": 0.43755368493386015, + "grad_norm": 1.351914405822754, + "learning_rate": 4.948208732771489e-06, + "loss": 1.1117, + "step": 849 + }, + { + "epoch": 0.43806906029891773, + "grad_norm": 1.3472167253494263, + "learning_rate": 4.948070827100411e-06, + "loss": 1.1318, + "step": 850 + }, + { + "epoch": 0.43858443566397526, + "grad_norm": 1.2645403146743774, + "learning_rate": 4.9479327399980775e-06, + "loss": 1.0706, + "step": 851 + }, + { + "epoch": 0.43909981102903284, + "grad_norm": 1.3215440511703491, + "learning_rate": 4.947794471474724e-06, + "loss": 1.1504, + "step": 852 + }, + { + "epoch": 0.43961518639409036, + "grad_norm": 1.308405876159668, + "learning_rate": 4.9476560215405964e-06, + "loss": 1.1974, + "step": 853 + }, + { + "epoch": 0.4401305617591479, + "grad_norm": 1.222504734992981, + "learning_rate": 4.947517390205957e-06, + "loss": 1.1557, + "step": 854 + }, + { + "epoch": 0.44064593712420547, + "grad_norm": 1.300502896308899, + "learning_rate": 4.94737857748108e-06, + "loss": 1.1415, + "step": 855 + }, + { + "epoch": 0.441161312489263, + "grad_norm": 1.3184834718704224, + "learning_rate": 4.947239583376251e-06, + "loss": 1.1552, + "step": 856 + }, + { + "epoch": 0.44167668785432057, + "grad_norm": 1.3779114484786987, + "learning_rate": 4.947100407901773e-06, + "loss": 1.1275, + "step": 857 + }, + { + "epoch": 0.4421920632193781, + "grad_norm": 1.3452224731445312, + "learning_rate": 4.94696105106796e-06, + "loss": 1.1038, + "step": 858 + }, + { + "epoch": 0.4427074385844357, + "grad_norm": 1.2375786304473877, + "learning_rate": 4.946821512885141e-06, + "loss": 1.135, + "step": 859 + }, + { + "epoch": 0.4432228139494932, + "grad_norm": 1.418386459350586, + "learning_rate": 4.946681793363656e-06, + "loss": 1.1499, + "step": 860 + }, + { + "epoch": 0.4437381893145508, + "grad_norm": 1.3137166500091553, + "learning_rate": 4.94654189251386e-06, + "loss": 1.1094, + "step": 861 + }, + { + "epoch": 0.4442535646796083, + "grad_norm": 1.3592449426651, + "learning_rate": 4.9464018103461216e-06, + "loss": 1.1588, + "step": 862 + }, + { + "epoch": 0.4447689400446659, + "grad_norm": 1.3796391487121582, + "learning_rate": 4.946261546870823e-06, + "loss": 1.1628, + "step": 863 + }, + { + "epoch": 0.4452843154097234, + "grad_norm": 1.268865704536438, + "learning_rate": 4.946121102098359e-06, + "loss": 1.0974, + "step": 864 + }, + { + "epoch": 0.445799690774781, + "grad_norm": 1.3379385471343994, + "learning_rate": 4.9459804760391385e-06, + "loss": 1.1612, + "step": 865 + }, + { + "epoch": 0.4463150661398385, + "grad_norm": 1.4032179117202759, + "learning_rate": 4.945839668703583e-06, + "loss": 1.102, + "step": 866 + }, + { + "epoch": 0.4468304415048961, + "grad_norm": 1.3682565689086914, + "learning_rate": 4.945698680102129e-06, + "loss": 1.1517, + "step": 867 + }, + { + "epoch": 0.4473458168699536, + "grad_norm": 1.2709169387817383, + "learning_rate": 4.9455575102452245e-06, + "loss": 1.0855, + "step": 868 + }, + { + "epoch": 0.44786119223501114, + "grad_norm": 1.3753496408462524, + "learning_rate": 4.945416159143332e-06, + "loss": 1.0894, + "step": 869 + }, + { + "epoch": 0.4483765676000687, + "grad_norm": 1.3432855606079102, + "learning_rate": 4.945274626806928e-06, + "loss": 1.1431, + "step": 870 + }, + { + "epoch": 0.44889194296512624, + "grad_norm": 1.214583158493042, + "learning_rate": 4.945132913246501e-06, + "loss": 1.0696, + "step": 871 + }, + { + "epoch": 0.4494073183301838, + "grad_norm": 1.319673776626587, + "learning_rate": 4.944991018472553e-06, + "loss": 1.0505, + "step": 872 + }, + { + "epoch": 0.44992269369524135, + "grad_norm": 1.3592249155044556, + "learning_rate": 4.944848942495603e-06, + "loss": 1.133, + "step": 873 + }, + { + "epoch": 0.45043806906029893, + "grad_norm": 1.2998868227005005, + "learning_rate": 4.944706685326177e-06, + "loss": 1.083, + "step": 874 + }, + { + "epoch": 0.45095344442535645, + "grad_norm": 1.3852722644805908, + "learning_rate": 4.94456424697482e-06, + "loss": 1.0628, + "step": 875 + }, + { + "epoch": 0.45146881979041403, + "grad_norm": 1.280658483505249, + "learning_rate": 4.944421627452088e-06, + "loss": 1.0957, + "step": 876 + }, + { + "epoch": 0.45198419515547156, + "grad_norm": 1.359499216079712, + "learning_rate": 4.9442788267685506e-06, + "loss": 1.1592, + "step": 877 + }, + { + "epoch": 0.45249957052052914, + "grad_norm": 1.2672083377838135, + "learning_rate": 4.944135844934791e-06, + "loss": 1.1968, + "step": 878 + }, + { + "epoch": 0.45301494588558666, + "grad_norm": 1.3375358581542969, + "learning_rate": 4.943992681961406e-06, + "loss": 1.1149, + "step": 879 + }, + { + "epoch": 0.45353032125064424, + "grad_norm": 1.2706693410873413, + "learning_rate": 4.943849337859005e-06, + "loss": 1.1546, + "step": 880 + }, + { + "epoch": 0.45404569661570177, + "grad_norm": 1.268637776374817, + "learning_rate": 4.943705812638212e-06, + "loss": 1.1309, + "step": 881 + }, + { + "epoch": 0.45456107198075935, + "grad_norm": 1.3276478052139282, + "learning_rate": 4.943562106309666e-06, + "loss": 1.0721, + "step": 882 + }, + { + "epoch": 0.45507644734581687, + "grad_norm": 1.2741299867630005, + "learning_rate": 4.943418218884014e-06, + "loss": 1.0749, + "step": 883 + }, + { + "epoch": 0.4555918227108744, + "grad_norm": 1.3312971591949463, + "learning_rate": 4.943274150371921e-06, + "loss": 1.115, + "step": 884 + }, + { + "epoch": 0.456107198075932, + "grad_norm": 1.270114541053772, + "learning_rate": 4.943129900784065e-06, + "loss": 1.1562, + "step": 885 + }, + { + "epoch": 0.4566225734409895, + "grad_norm": 1.3587193489074707, + "learning_rate": 4.942985470131136e-06, + "loss": 1.0775, + "step": 886 + }, + { + "epoch": 0.4571379488060471, + "grad_norm": 1.4444305896759033, + "learning_rate": 4.942840858423838e-06, + "loss": 1.1518, + "step": 887 + }, + { + "epoch": 0.4576533241711046, + "grad_norm": 1.290061354637146, + "learning_rate": 4.942696065672889e-06, + "loss": 1.115, + "step": 888 + }, + { + "epoch": 0.4581686995361622, + "grad_norm": 1.3777953386306763, + "learning_rate": 4.942551091889018e-06, + "loss": 1.1741, + "step": 889 + }, + { + "epoch": 0.4586840749012197, + "grad_norm": 1.272931694984436, + "learning_rate": 4.942405937082971e-06, + "loss": 1.0733, + "step": 890 + }, + { + "epoch": 0.4591994502662773, + "grad_norm": 1.2352920770645142, + "learning_rate": 4.9422606012655065e-06, + "loss": 1.1237, + "step": 891 + }, + { + "epoch": 0.4597148256313348, + "grad_norm": 1.2935038805007935, + "learning_rate": 4.942115084447393e-06, + "loss": 1.1502, + "step": 892 + }, + { + "epoch": 0.4602302009963924, + "grad_norm": 1.3403886556625366, + "learning_rate": 4.941969386639418e-06, + "loss": 1.1346, + "step": 893 + }, + { + "epoch": 0.4607455763614499, + "grad_norm": 1.307370662689209, + "learning_rate": 4.941823507852377e-06, + "loss": 1.0788, + "step": 894 + }, + { + "epoch": 0.4612609517265075, + "grad_norm": 1.2700450420379639, + "learning_rate": 4.941677448097083e-06, + "loss": 1.1311, + "step": 895 + }, + { + "epoch": 0.461776327091565, + "grad_norm": 1.2455085515975952, + "learning_rate": 4.941531207384359e-06, + "loss": 1.085, + "step": 896 + }, + { + "epoch": 0.4622917024566226, + "grad_norm": 1.3723219633102417, + "learning_rate": 4.941384785725046e-06, + "loss": 1.0763, + "step": 897 + }, + { + "epoch": 0.4628070778216801, + "grad_norm": 1.3822237253189087, + "learning_rate": 4.941238183129993e-06, + "loss": 1.1476, + "step": 898 + }, + { + "epoch": 0.46332245318673765, + "grad_norm": 1.3194243907928467, + "learning_rate": 4.941091399610065e-06, + "loss": 1.0974, + "step": 899 + }, + { + "epoch": 0.46383782855179523, + "grad_norm": 1.3286584615707397, + "learning_rate": 4.940944435176143e-06, + "loss": 1.1035, + "step": 900 + }, + { + "epoch": 0.46435320391685275, + "grad_norm": 1.3928455114364624, + "learning_rate": 4.940797289839116e-06, + "loss": 1.1511, + "step": 901 + }, + { + "epoch": 0.46486857928191033, + "grad_norm": 1.3169835805892944, + "learning_rate": 4.940649963609891e-06, + "loss": 1.1494, + "step": 902 + }, + { + "epoch": 0.46538395464696786, + "grad_norm": 1.3504160642623901, + "learning_rate": 4.940502456499384e-06, + "loss": 1.1451, + "step": 903 + }, + { + "epoch": 0.46589933001202544, + "grad_norm": 1.3698813915252686, + "learning_rate": 4.940354768518531e-06, + "loss": 1.1354, + "step": 904 + }, + { + "epoch": 0.46641470537708296, + "grad_norm": 1.4112703800201416, + "learning_rate": 4.940206899678275e-06, + "loss": 1.104, + "step": 905 + }, + { + "epoch": 0.46693008074214054, + "grad_norm": 1.315726399421692, + "learning_rate": 4.9400588499895755e-06, + "loss": 1.0902, + "step": 906 + }, + { + "epoch": 0.46744545610719807, + "grad_norm": 1.2560619115829468, + "learning_rate": 4.939910619463404e-06, + "loss": 1.0996, + "step": 907 + }, + { + "epoch": 0.46796083147225565, + "grad_norm": 1.3877520561218262, + "learning_rate": 4.939762208110747e-06, + "loss": 1.152, + "step": 908 + }, + { + "epoch": 0.46847620683731317, + "grad_norm": 1.3177300691604614, + "learning_rate": 4.939613615942602e-06, + "loss": 1.1251, + "step": 909 + }, + { + "epoch": 0.46899158220237075, + "grad_norm": 1.262856125831604, + "learning_rate": 4.939464842969984e-06, + "loss": 1.0812, + "step": 910 + }, + { + "epoch": 0.4695069575674283, + "grad_norm": 1.3856077194213867, + "learning_rate": 4.939315889203917e-06, + "loss": 1.0771, + "step": 911 + }, + { + "epoch": 0.47002233293248585, + "grad_norm": 1.4262409210205078, + "learning_rate": 4.939166754655441e-06, + "loss": 1.0994, + "step": 912 + }, + { + "epoch": 0.4705377082975434, + "grad_norm": 1.3417930603027344, + "learning_rate": 4.93901743933561e-06, + "loss": 1.1559, + "step": 913 + }, + { + "epoch": 0.4710530836626009, + "grad_norm": 1.364356279373169, + "learning_rate": 4.938867943255486e-06, + "loss": 1.1401, + "step": 914 + }, + { + "epoch": 0.4715684590276585, + "grad_norm": 1.3301299810409546, + "learning_rate": 4.938718266426151e-06, + "loss": 1.1299, + "step": 915 + }, + { + "epoch": 0.472083834392716, + "grad_norm": 1.3044543266296387, + "learning_rate": 4.938568408858699e-06, + "loss": 1.1059, + "step": 916 + }, + { + "epoch": 0.4725992097577736, + "grad_norm": 1.3932881355285645, + "learning_rate": 4.938418370564235e-06, + "loss": 1.1629, + "step": 917 + }, + { + "epoch": 0.4731145851228311, + "grad_norm": 1.4347057342529297, + "learning_rate": 4.938268151553879e-06, + "loss": 1.1394, + "step": 918 + }, + { + "epoch": 0.4736299604878887, + "grad_norm": 1.3268437385559082, + "learning_rate": 4.938117751838762e-06, + "loss": 1.1288, + "step": 919 + }, + { + "epoch": 0.4741453358529462, + "grad_norm": 1.3698822259902954, + "learning_rate": 4.937967171430034e-06, + "loss": 1.1079, + "step": 920 + }, + { + "epoch": 0.4746607112180038, + "grad_norm": 1.2759448289871216, + "learning_rate": 4.937816410338852e-06, + "loss": 1.0551, + "step": 921 + }, + { + "epoch": 0.4751760865830613, + "grad_norm": 1.2582950592041016, + "learning_rate": 4.937665468576389e-06, + "loss": 1.1115, + "step": 922 + }, + { + "epoch": 0.4756914619481189, + "grad_norm": 1.3074204921722412, + "learning_rate": 4.937514346153834e-06, + "loss": 1.1403, + "step": 923 + }, + { + "epoch": 0.4762068373131764, + "grad_norm": 1.356652021408081, + "learning_rate": 4.937363043082385e-06, + "loss": 1.1309, + "step": 924 + }, + { + "epoch": 0.476722212678234, + "grad_norm": 1.4325493574142456, + "learning_rate": 4.937211559373256e-06, + "loss": 1.1881, + "step": 925 + }, + { + "epoch": 0.4772375880432915, + "grad_norm": 1.3235927820205688, + "learning_rate": 4.937059895037674e-06, + "loss": 1.0873, + "step": 926 + }, + { + "epoch": 0.4777529634083491, + "grad_norm": 1.3582242727279663, + "learning_rate": 4.936908050086879e-06, + "loss": 1.0874, + "step": 927 + }, + { + "epoch": 0.47826833877340663, + "grad_norm": 1.3077458143234253, + "learning_rate": 4.9367560245321235e-06, + "loss": 1.1308, + "step": 928 + }, + { + "epoch": 0.47878371413846416, + "grad_norm": 1.432529091835022, + "learning_rate": 4.936603818384676e-06, + "loss": 1.0825, + "step": 929 + }, + { + "epoch": 0.47929908950352174, + "grad_norm": 1.4369008541107178, + "learning_rate": 4.936451431655817e-06, + "loss": 1.1137, + "step": 930 + }, + { + "epoch": 0.47981446486857926, + "grad_norm": 1.3341528177261353, + "learning_rate": 4.936298864356838e-06, + "loss": 1.1584, + "step": 931 + }, + { + "epoch": 0.48032984023363684, + "grad_norm": 1.3432354927062988, + "learning_rate": 4.936146116499047e-06, + "loss": 1.083, + "step": 932 + }, + { + "epoch": 0.48084521559869436, + "grad_norm": 1.2591673135757446, + "learning_rate": 4.935993188093766e-06, + "loss": 1.107, + "step": 933 + }, + { + "epoch": 0.48136059096375194, + "grad_norm": 1.3461925983428955, + "learning_rate": 4.935840079152327e-06, + "loss": 1.1574, + "step": 934 + }, + { + "epoch": 0.48187596632880947, + "grad_norm": 1.3511323928833008, + "learning_rate": 4.9356867896860775e-06, + "loss": 1.1066, + "step": 935 + }, + { + "epoch": 0.48239134169386705, + "grad_norm": 1.3107941150665283, + "learning_rate": 4.935533319706378e-06, + "loss": 1.1095, + "step": 936 + }, + { + "epoch": 0.4829067170589246, + "grad_norm": 1.4062912464141846, + "learning_rate": 4.9353796692246035e-06, + "loss": 1.128, + "step": 937 + }, + { + "epoch": 0.48342209242398215, + "grad_norm": 1.278143048286438, + "learning_rate": 4.93522583825214e-06, + "loss": 1.1347, + "step": 938 + }, + { + "epoch": 0.4839374677890397, + "grad_norm": 1.3441195487976074, + "learning_rate": 4.93507182680039e-06, + "loss": 1.0718, + "step": 939 + }, + { + "epoch": 0.48445284315409726, + "grad_norm": 1.2608786821365356, + "learning_rate": 4.934917634880766e-06, + "loss": 1.0523, + "step": 940 + }, + { + "epoch": 0.4849682185191548, + "grad_norm": 1.3602216243743896, + "learning_rate": 4.934763262504696e-06, + "loss": 1.0999, + "step": 941 + }, + { + "epoch": 0.48548359388421236, + "grad_norm": 1.2475003004074097, + "learning_rate": 4.93460870968362e-06, + "loss": 1.1234, + "step": 942 + }, + { + "epoch": 0.4859989692492699, + "grad_norm": 1.3348537683486938, + "learning_rate": 4.934453976428993e-06, + "loss": 1.1116, + "step": 943 + }, + { + "epoch": 0.4865143446143274, + "grad_norm": 1.2883623838424683, + "learning_rate": 4.9342990627522835e-06, + "loss": 1.1644, + "step": 944 + }, + { + "epoch": 0.487029719979385, + "grad_norm": 1.4656498432159424, + "learning_rate": 4.934143968664971e-06, + "loss": 1.1505, + "step": 945 + }, + { + "epoch": 0.4875450953444425, + "grad_norm": 1.370727300643921, + "learning_rate": 4.93398869417855e-06, + "loss": 1.1315, + "step": 946 + }, + { + "epoch": 0.4880604707095001, + "grad_norm": 1.3100018501281738, + "learning_rate": 4.933833239304529e-06, + "loss": 1.13, + "step": 947 + }, + { + "epoch": 0.4885758460745576, + "grad_norm": 1.259399175643921, + "learning_rate": 4.933677604054428e-06, + "loss": 1.0983, + "step": 948 + }, + { + "epoch": 0.4890912214396152, + "grad_norm": 1.4250516891479492, + "learning_rate": 4.933521788439782e-06, + "loss": 1.0165, + "step": 949 + }, + { + "epoch": 0.4896065968046727, + "grad_norm": 1.3938947916030884, + "learning_rate": 4.933365792472139e-06, + "loss": 1.0528, + "step": 950 + }, + { + "epoch": 0.4901219721697303, + "grad_norm": 1.431322455406189, + "learning_rate": 4.93320961616306e-06, + "loss": 1.1415, + "step": 951 + }, + { + "epoch": 0.4906373475347878, + "grad_norm": 1.3879321813583374, + "learning_rate": 4.93305325952412e-06, + "loss": 1.1523, + "step": 952 + }, + { + "epoch": 0.4911527228998454, + "grad_norm": 1.3865762948989868, + "learning_rate": 4.932896722566905e-06, + "loss": 1.1528, + "step": 953 + }, + { + "epoch": 0.49166809826490293, + "grad_norm": 1.3409475088119507, + "learning_rate": 4.932740005303019e-06, + "loss": 1.0749, + "step": 954 + }, + { + "epoch": 0.4921834736299605, + "grad_norm": 1.3085192441940308, + "learning_rate": 4.932583107744074e-06, + "loss": 1.0723, + "step": 955 + }, + { + "epoch": 0.49269884899501803, + "grad_norm": 1.5081206560134888, + "learning_rate": 4.9324260299017e-06, + "loss": 1.124, + "step": 956 + }, + { + "epoch": 0.4932142243600756, + "grad_norm": 1.3491485118865967, + "learning_rate": 4.932268771787538e-06, + "loss": 1.087, + "step": 957 + }, + { + "epoch": 0.49372959972513314, + "grad_norm": 1.350962519645691, + "learning_rate": 4.932111333413241e-06, + "loss": 1.0825, + "step": 958 + }, + { + "epoch": 0.49424497509019066, + "grad_norm": 1.2890613079071045, + "learning_rate": 4.931953714790479e-06, + "loss": 1.1482, + "step": 959 + }, + { + "epoch": 0.49476035045524824, + "grad_norm": 1.3699564933776855, + "learning_rate": 4.931795915930933e-06, + "loss": 1.15, + "step": 960 + }, + { + "epoch": 0.49527572582030577, + "grad_norm": 1.3651552200317383, + "learning_rate": 4.931637936846298e-06, + "loss": 1.1472, + "step": 961 + }, + { + "epoch": 0.49579110118536335, + "grad_norm": 1.2937597036361694, + "learning_rate": 4.93147977754828e-06, + "loss": 1.1043, + "step": 962 + }, + { + "epoch": 0.49630647655042087, + "grad_norm": 1.3082481622695923, + "learning_rate": 4.931321438048603e-06, + "loss": 1.12, + "step": 963 + }, + { + "epoch": 0.49682185191547845, + "grad_norm": 1.3514046669006348, + "learning_rate": 4.931162918359001e-06, + "loss": 1.1209, + "step": 964 + }, + { + "epoch": 0.497337227280536, + "grad_norm": 1.3890619277954102, + "learning_rate": 4.931004218491223e-06, + "loss": 1.1624, + "step": 965 + }, + { + "epoch": 0.49785260264559356, + "grad_norm": 1.3216668367385864, + "learning_rate": 4.930845338457029e-06, + "loss": 1.052, + "step": 966 + }, + { + "epoch": 0.4983679780106511, + "grad_norm": 1.391402244567871, + "learning_rate": 4.930686278268194e-06, + "loss": 1.1068, + "step": 967 + }, + { + "epoch": 0.49888335337570866, + "grad_norm": 1.3651992082595825, + "learning_rate": 4.930527037936508e-06, + "loss": 1.1142, + "step": 968 + }, + { + "epoch": 0.4993987287407662, + "grad_norm": 1.415177822113037, + "learning_rate": 4.930367617473771e-06, + "loss": 1.0975, + "step": 969 + }, + { + "epoch": 0.49991410410582376, + "grad_norm": 1.352864146232605, + "learning_rate": 4.930208016891799e-06, + "loss": 1.1489, + "step": 970 + }, + { + "epoch": 0.5004294794708813, + "grad_norm": 1.3537369966506958, + "learning_rate": 4.930048236202419e-06, + "loss": 1.1184, + "step": 971 + }, + { + "epoch": 0.5009448548359389, + "grad_norm": 1.3870673179626465, + "learning_rate": 4.929888275417474e-06, + "loss": 1.1284, + "step": 972 + }, + { + "epoch": 0.5014602302009964, + "grad_norm": 1.3259119987487793, + "learning_rate": 4.929728134548818e-06, + "loss": 1.1019, + "step": 973 + }, + { + "epoch": 0.5019756055660539, + "grad_norm": 1.486378788948059, + "learning_rate": 4.92956781360832e-06, + "loss": 1.1106, + "step": 974 + }, + { + "epoch": 0.5024909809311114, + "grad_norm": 1.4134142398834229, + "learning_rate": 4.929407312607861e-06, + "loss": 1.0968, + "step": 975 + }, + { + "epoch": 0.5030063562961691, + "grad_norm": 1.3311936855316162, + "learning_rate": 4.929246631559337e-06, + "loss": 1.1293, + "step": 976 + }, + { + "epoch": 0.5035217316612266, + "grad_norm": 1.3417316675186157, + "learning_rate": 4.929085770474656e-06, + "loss": 1.1283, + "step": 977 + }, + { + "epoch": 0.5040371070262841, + "grad_norm": 1.2947065830230713, + "learning_rate": 4.928924729365739e-06, + "loss": 1.1121, + "step": 978 + }, + { + "epoch": 0.5045524823913417, + "grad_norm": 1.2538444995880127, + "learning_rate": 4.928763508244523e-06, + "loss": 1.0937, + "step": 979 + }, + { + "epoch": 0.5050678577563993, + "grad_norm": 1.2491954565048218, + "learning_rate": 4.928602107122954e-06, + "loss": 1.111, + "step": 980 + }, + { + "epoch": 0.5055832331214568, + "grad_norm": 1.2861011028289795, + "learning_rate": 4.928440526012995e-06, + "loss": 1.0607, + "step": 981 + }, + { + "epoch": 0.5060986084865143, + "grad_norm": 1.4138412475585938, + "learning_rate": 4.928278764926621e-06, + "loss": 1.1033, + "step": 982 + }, + { + "epoch": 0.5066139838515719, + "grad_norm": 1.4112071990966797, + "learning_rate": 4.928116823875821e-06, + "loss": 1.1158, + "step": 983 + }, + { + "epoch": 0.5071293592166295, + "grad_norm": 1.3451502323150635, + "learning_rate": 4.927954702872596e-06, + "loss": 1.1566, + "step": 984 + }, + { + "epoch": 0.507644734581687, + "grad_norm": 1.423324465751648, + "learning_rate": 4.92779240192896e-06, + "loss": 1.0742, + "step": 985 + }, + { + "epoch": 0.5081601099467445, + "grad_norm": 1.401418685913086, + "learning_rate": 4.927629921056944e-06, + "loss": 1.0844, + "step": 986 + }, + { + "epoch": 0.5086754853118021, + "grad_norm": 1.2989522218704224, + "learning_rate": 4.927467260268588e-06, + "loss": 1.0857, + "step": 987 + }, + { + "epoch": 0.5091908606768596, + "grad_norm": 1.2923847436904907, + "learning_rate": 4.927304419575948e-06, + "loss": 1.051, + "step": 988 + }, + { + "epoch": 0.5097062360419172, + "grad_norm": 1.3584847450256348, + "learning_rate": 4.927141398991091e-06, + "loss": 1.08, + "step": 989 + }, + { + "epoch": 0.5102216114069748, + "grad_norm": 1.3007075786590576, + "learning_rate": 4.9269781985261e-06, + "loss": 1.0283, + "step": 990 + }, + { + "epoch": 0.5107369867720323, + "grad_norm": 1.2998332977294922, + "learning_rate": 4.92681481819307e-06, + "loss": 1.1638, + "step": 991 + }, + { + "epoch": 0.5112523621370898, + "grad_norm": 1.3442524671554565, + "learning_rate": 4.92665125800411e-06, + "loss": 1.1349, + "step": 992 + }, + { + "epoch": 0.5117677375021474, + "grad_norm": 1.3257924318313599, + "learning_rate": 4.92648751797134e-06, + "loss": 1.1078, + "step": 993 + }, + { + "epoch": 0.512283112867205, + "grad_norm": 1.4120101928710938, + "learning_rate": 4.926323598106897e-06, + "loss": 1.1897, + "step": 994 + }, + { + "epoch": 0.5127984882322625, + "grad_norm": 1.3012393712997437, + "learning_rate": 4.9261594984229276e-06, + "loss": 1.1238, + "step": 995 + }, + { + "epoch": 0.51331386359732, + "grad_norm": 1.4377069473266602, + "learning_rate": 4.925995218931595e-06, + "loss": 1.078, + "step": 996 + }, + { + "epoch": 0.5138292389623776, + "grad_norm": 1.389089584350586, + "learning_rate": 4.925830759645075e-06, + "loss": 1.0876, + "step": 997 + }, + { + "epoch": 0.5143446143274352, + "grad_norm": 1.3466755151748657, + "learning_rate": 4.925666120575553e-06, + "loss": 1.0879, + "step": 998 + }, + { + "epoch": 0.5148599896924927, + "grad_norm": 1.488387942314148, + "learning_rate": 4.925501301735232e-06, + "loss": 1.1214, + "step": 999 + }, + { + "epoch": 0.5153753650575502, + "grad_norm": 1.3349742889404297, + "learning_rate": 4.92533630313633e-06, + "loss": 1.085, + "step": 1000 + }, + { + "epoch": 0.5158907404226079, + "grad_norm": 1.4069832563400269, + "learning_rate": 4.925171124791071e-06, + "loss": 1.0276, + "step": 1001 + }, + { + "epoch": 0.5164061157876654, + "grad_norm": 1.3818632364273071, + "learning_rate": 4.925005766711699e-06, + "loss": 1.112, + "step": 1002 + }, + { + "epoch": 0.5169214911527229, + "grad_norm": 1.434308409690857, + "learning_rate": 4.9248402289104694e-06, + "loss": 1.1009, + "step": 1003 + }, + { + "epoch": 0.5174368665177804, + "grad_norm": 1.3247915506362915, + "learning_rate": 4.924674511399649e-06, + "loss": 1.0939, + "step": 1004 + }, + { + "epoch": 0.517952241882838, + "grad_norm": 1.3572499752044678, + "learning_rate": 4.92450861419152e-06, + "loss": 1.0953, + "step": 1005 + }, + { + "epoch": 0.5184676172478956, + "grad_norm": 1.2700880765914917, + "learning_rate": 4.9243425372983776e-06, + "loss": 1.0553, + "step": 1006 + }, + { + "epoch": 0.5189829926129531, + "grad_norm": 1.3506757020950317, + "learning_rate": 4.92417628073253e-06, + "loss": 1.0627, + "step": 1007 + }, + { + "epoch": 0.5194983679780106, + "grad_norm": 1.3553335666656494, + "learning_rate": 4.924009844506299e-06, + "loss": 1.1347, + "step": 1008 + }, + { + "epoch": 0.5200137433430682, + "grad_norm": 1.3869633674621582, + "learning_rate": 4.9238432286320194e-06, + "loss": 1.1304, + "step": 1009 + }, + { + "epoch": 0.5205291187081258, + "grad_norm": 1.4746654033660889, + "learning_rate": 4.923676433122039e-06, + "loss": 1.1368, + "step": 1010 + }, + { + "epoch": 0.5210444940731833, + "grad_norm": 1.3538326025009155, + "learning_rate": 4.923509457988719e-06, + "loss": 1.1464, + "step": 1011 + }, + { + "epoch": 0.5215598694382408, + "grad_norm": 1.3130608797073364, + "learning_rate": 4.923342303244436e-06, + "loss": 1.0875, + "step": 1012 + }, + { + "epoch": 0.5220752448032984, + "grad_norm": 1.4428962469100952, + "learning_rate": 4.923174968901576e-06, + "loss": 1.1293, + "step": 1013 + }, + { + "epoch": 0.522590620168356, + "grad_norm": 1.417741298675537, + "learning_rate": 4.923007454972542e-06, + "loss": 1.0696, + "step": 1014 + }, + { + "epoch": 0.5231059955334135, + "grad_norm": 1.3560295104980469, + "learning_rate": 4.9228397614697495e-06, + "loss": 1.076, + "step": 1015 + }, + { + "epoch": 0.523621370898471, + "grad_norm": 1.4185484647750854, + "learning_rate": 4.922671888405625e-06, + "loss": 1.1841, + "step": 1016 + }, + { + "epoch": 0.5241367462635286, + "grad_norm": 1.3187556266784668, + "learning_rate": 4.922503835792609e-06, + "loss": 1.0751, + "step": 1017 + }, + { + "epoch": 0.5246521216285861, + "grad_norm": 1.3971552848815918, + "learning_rate": 4.922335603643159e-06, + "loss": 1.051, + "step": 1018 + }, + { + "epoch": 0.5251674969936437, + "grad_norm": 1.3550360202789307, + "learning_rate": 4.92216719196974e-06, + "loss": 1.102, + "step": 1019 + }, + { + "epoch": 0.5256828723587013, + "grad_norm": 1.286242961883545, + "learning_rate": 4.921998600784837e-06, + "loss": 1.0976, + "step": 1020 + }, + { + "epoch": 0.5261982477237588, + "grad_norm": 1.5528258085250854, + "learning_rate": 4.92182983010094e-06, + "loss": 1.1342, + "step": 1021 + }, + { + "epoch": 0.5267136230888163, + "grad_norm": 1.4291971921920776, + "learning_rate": 4.921660879930561e-06, + "loss": 1.1202, + "step": 1022 + }, + { + "epoch": 0.5272289984538739, + "grad_norm": 1.4119702577590942, + "learning_rate": 4.921491750286219e-06, + "loss": 1.128, + "step": 1023 + }, + { + "epoch": 0.5277443738189315, + "grad_norm": 1.338379979133606, + "learning_rate": 4.9213224411804496e-06, + "loss": 1.0659, + "step": 1024 + }, + { + "epoch": 0.528259749183989, + "grad_norm": 1.3730744123458862, + "learning_rate": 4.9211529526258e-06, + "loss": 1.1565, + "step": 1025 + }, + { + "epoch": 0.5287751245490465, + "grad_norm": 1.3540492057800293, + "learning_rate": 4.920983284634831e-06, + "loss": 1.1323, + "step": 1026 + }, + { + "epoch": 0.5292904999141042, + "grad_norm": 1.4140055179595947, + "learning_rate": 4.9208134372201175e-06, + "loss": 1.1136, + "step": 1027 + }, + { + "epoch": 0.5298058752791617, + "grad_norm": 1.3417617082595825, + "learning_rate": 4.920643410394248e-06, + "loss": 1.0707, + "step": 1028 + }, + { + "epoch": 0.5303212506442192, + "grad_norm": 1.3641825914382935, + "learning_rate": 4.920473204169822e-06, + "loss": 1.0844, + "step": 1029 + }, + { + "epoch": 0.5308366260092767, + "grad_norm": 1.32991361618042, + "learning_rate": 4.920302818559455e-06, + "loss": 1.1099, + "step": 1030 + }, + { + "epoch": 0.5313520013743344, + "grad_norm": 1.4204286336898804, + "learning_rate": 4.920132253575774e-06, + "loss": 1.0921, + "step": 1031 + }, + { + "epoch": 0.5318673767393919, + "grad_norm": 1.4128758907318115, + "learning_rate": 4.91996150923142e-06, + "loss": 1.1581, + "step": 1032 + }, + { + "epoch": 0.5323827521044494, + "grad_norm": 1.3604763746261597, + "learning_rate": 4.9197905855390475e-06, + "loss": 1.0801, + "step": 1033 + }, + { + "epoch": 0.5328981274695069, + "grad_norm": 1.4139909744262695, + "learning_rate": 4.919619482511323e-06, + "loss": 1.111, + "step": 1034 + }, + { + "epoch": 0.5334135028345645, + "grad_norm": 1.2476108074188232, + "learning_rate": 4.919448200160929e-06, + "loss": 1.0655, + "step": 1035 + }, + { + "epoch": 0.5339288781996221, + "grad_norm": 1.53774893283844, + "learning_rate": 4.919276738500558e-06, + "loss": 1.1361, + "step": 1036 + }, + { + "epoch": 0.5344442535646796, + "grad_norm": 1.355755090713501, + "learning_rate": 4.919105097542918e-06, + "loss": 1.0765, + "step": 1037 + }, + { + "epoch": 0.5349596289297371, + "grad_norm": 1.364965796470642, + "learning_rate": 4.918933277300729e-06, + "loss": 1.1845, + "step": 1038 + }, + { + "epoch": 0.5354750042947947, + "grad_norm": 1.4210840463638306, + "learning_rate": 4.918761277786727e-06, + "loss": 1.1407, + "step": 1039 + }, + { + "epoch": 0.5359903796598523, + "grad_norm": 1.3257887363433838, + "learning_rate": 4.918589099013656e-06, + "loss": 1.1206, + "step": 1040 + }, + { + "epoch": 0.5365057550249098, + "grad_norm": 1.3990665674209595, + "learning_rate": 4.918416740994279e-06, + "loss": 1.0949, + "step": 1041 + }, + { + "epoch": 0.5370211303899673, + "grad_norm": 1.3532612323760986, + "learning_rate": 4.918244203741369e-06, + "loss": 1.0698, + "step": 1042 + }, + { + "epoch": 0.5375365057550249, + "grad_norm": 1.2847880125045776, + "learning_rate": 4.918071487267712e-06, + "loss": 1.0495, + "step": 1043 + }, + { + "epoch": 0.5380518811200825, + "grad_norm": 1.421730399131775, + "learning_rate": 4.917898591586111e-06, + "loss": 1.1283, + "step": 1044 + }, + { + "epoch": 0.53856725648514, + "grad_norm": 1.3442106246948242, + "learning_rate": 4.9177255167093776e-06, + "loss": 1.1023, + "step": 1045 + }, + { + "epoch": 0.5390826318501976, + "grad_norm": 1.4238824844360352, + "learning_rate": 4.917552262650339e-06, + "loss": 1.0958, + "step": 1046 + }, + { + "epoch": 0.5395980072152551, + "grad_norm": 1.3223488330841064, + "learning_rate": 4.917378829421836e-06, + "loss": 1.1002, + "step": 1047 + }, + { + "epoch": 0.5401133825803126, + "grad_norm": 1.3856841325759888, + "learning_rate": 4.917205217036721e-06, + "loss": 1.0923, + "step": 1048 + }, + { + "epoch": 0.5406287579453702, + "grad_norm": 1.363553524017334, + "learning_rate": 4.917031425507862e-06, + "loss": 1.0757, + "step": 1049 + }, + { + "epoch": 0.5411441333104278, + "grad_norm": 1.3707692623138428, + "learning_rate": 4.916857454848139e-06, + "loss": 1.0935, + "step": 1050 + }, + { + "epoch": 0.5416595086754853, + "grad_norm": 1.2749682664871216, + "learning_rate": 4.916683305070443e-06, + "loss": 1.0831, + "step": 1051 + }, + { + "epoch": 0.5421748840405428, + "grad_norm": 1.413495421409607, + "learning_rate": 4.916508976187683e-06, + "loss": 1.0782, + "step": 1052 + }, + { + "epoch": 0.5426902594056005, + "grad_norm": 1.3967243432998657, + "learning_rate": 4.916334468212778e-06, + "loss": 1.0928, + "step": 1053 + }, + { + "epoch": 0.543205634770658, + "grad_norm": 1.3892028331756592, + "learning_rate": 4.916159781158661e-06, + "loss": 1.0899, + "step": 1054 + }, + { + "epoch": 0.5437210101357155, + "grad_norm": 1.2645163536071777, + "learning_rate": 4.91598491503828e-06, + "loss": 1.0128, + "step": 1055 + }, + { + "epoch": 0.544236385500773, + "grad_norm": 1.3261868953704834, + "learning_rate": 4.915809869864592e-06, + "loss": 1.0605, + "step": 1056 + }, + { + "epoch": 0.5447517608658307, + "grad_norm": 1.4390619993209839, + "learning_rate": 4.915634645650572e-06, + "loss": 1.1214, + "step": 1057 + }, + { + "epoch": 0.5452671362308882, + "grad_norm": 1.374504804611206, + "learning_rate": 4.915459242409205e-06, + "loss": 1.0504, + "step": 1058 + }, + { + "epoch": 0.5457825115959457, + "grad_norm": 1.4694688320159912, + "learning_rate": 4.915283660153491e-06, + "loss": 1.1, + "step": 1059 + }, + { + "epoch": 0.5462978869610032, + "grad_norm": 1.3922492265701294, + "learning_rate": 4.915107898896443e-06, + "loss": 1.0793, + "step": 1060 + }, + { + "epoch": 0.5468132623260609, + "grad_norm": 1.3784502744674683, + "learning_rate": 4.914931958651086e-06, + "loss": 1.141, + "step": 1061 + }, + { + "epoch": 0.5473286376911184, + "grad_norm": 1.3735002279281616, + "learning_rate": 4.914755839430461e-06, + "loss": 1.0931, + "step": 1062 + }, + { + "epoch": 0.5478440130561759, + "grad_norm": 1.4746862649917603, + "learning_rate": 4.914579541247618e-06, + "loss": 1.1115, + "step": 1063 + }, + { + "epoch": 0.5483593884212334, + "grad_norm": 1.3234630823135376, + "learning_rate": 4.914403064115625e-06, + "loss": 1.13, + "step": 1064 + }, + { + "epoch": 0.548874763786291, + "grad_norm": 1.326145052909851, + "learning_rate": 4.914226408047561e-06, + "loss": 1.0677, + "step": 1065 + }, + { + "epoch": 0.5493901391513486, + "grad_norm": 1.3029916286468506, + "learning_rate": 4.914049573056517e-06, + "loss": 1.1147, + "step": 1066 + }, + { + "epoch": 0.5499055145164061, + "grad_norm": 1.3349426984786987, + "learning_rate": 4.913872559155599e-06, + "loss": 1.1048, + "step": 1067 + }, + { + "epoch": 0.5504208898814636, + "grad_norm": 1.4011579751968384, + "learning_rate": 4.913695366357927e-06, + "loss": 1.1096, + "step": 1068 + }, + { + "epoch": 0.5509362652465212, + "grad_norm": 1.4701299667358398, + "learning_rate": 4.9135179946766306e-06, + "loss": 1.0878, + "step": 1069 + }, + { + "epoch": 0.5514516406115788, + "grad_norm": 1.2991549968719482, + "learning_rate": 4.913340444124858e-06, + "loss": 1.1315, + "step": 1070 + }, + { + "epoch": 0.5519670159766363, + "grad_norm": 1.4357717037200928, + "learning_rate": 4.9131627147157664e-06, + "loss": 1.1378, + "step": 1071 + }, + { + "epoch": 0.5524823913416939, + "grad_norm": 1.3685011863708496, + "learning_rate": 4.912984806462529e-06, + "loss": 1.0955, + "step": 1072 + }, + { + "epoch": 0.5529977667067514, + "grad_norm": 1.36729097366333, + "learning_rate": 4.912806719378328e-06, + "loss": 1.1441, + "step": 1073 + }, + { + "epoch": 0.553513142071809, + "grad_norm": 1.376643419265747, + "learning_rate": 4.912628453476364e-06, + "loss": 1.0875, + "step": 1074 + }, + { + "epoch": 0.5540285174368665, + "grad_norm": 1.2910010814666748, + "learning_rate": 4.912450008769848e-06, + "loss": 1.1218, + "step": 1075 + }, + { + "epoch": 0.5545438928019241, + "grad_norm": 1.327258586883545, + "learning_rate": 4.912271385272005e-06, + "loss": 1.0689, + "step": 1076 + }, + { + "epoch": 0.5550592681669816, + "grad_norm": 1.2881981134414673, + "learning_rate": 4.912092582996074e-06, + "loss": 1.0891, + "step": 1077 + }, + { + "epoch": 0.5555746435320391, + "grad_norm": 1.3386520147323608, + "learning_rate": 4.911913601955305e-06, + "loss": 1.1099, + "step": 1078 + }, + { + "epoch": 0.5560900188970967, + "grad_norm": 1.4942947626113892, + "learning_rate": 4.911734442162963e-06, + "loss": 1.1043, + "step": 1079 + }, + { + "epoch": 0.5566053942621543, + "grad_norm": 1.4550931453704834, + "learning_rate": 4.911555103632326e-06, + "loss": 1.1317, + "step": 1080 + }, + { + "epoch": 0.5571207696272118, + "grad_norm": 1.3831936120986938, + "learning_rate": 4.911375586376685e-06, + "loss": 1.0696, + "step": 1081 + }, + { + "epoch": 0.5576361449922693, + "grad_norm": 1.3928511142730713, + "learning_rate": 4.911195890409345e-06, + "loss": 1.0852, + "step": 1082 + }, + { + "epoch": 0.558151520357327, + "grad_norm": 1.3147382736206055, + "learning_rate": 4.911016015743624e-06, + "loss": 1.1356, + "step": 1083 + }, + { + "epoch": 0.5586668957223845, + "grad_norm": 1.2969478368759155, + "learning_rate": 4.910835962392851e-06, + "loss": 1.055, + "step": 1084 + }, + { + "epoch": 0.559182271087442, + "grad_norm": 1.2879401445388794, + "learning_rate": 4.9106557303703715e-06, + "loss": 1.1252, + "step": 1085 + }, + { + "epoch": 0.5596976464524995, + "grad_norm": 1.4337031841278076, + "learning_rate": 4.910475319689542e-06, + "loss": 1.166, + "step": 1086 + }, + { + "epoch": 0.5602130218175572, + "grad_norm": 1.4629108905792236, + "learning_rate": 4.910294730363734e-06, + "loss": 1.0777, + "step": 1087 + }, + { + "epoch": 0.5607283971826147, + "grad_norm": 1.3451007604599, + "learning_rate": 4.910113962406331e-06, + "loss": 1.1065, + "step": 1088 + }, + { + "epoch": 0.5612437725476722, + "grad_norm": 1.3503655195236206, + "learning_rate": 4.9099330158307295e-06, + "loss": 1.0694, + "step": 1089 + }, + { + "epoch": 0.5617591479127297, + "grad_norm": 1.2731573581695557, + "learning_rate": 4.909751890650341e-06, + "loss": 1.0915, + "step": 1090 + }, + { + "epoch": 0.5622745232777874, + "grad_norm": 1.3867830038070679, + "learning_rate": 4.909570586878587e-06, + "loss": 1.133, + "step": 1091 + }, + { + "epoch": 0.5627898986428449, + "grad_norm": 1.3512017726898193, + "learning_rate": 4.909389104528906e-06, + "loss": 1.1475, + "step": 1092 + }, + { + "epoch": 0.5633052740079024, + "grad_norm": 1.5640790462493896, + "learning_rate": 4.909207443614749e-06, + "loss": 1.1382, + "step": 1093 + }, + { + "epoch": 0.56382064937296, + "grad_norm": 1.3644344806671143, + "learning_rate": 4.909025604149576e-06, + "loss": 1.0482, + "step": 1094 + }, + { + "epoch": 0.5643360247380175, + "grad_norm": 1.4538713693618774, + "learning_rate": 4.908843586146866e-06, + "loss": 1.1135, + "step": 1095 + }, + { + "epoch": 0.5648514001030751, + "grad_norm": 1.4145160913467407, + "learning_rate": 4.908661389620108e-06, + "loss": 1.1097, + "step": 1096 + }, + { + "epoch": 0.5653667754681326, + "grad_norm": 1.3469972610473633, + "learning_rate": 4.908479014582805e-06, + "loss": 1.1048, + "step": 1097 + }, + { + "epoch": 0.5658821508331902, + "grad_norm": 1.4490288496017456, + "learning_rate": 4.908296461048473e-06, + "loss": 1.0747, + "step": 1098 + }, + { + "epoch": 0.5663975261982477, + "grad_norm": 1.4236392974853516, + "learning_rate": 4.908113729030641e-06, + "loss": 1.0718, + "step": 1099 + }, + { + "epoch": 0.5669129015633053, + "grad_norm": 1.399814248085022, + "learning_rate": 4.907930818542853e-06, + "loss": 1.1199, + "step": 1100 + }, + { + "epoch": 0.5674282769283628, + "grad_norm": 1.3749892711639404, + "learning_rate": 4.907747729598663e-06, + "loss": 1.15, + "step": 1101 + }, + { + "epoch": 0.5679436522934204, + "grad_norm": 1.3988521099090576, + "learning_rate": 4.907564462211642e-06, + "loss": 1.1193, + "step": 1102 + }, + { + "epoch": 0.5684590276584779, + "grad_norm": 1.3416619300842285, + "learning_rate": 4.907381016395371e-06, + "loss": 1.0519, + "step": 1103 + }, + { + "epoch": 0.5689744030235355, + "grad_norm": 1.3166208267211914, + "learning_rate": 4.907197392163445e-06, + "loss": 1.1237, + "step": 1104 + }, + { + "epoch": 0.569489778388593, + "grad_norm": 1.3533228635787964, + "learning_rate": 4.907013589529474e-06, + "loss": 1.0962, + "step": 1105 + }, + { + "epoch": 0.5700051537536506, + "grad_norm": 1.477770447731018, + "learning_rate": 4.906829608507081e-06, + "loss": 1.1339, + "step": 1106 + }, + { + "epoch": 0.5705205291187081, + "grad_norm": 1.417343020439148, + "learning_rate": 4.906645449109898e-06, + "loss": 1.1963, + "step": 1107 + }, + { + "epoch": 0.5710359044837656, + "grad_norm": 1.2893075942993164, + "learning_rate": 4.906461111351576e-06, + "loss": 1.1646, + "step": 1108 + }, + { + "epoch": 0.5715512798488233, + "grad_norm": 1.4533003568649292, + "learning_rate": 4.906276595245776e-06, + "loss": 1.1308, + "step": 1109 + }, + { + "epoch": 0.5720666552138808, + "grad_norm": 1.3315889835357666, + "learning_rate": 4.906091900806172e-06, + "loss": 1.0804, + "step": 1110 + }, + { + "epoch": 0.5725820305789383, + "grad_norm": 1.4494061470031738, + "learning_rate": 4.9059070280464525e-06, + "loss": 1.145, + "step": 1111 + }, + { + "epoch": 0.5730974059439958, + "grad_norm": 1.3500940799713135, + "learning_rate": 4.90572197698032e-06, + "loss": 1.0763, + "step": 1112 + }, + { + "epoch": 0.5736127813090535, + "grad_norm": 1.392949104309082, + "learning_rate": 4.905536747621487e-06, + "loss": 1.0601, + "step": 1113 + }, + { + "epoch": 0.574128156674111, + "grad_norm": 1.416709065437317, + "learning_rate": 4.9053513399836825e-06, + "loss": 1.1257, + "step": 1114 + }, + { + "epoch": 0.5746435320391685, + "grad_norm": 1.3534293174743652, + "learning_rate": 4.905165754080648e-06, + "loss": 1.0476, + "step": 1115 + }, + { + "epoch": 0.575158907404226, + "grad_norm": 1.3441128730773926, + "learning_rate": 4.9049799899261354e-06, + "loss": 1.1083, + "step": 1116 + }, + { + "epoch": 0.5756742827692837, + "grad_norm": 1.3544447422027588, + "learning_rate": 4.904794047533914e-06, + "loss": 1.117, + "step": 1117 + }, + { + "epoch": 0.5761896581343412, + "grad_norm": 1.420417070388794, + "learning_rate": 4.904607926917763e-06, + "loss": 1.1451, + "step": 1118 + }, + { + "epoch": 0.5767050334993987, + "grad_norm": 1.4358339309692383, + "learning_rate": 4.904421628091477e-06, + "loss": 1.1677, + "step": 1119 + }, + { + "epoch": 0.5772204088644562, + "grad_norm": 1.32780122756958, + "learning_rate": 4.9042351510688635e-06, + "loss": 1.1154, + "step": 1120 + }, + { + "epoch": 0.5777357842295139, + "grad_norm": 1.3847447633743286, + "learning_rate": 4.904048495863743e-06, + "loss": 1.0686, + "step": 1121 + }, + { + "epoch": 0.5782511595945714, + "grad_norm": 1.330560326576233, + "learning_rate": 4.903861662489946e-06, + "loss": 1.1178, + "step": 1122 + }, + { + "epoch": 0.5787665349596289, + "grad_norm": 1.2878084182739258, + "learning_rate": 4.903674650961321e-06, + "loss": 1.0595, + "step": 1123 + }, + { + "epoch": 0.5792819103246865, + "grad_norm": 1.4180364608764648, + "learning_rate": 4.903487461291728e-06, + "loss": 1.1535, + "step": 1124 + }, + { + "epoch": 0.579797285689744, + "grad_norm": 1.302341103553772, + "learning_rate": 4.90330009349504e-06, + "loss": 1.0497, + "step": 1125 + }, + { + "epoch": 0.5803126610548016, + "grad_norm": 1.3654823303222656, + "learning_rate": 4.9031125475851435e-06, + "loss": 1.1243, + "step": 1126 + }, + { + "epoch": 0.5808280364198591, + "grad_norm": 1.4400690793991089, + "learning_rate": 4.902924823575937e-06, + "loss": 1.104, + "step": 1127 + }, + { + "epoch": 0.5813434117849167, + "grad_norm": 1.3116410970687866, + "learning_rate": 4.902736921481333e-06, + "loss": 1.1156, + "step": 1128 + }, + { + "epoch": 0.5818587871499742, + "grad_norm": 1.3312976360321045, + "learning_rate": 4.902548841315258e-06, + "loss": 1.0912, + "step": 1129 + }, + { + "epoch": 0.5823741625150318, + "grad_norm": 1.3902885913848877, + "learning_rate": 4.902360583091651e-06, + "loss": 1.1086, + "step": 1130 + }, + { + "epoch": 0.5828895378800893, + "grad_norm": 1.3142887353897095, + "learning_rate": 4.9021721468244636e-06, + "loss": 1.0117, + "step": 1131 + }, + { + "epoch": 0.5834049132451469, + "grad_norm": 1.3276392221450806, + "learning_rate": 4.901983532527661e-06, + "loss": 1.1226, + "step": 1132 + }, + { + "epoch": 0.5839202886102044, + "grad_norm": 1.4024556875228882, + "learning_rate": 4.901794740215222e-06, + "loss": 1.1554, + "step": 1133 + }, + { + "epoch": 0.584435663975262, + "grad_norm": 1.5021957159042358, + "learning_rate": 4.90160576990114e-06, + "loss": 1.102, + "step": 1134 + }, + { + "epoch": 0.5849510393403196, + "grad_norm": 1.3124959468841553, + "learning_rate": 4.901416621599417e-06, + "loss": 1.0831, + "step": 1135 + }, + { + "epoch": 0.5854664147053771, + "grad_norm": 1.2595129013061523, + "learning_rate": 4.901227295324073e-06, + "loss": 1.0665, + "step": 1136 + }, + { + "epoch": 0.5859817900704346, + "grad_norm": 1.3545048236846924, + "learning_rate": 4.90103779108914e-06, + "loss": 1.0416, + "step": 1137 + }, + { + "epoch": 0.5864971654354921, + "grad_norm": 1.398465871810913, + "learning_rate": 4.90084810890866e-06, + "loss": 1.1606, + "step": 1138 + }, + { + "epoch": 0.5870125408005498, + "grad_norm": 1.3547437191009521, + "learning_rate": 4.9006582487966935e-06, + "loss": 1.0701, + "step": 1139 + }, + { + "epoch": 0.5875279161656073, + "grad_norm": 1.3768725395202637, + "learning_rate": 4.900468210767309e-06, + "loss": 1.095, + "step": 1140 + }, + { + "epoch": 0.5880432915306648, + "grad_norm": 1.3751181364059448, + "learning_rate": 4.9002779948345925e-06, + "loss": 1.1587, + "step": 1141 + }, + { + "epoch": 0.5885586668957223, + "grad_norm": 1.3258569240570068, + "learning_rate": 4.90008760101264e-06, + "loss": 1.0568, + "step": 1142 + }, + { + "epoch": 0.58907404226078, + "grad_norm": 1.3601237535476685, + "learning_rate": 4.899897029315563e-06, + "loss": 1.086, + "step": 1143 + }, + { + "epoch": 0.5895894176258375, + "grad_norm": 1.4093011617660522, + "learning_rate": 4.899706279757485e-06, + "loss": 1.0956, + "step": 1144 + }, + { + "epoch": 0.590104792990895, + "grad_norm": 1.3243459463119507, + "learning_rate": 4.899515352352541e-06, + "loss": 1.0627, + "step": 1145 + }, + { + "epoch": 0.5906201683559525, + "grad_norm": 1.3498246669769287, + "learning_rate": 4.899324247114883e-06, + "loss": 1.069, + "step": 1146 + }, + { + "epoch": 0.5911355437210102, + "grad_norm": 1.2826563119888306, + "learning_rate": 4.899132964058674e-06, + "loss": 1.1011, + "step": 1147 + }, + { + "epoch": 0.5916509190860677, + "grad_norm": 1.4750365018844604, + "learning_rate": 4.89894150319809e-06, + "loss": 1.0783, + "step": 1148 + }, + { + "epoch": 0.5921662944511252, + "grad_norm": 1.4334691762924194, + "learning_rate": 4.8987498645473195e-06, + "loss": 1.0941, + "step": 1149 + }, + { + "epoch": 0.5926816698161828, + "grad_norm": 1.42925226688385, + "learning_rate": 4.898558048120568e-06, + "loss": 1.0639, + "step": 1150 + }, + { + "epoch": 0.5931970451812404, + "grad_norm": 1.4867373704910278, + "learning_rate": 4.898366053932048e-06, + "loss": 1.1295, + "step": 1151 + }, + { + "epoch": 0.5937124205462979, + "grad_norm": 1.3330955505371094, + "learning_rate": 4.89817388199599e-06, + "loss": 1.0712, + "step": 1152 + }, + { + "epoch": 0.5942277959113554, + "grad_norm": 1.4610333442687988, + "learning_rate": 4.897981532326637e-06, + "loss": 1.07, + "step": 1153 + }, + { + "epoch": 0.594743171276413, + "grad_norm": 1.3709636926651, + "learning_rate": 4.8977890049382434e-06, + "loss": 1.1292, + "step": 1154 + }, + { + "epoch": 0.5952585466414705, + "grad_norm": 1.3390896320343018, + "learning_rate": 4.8975962998450786e-06, + "loss": 1.0636, + "step": 1155 + }, + { + "epoch": 0.5957739220065281, + "grad_norm": 1.4554226398468018, + "learning_rate": 4.897403417061423e-06, + "loss": 1.0734, + "step": 1156 + }, + { + "epoch": 0.5962892973715856, + "grad_norm": 1.5800076723098755, + "learning_rate": 4.897210356601574e-06, + "loss": 1.1332, + "step": 1157 + }, + { + "epoch": 0.5968046727366432, + "grad_norm": 1.3435883522033691, + "learning_rate": 4.897017118479837e-06, + "loss": 1.1483, + "step": 1158 + }, + { + "epoch": 0.5973200481017007, + "grad_norm": 1.3186991214752197, + "learning_rate": 4.8968237027105345e-06, + "loss": 1.1018, + "step": 1159 + }, + { + "epoch": 0.5978354234667583, + "grad_norm": 1.3472167253494263, + "learning_rate": 4.896630109308e-06, + "loss": 1.0525, + "step": 1160 + }, + { + "epoch": 0.5983507988318159, + "grad_norm": 1.330782413482666, + "learning_rate": 4.896436338286583e-06, + "loss": 1.0714, + "step": 1161 + }, + { + "epoch": 0.5988661741968734, + "grad_norm": 1.3701021671295166, + "learning_rate": 4.896242389660643e-06, + "loss": 1.1301, + "step": 1162 + }, + { + "epoch": 0.5993815495619309, + "grad_norm": 1.3650195598602295, + "learning_rate": 4.896048263444553e-06, + "loss": 1.1395, + "step": 1163 + }, + { + "epoch": 0.5998969249269885, + "grad_norm": 1.413641333580017, + "learning_rate": 4.895853959652702e-06, + "loss": 1.0819, + "step": 1164 + }, + { + "epoch": 0.6004123002920461, + "grad_norm": 1.3155937194824219, + "learning_rate": 4.895659478299489e-06, + "loss": 1.0741, + "step": 1165 + }, + { + "epoch": 0.6009276756571036, + "grad_norm": 1.336728811264038, + "learning_rate": 4.895464819399327e-06, + "loss": 1.0835, + "step": 1166 + }, + { + "epoch": 0.6014430510221611, + "grad_norm": 1.292924165725708, + "learning_rate": 4.895269982966644e-06, + "loss": 1.1051, + "step": 1167 + }, + { + "epoch": 0.6019584263872186, + "grad_norm": 1.3154529333114624, + "learning_rate": 4.8950749690158786e-06, + "loss": 1.145, + "step": 1168 + }, + { + "epoch": 0.6024738017522763, + "grad_norm": 1.3772039413452148, + "learning_rate": 4.894879777561484e-06, + "loss": 1.0542, + "step": 1169 + }, + { + "epoch": 0.6029891771173338, + "grad_norm": 1.3813426494598389, + "learning_rate": 4.894684408617927e-06, + "loss": 1.1071, + "step": 1170 + }, + { + "epoch": 0.6035045524823913, + "grad_norm": 1.3168193101882935, + "learning_rate": 4.894488862199685e-06, + "loss": 1.0478, + "step": 1171 + }, + { + "epoch": 0.6040199278474488, + "grad_norm": 1.3762054443359375, + "learning_rate": 4.894293138321252e-06, + "loss": 1.1, + "step": 1172 + }, + { + "epoch": 0.6045353032125065, + "grad_norm": 1.3713799715042114, + "learning_rate": 4.894097236997131e-06, + "loss": 1.1282, + "step": 1173 + }, + { + "epoch": 0.605050678577564, + "grad_norm": 1.3565059900283813, + "learning_rate": 4.893901158241845e-06, + "loss": 1.1271, + "step": 1174 + }, + { + "epoch": 0.6055660539426215, + "grad_norm": 1.3203566074371338, + "learning_rate": 4.893704902069921e-06, + "loss": 1.1487, + "step": 1175 + }, + { + "epoch": 0.606081429307679, + "grad_norm": 1.4336824417114258, + "learning_rate": 4.893508468495908e-06, + "loss": 1.0786, + "step": 1176 + }, + { + "epoch": 0.6065968046727367, + "grad_norm": 1.4044010639190674, + "learning_rate": 4.89331185753436e-06, + "loss": 1.0893, + "step": 1177 + }, + { + "epoch": 0.6071121800377942, + "grad_norm": 1.4692096710205078, + "learning_rate": 4.893115069199852e-06, + "loss": 1.0447, + "step": 1178 + }, + { + "epoch": 0.6076275554028517, + "grad_norm": 1.3869678974151611, + "learning_rate": 4.8929181035069665e-06, + "loss": 1.1459, + "step": 1179 + }, + { + "epoch": 0.6081429307679093, + "grad_norm": 1.3510606288909912, + "learning_rate": 4.892720960470301e-06, + "loss": 1.0778, + "step": 1180 + }, + { + "epoch": 0.6086583061329669, + "grad_norm": 1.3487348556518555, + "learning_rate": 4.892523640104467e-06, + "loss": 1.0876, + "step": 1181 + }, + { + "epoch": 0.6091736814980244, + "grad_norm": 1.39715576171875, + "learning_rate": 4.8923261424240864e-06, + "loss": 1.116, + "step": 1182 + }, + { + "epoch": 0.609689056863082, + "grad_norm": 1.3872671127319336, + "learning_rate": 4.892128467443797e-06, + "loss": 1.0469, + "step": 1183 + }, + { + "epoch": 0.6102044322281395, + "grad_norm": 1.3705984354019165, + "learning_rate": 4.891930615178251e-06, + "loss": 1.1037, + "step": 1184 + }, + { + "epoch": 0.610719807593197, + "grad_norm": 1.3599284887313843, + "learning_rate": 4.8917325856421095e-06, + "loss": 1.0936, + "step": 1185 + }, + { + "epoch": 0.6112351829582546, + "grad_norm": 1.400292992591858, + "learning_rate": 4.891534378850049e-06, + "loss": 1.055, + "step": 1186 + }, + { + "epoch": 0.6117505583233122, + "grad_norm": 1.4634445905685425, + "learning_rate": 4.891335994816759e-06, + "loss": 1.1246, + "step": 1187 + }, + { + "epoch": 0.6122659336883697, + "grad_norm": 1.3618847131729126, + "learning_rate": 4.891137433556943e-06, + "loss": 1.0879, + "step": 1188 + }, + { + "epoch": 0.6127813090534272, + "grad_norm": 1.3145513534545898, + "learning_rate": 4.890938695085315e-06, + "loss": 1.1384, + "step": 1189 + }, + { + "epoch": 0.6132966844184848, + "grad_norm": 1.292287826538086, + "learning_rate": 4.890739779416606e-06, + "loss": 1.1067, + "step": 1190 + }, + { + "epoch": 0.6138120597835424, + "grad_norm": 1.371002197265625, + "learning_rate": 4.8905406865655576e-06, + "loss": 1.0729, + "step": 1191 + }, + { + "epoch": 0.6143274351485999, + "grad_norm": 1.3559002876281738, + "learning_rate": 4.890341416546923e-06, + "loss": 1.059, + "step": 1192 + }, + { + "epoch": 0.6148428105136574, + "grad_norm": 1.379361867904663, + "learning_rate": 4.890141969375473e-06, + "loss": 1.0391, + "step": 1193 + }, + { + "epoch": 0.615358185878715, + "grad_norm": 1.3302912712097168, + "learning_rate": 4.889942345065987e-06, + "loss": 1.0825, + "step": 1194 + }, + { + "epoch": 0.6158735612437726, + "grad_norm": 1.3227627277374268, + "learning_rate": 4.889742543633261e-06, + "loss": 1.0492, + "step": 1195 + }, + { + "epoch": 0.6163889366088301, + "grad_norm": 1.3230408430099487, + "learning_rate": 4.889542565092103e-06, + "loss": 1.0268, + "step": 1196 + }, + { + "epoch": 0.6169043119738876, + "grad_norm": 1.4277966022491455, + "learning_rate": 4.889342409457332e-06, + "loss": 1.143, + "step": 1197 + }, + { + "epoch": 0.6174196873389451, + "grad_norm": 1.4398748874664307, + "learning_rate": 4.889142076743782e-06, + "loss": 1.0711, + "step": 1198 + }, + { + "epoch": 0.6179350627040028, + "grad_norm": 1.3817853927612305, + "learning_rate": 4.888941566966302e-06, + "loss": 1.0831, + "step": 1199 + }, + { + "epoch": 0.6184504380690603, + "grad_norm": 1.395257830619812, + "learning_rate": 4.888740880139751e-06, + "loss": 1.1395, + "step": 1200 + }, + { + "epoch": 0.6189658134341178, + "grad_norm": 1.3928101062774658, + "learning_rate": 4.888540016279002e-06, + "loss": 1.1209, + "step": 1201 + }, + { + "epoch": 0.6194811887991754, + "grad_norm": 1.3110088109970093, + "learning_rate": 4.888338975398941e-06, + "loss": 1.0692, + "step": 1202 + }, + { + "epoch": 0.619996564164233, + "grad_norm": 1.3726956844329834, + "learning_rate": 4.8881377575144695e-06, + "loss": 1.156, + "step": 1203 + }, + { + "epoch": 0.6205119395292905, + "grad_norm": 1.4747378826141357, + "learning_rate": 4.887936362640498e-06, + "loss": 1.0864, + "step": 1204 + }, + { + "epoch": 0.621027314894348, + "grad_norm": 1.3230063915252686, + "learning_rate": 4.887734790791954e-06, + "loss": 1.0657, + "step": 1205 + }, + { + "epoch": 0.6215426902594056, + "grad_norm": 1.4198203086853027, + "learning_rate": 4.8875330419837755e-06, + "loss": 1.085, + "step": 1206 + }, + { + "epoch": 0.6220580656244632, + "grad_norm": 1.403670072555542, + "learning_rate": 4.887331116230914e-06, + "loss": 1.137, + "step": 1207 + }, + { + "epoch": 0.6225734409895207, + "grad_norm": 1.3896167278289795, + "learning_rate": 4.8871290135483355e-06, + "loss": 1.1539, + "step": 1208 + }, + { + "epoch": 0.6230888163545782, + "grad_norm": 1.440527081489563, + "learning_rate": 4.886926733951017e-06, + "loss": 1.1683, + "step": 1209 + }, + { + "epoch": 0.6236041917196358, + "grad_norm": 1.488156795501709, + "learning_rate": 4.886724277453952e-06, + "loss": 1.1122, + "step": 1210 + }, + { + "epoch": 0.6241195670846934, + "grad_norm": 1.3573834896087646, + "learning_rate": 4.886521644072143e-06, + "loss": 1.0381, + "step": 1211 + }, + { + "epoch": 0.6246349424497509, + "grad_norm": 1.3494553565979004, + "learning_rate": 4.886318833820608e-06, + "loss": 1.0298, + "step": 1212 + }, + { + "epoch": 0.6251503178148085, + "grad_norm": 1.4108291864395142, + "learning_rate": 4.886115846714379e-06, + "loss": 1.1535, + "step": 1213 + }, + { + "epoch": 0.625665693179866, + "grad_norm": 1.3653464317321777, + "learning_rate": 4.885912682768498e-06, + "loss": 1.0589, + "step": 1214 + }, + { + "epoch": 0.6261810685449235, + "grad_norm": 1.397425651550293, + "learning_rate": 4.885709341998022e-06, + "loss": 1.1063, + "step": 1215 + }, + { + "epoch": 0.6266964439099811, + "grad_norm": 1.334436297416687, + "learning_rate": 4.885505824418022e-06, + "loss": 1.04, + "step": 1216 + }, + { + "epoch": 0.6272118192750387, + "grad_norm": 1.3866463899612427, + "learning_rate": 4.88530213004358e-06, + "loss": 1.0938, + "step": 1217 + }, + { + "epoch": 0.6277271946400962, + "grad_norm": 1.3958983421325684, + "learning_rate": 4.885098258889793e-06, + "loss": 1.1031, + "step": 1218 + }, + { + "epoch": 0.6282425700051537, + "grad_norm": 1.3861192464828491, + "learning_rate": 4.88489421097177e-06, + "loss": 1.0862, + "step": 1219 + }, + { + "epoch": 0.6287579453702113, + "grad_norm": 1.3057092428207397, + "learning_rate": 4.884689986304634e-06, + "loss": 1.0929, + "step": 1220 + }, + { + "epoch": 0.6292733207352689, + "grad_norm": 1.4449186325073242, + "learning_rate": 4.88448558490352e-06, + "loss": 1.1231, + "step": 1221 + }, + { + "epoch": 0.6297886961003264, + "grad_norm": 1.2825870513916016, + "learning_rate": 4.884281006783576e-06, + "loss": 1.0594, + "step": 1222 + }, + { + "epoch": 0.6303040714653839, + "grad_norm": 1.310887336730957, + "learning_rate": 4.8840762519599635e-06, + "loss": 1.1037, + "step": 1223 + }, + { + "epoch": 0.6308194468304416, + "grad_norm": 1.440755844116211, + "learning_rate": 4.883871320447859e-06, + "loss": 1.1015, + "step": 1224 + }, + { + "epoch": 0.6313348221954991, + "grad_norm": 1.355614185333252, + "learning_rate": 4.883666212262448e-06, + "loss": 1.0333, + "step": 1225 + }, + { + "epoch": 0.6318501975605566, + "grad_norm": 1.4518755674362183, + "learning_rate": 4.883460927418933e-06, + "loss": 1.0318, + "step": 1226 + }, + { + "epoch": 0.6323655729256141, + "grad_norm": 1.4447261095046997, + "learning_rate": 4.883255465932529e-06, + "loss": 1.1, + "step": 1227 + }, + { + "epoch": 0.6328809482906717, + "grad_norm": 1.4496663808822632, + "learning_rate": 4.883049827818461e-06, + "loss": 1.0935, + "step": 1228 + }, + { + "epoch": 0.6333963236557293, + "grad_norm": 1.4826685190200806, + "learning_rate": 4.88284401309197e-06, + "loss": 1.1498, + "step": 1229 + }, + { + "epoch": 0.6339116990207868, + "grad_norm": 1.4179977178573608, + "learning_rate": 4.882638021768311e-06, + "loss": 1.0997, + "step": 1230 + }, + { + "epoch": 0.6344270743858443, + "grad_norm": 1.2896610498428345, + "learning_rate": 4.882431853862748e-06, + "loss": 1.0368, + "step": 1231 + }, + { + "epoch": 0.6349424497509019, + "grad_norm": 1.4019243717193604, + "learning_rate": 4.8822255093905615e-06, + "loss": 1.0817, + "step": 1232 + }, + { + "epoch": 0.6354578251159595, + "grad_norm": 1.3491712808609009, + "learning_rate": 4.882018988367045e-06, + "loss": 1.0498, + "step": 1233 + }, + { + "epoch": 0.635973200481017, + "grad_norm": 1.3700180053710938, + "learning_rate": 4.881812290807502e-06, + "loss": 1.0581, + "step": 1234 + }, + { + "epoch": 0.6364885758460745, + "grad_norm": 1.420124888420105, + "learning_rate": 4.881605416727252e-06, + "loss": 1.1296, + "step": 1235 + }, + { + "epoch": 0.6370039512111321, + "grad_norm": 1.5151277780532837, + "learning_rate": 4.881398366141628e-06, + "loss": 1.0867, + "step": 1236 + }, + { + "epoch": 0.6375193265761897, + "grad_norm": 1.4695781469345093, + "learning_rate": 4.881191139065975e-06, + "loss": 1.0429, + "step": 1237 + }, + { + "epoch": 0.6380347019412472, + "grad_norm": 1.4124763011932373, + "learning_rate": 4.880983735515649e-06, + "loss": 1.0662, + "step": 1238 + }, + { + "epoch": 0.6385500773063048, + "grad_norm": 1.4462312459945679, + "learning_rate": 4.8807761555060225e-06, + "loss": 1.1051, + "step": 1239 + }, + { + "epoch": 0.6390654526713623, + "grad_norm": 1.4130510091781616, + "learning_rate": 4.88056839905248e-06, + "loss": 1.0944, + "step": 1240 + }, + { + "epoch": 0.6395808280364199, + "grad_norm": 1.4933778047561646, + "learning_rate": 4.880360466170417e-06, + "loss": 1.0561, + "step": 1241 + }, + { + "epoch": 0.6400962034014774, + "grad_norm": 1.3993109464645386, + "learning_rate": 4.880152356875247e-06, + "loss": 1.084, + "step": 1242 + }, + { + "epoch": 0.640611578766535, + "grad_norm": 1.3925676345825195, + "learning_rate": 4.87994407118239e-06, + "loss": 1.058, + "step": 1243 + }, + { + "epoch": 0.6411269541315925, + "grad_norm": 1.4072788953781128, + "learning_rate": 4.8797356091072834e-06, + "loss": 1.1107, + "step": 1244 + }, + { + "epoch": 0.64164232949665, + "grad_norm": 1.4816721677780151, + "learning_rate": 4.879526970665378e-06, + "loss": 1.1502, + "step": 1245 + }, + { + "epoch": 0.6421577048617076, + "grad_norm": 1.3737413883209229, + "learning_rate": 4.8793181558721355e-06, + "loss": 1.0962, + "step": 1246 + }, + { + "epoch": 0.6426730802267652, + "grad_norm": 1.4381495714187622, + "learning_rate": 4.879109164743031e-06, + "loss": 1.1229, + "step": 1247 + }, + { + "epoch": 0.6431884555918227, + "grad_norm": 1.4116510152816772, + "learning_rate": 4.878899997293554e-06, + "loss": 1.1239, + "step": 1248 + }, + { + "epoch": 0.6437038309568802, + "grad_norm": 1.3602337837219238, + "learning_rate": 4.878690653539207e-06, + "loss": 1.0246, + "step": 1249 + }, + { + "epoch": 0.6442192063219379, + "grad_norm": 1.4238361120224, + "learning_rate": 4.878481133495503e-06, + "loss": 1.1035, + "step": 1250 + }, + { + "epoch": 0.6447345816869954, + "grad_norm": 1.3942928314208984, + "learning_rate": 4.878271437177972e-06, + "loss": 1.0803, + "step": 1251 + }, + { + "epoch": 0.6452499570520529, + "grad_norm": 1.4026683568954468, + "learning_rate": 4.878061564602153e-06, + "loss": 1.1248, + "step": 1252 + }, + { + "epoch": 0.6457653324171104, + "grad_norm": 1.600054144859314, + "learning_rate": 4.877851515783601e-06, + "loss": 1.0738, + "step": 1253 + }, + { + "epoch": 0.6462807077821681, + "grad_norm": 1.4209798574447632, + "learning_rate": 4.8776412907378845e-06, + "loss": 1.0954, + "step": 1254 + }, + { + "epoch": 0.6467960831472256, + "grad_norm": 1.3372230529785156, + "learning_rate": 4.877430889480581e-06, + "loss": 1.0453, + "step": 1255 + }, + { + "epoch": 0.6473114585122831, + "grad_norm": 1.3959600925445557, + "learning_rate": 4.877220312027285e-06, + "loss": 1.0663, + "step": 1256 + }, + { + "epoch": 0.6478268338773406, + "grad_norm": 1.451054334640503, + "learning_rate": 4.877009558393604e-06, + "loss": 1.1363, + "step": 1257 + }, + { + "epoch": 0.6483422092423982, + "grad_norm": 1.4474360942840576, + "learning_rate": 4.876798628595156e-06, + "loss": 1.1172, + "step": 1258 + }, + { + "epoch": 0.6488575846074558, + "grad_norm": 1.4710158109664917, + "learning_rate": 4.876587522647573e-06, + "loss": 1.1213, + "step": 1259 + }, + { + "epoch": 0.6493729599725133, + "grad_norm": 1.3668112754821777, + "learning_rate": 4.876376240566501e-06, + "loss": 1.1113, + "step": 1260 + }, + { + "epoch": 0.6498883353375708, + "grad_norm": 1.3233379125595093, + "learning_rate": 4.876164782367599e-06, + "loss": 1.1201, + "step": 1261 + }, + { + "epoch": 0.6504037107026284, + "grad_norm": 1.504072904586792, + "learning_rate": 4.875953148066538e-06, + "loss": 1.075, + "step": 1262 + }, + { + "epoch": 0.650919086067686, + "grad_norm": 1.4190407991409302, + "learning_rate": 4.875741337679004e-06, + "loss": 1.0805, + "step": 1263 + }, + { + "epoch": 0.6514344614327435, + "grad_norm": 1.443258285522461, + "learning_rate": 4.875529351220692e-06, + "loss": 1.1517, + "step": 1264 + }, + { + "epoch": 0.651949836797801, + "grad_norm": 1.3786828517913818, + "learning_rate": 4.875317188707315e-06, + "loss": 1.1067, + "step": 1265 + }, + { + "epoch": 0.6524652121628586, + "grad_norm": 1.3303650617599487, + "learning_rate": 4.875104850154596e-06, + "loss": 1.0173, + "step": 1266 + }, + { + "epoch": 0.6529805875279162, + "grad_norm": 1.3950949907302856, + "learning_rate": 4.8748923355782715e-06, + "loss": 1.0967, + "step": 1267 + }, + { + "epoch": 0.6534959628929737, + "grad_norm": 1.3890631198883057, + "learning_rate": 4.874679644994092e-06, + "loss": 1.1126, + "step": 1268 + }, + { + "epoch": 0.6540113382580313, + "grad_norm": 1.3618621826171875, + "learning_rate": 4.874466778417819e-06, + "loss": 1.115, + "step": 1269 + }, + { + "epoch": 0.6545267136230888, + "grad_norm": 1.344192385673523, + "learning_rate": 4.874253735865231e-06, + "loss": 1.0854, + "step": 1270 + }, + { + "epoch": 0.6550420889881464, + "grad_norm": 1.4194031953811646, + "learning_rate": 4.874040517352116e-06, + "loss": 1.1088, + "step": 1271 + }, + { + "epoch": 0.655557464353204, + "grad_norm": 1.3026530742645264, + "learning_rate": 4.873827122894274e-06, + "loss": 1.1148, + "step": 1272 + }, + { + "epoch": 0.6560728397182615, + "grad_norm": 1.3492462635040283, + "learning_rate": 4.873613552507522e-06, + "loss": 1.0606, + "step": 1273 + }, + { + "epoch": 0.656588215083319, + "grad_norm": 1.3427256345748901, + "learning_rate": 4.873399806207688e-06, + "loss": 1.0943, + "step": 1274 + }, + { + "epoch": 0.6571035904483765, + "grad_norm": 1.3541367053985596, + "learning_rate": 4.8731858840106136e-06, + "loss": 1.1692, + "step": 1275 + }, + { + "epoch": 0.6576189658134342, + "grad_norm": 1.361036777496338, + "learning_rate": 4.8729717859321515e-06, + "loss": 1.173, + "step": 1276 + }, + { + "epoch": 0.6581343411784917, + "grad_norm": 1.497694492340088, + "learning_rate": 4.8727575119881705e-06, + "loss": 1.1123, + "step": 1277 + }, + { + "epoch": 0.6586497165435492, + "grad_norm": 1.354684591293335, + "learning_rate": 4.872543062194549e-06, + "loss": 1.0228, + "step": 1278 + }, + { + "epoch": 0.6591650919086067, + "grad_norm": 1.4002902507781982, + "learning_rate": 4.872328436567183e-06, + "loss": 1.0547, + "step": 1279 + }, + { + "epoch": 0.6596804672736644, + "grad_norm": 1.4142106771469116, + "learning_rate": 4.872113635121975e-06, + "loss": 1.0316, + "step": 1280 + }, + { + "epoch": 0.6601958426387219, + "grad_norm": 1.3479201793670654, + "learning_rate": 4.871898657874848e-06, + "loss": 1.0617, + "step": 1281 + }, + { + "epoch": 0.6607112180037794, + "grad_norm": 1.5031355619430542, + "learning_rate": 4.871683504841734e-06, + "loss": 1.1113, + "step": 1282 + }, + { + "epoch": 0.6612265933688369, + "grad_norm": 1.39547598361969, + "learning_rate": 4.871468176038575e-06, + "loss": 1.0954, + "step": 1283 + }, + { + "epoch": 0.6617419687338946, + "grad_norm": 1.3589881658554077, + "learning_rate": 4.8712526714813335e-06, + "loss": 1.0636, + "step": 1284 + }, + { + "epoch": 0.6622573440989521, + "grad_norm": 1.3259470462799072, + "learning_rate": 4.8710369911859784e-06, + "loss": 1.0264, + "step": 1285 + }, + { + "epoch": 0.6627727194640096, + "grad_norm": 1.4223607778549194, + "learning_rate": 4.870821135168495e-06, + "loss": 1.0917, + "step": 1286 + }, + { + "epoch": 0.6632880948290671, + "grad_norm": 1.3349062204360962, + "learning_rate": 4.870605103444881e-06, + "loss": 1.0898, + "step": 1287 + }, + { + "epoch": 0.6638034701941247, + "grad_norm": 1.3264777660369873, + "learning_rate": 4.870388896031147e-06, + "loss": 1.0527, + "step": 1288 + }, + { + "epoch": 0.6643188455591823, + "grad_norm": 1.4408807754516602, + "learning_rate": 4.870172512943316e-06, + "loss": 1.1317, + "step": 1289 + }, + { + "epoch": 0.6648342209242398, + "grad_norm": 1.5054023265838623, + "learning_rate": 4.869955954197425e-06, + "loss": 1.0686, + "step": 1290 + }, + { + "epoch": 0.6653495962892974, + "grad_norm": 1.5121653079986572, + "learning_rate": 4.869739219809523e-06, + "loss": 1.0969, + "step": 1291 + }, + { + "epoch": 0.6658649716543549, + "grad_norm": 1.376417636871338, + "learning_rate": 4.869522309795673e-06, + "loss": 1.0612, + "step": 1292 + }, + { + "epoch": 0.6663803470194125, + "grad_norm": 1.3991574048995972, + "learning_rate": 4.8693052241719515e-06, + "loss": 1.0951, + "step": 1293 + }, + { + "epoch": 0.66689572238447, + "grad_norm": 1.3785655498504639, + "learning_rate": 4.869087962954446e-06, + "loss": 1.071, + "step": 1294 + }, + { + "epoch": 0.6674110977495276, + "grad_norm": 1.4480599164962769, + "learning_rate": 4.868870526159258e-06, + "loss": 1.1226, + "step": 1295 + }, + { + "epoch": 0.6679264731145851, + "grad_norm": 1.4805289506912231, + "learning_rate": 4.868652913802504e-06, + "loss": 1.094, + "step": 1296 + }, + { + "epoch": 0.6684418484796427, + "grad_norm": 1.3418303728103638, + "learning_rate": 4.86843512590031e-06, + "loss": 1.0665, + "step": 1297 + }, + { + "epoch": 0.6689572238447002, + "grad_norm": 1.3156306743621826, + "learning_rate": 4.868217162468815e-06, + "loss": 1.0971, + "step": 1298 + }, + { + "epoch": 0.6694725992097578, + "grad_norm": 1.429788589477539, + "learning_rate": 4.867999023524176e-06, + "loss": 1.1008, + "step": 1299 + }, + { + "epoch": 0.6699879745748153, + "grad_norm": 1.3980108499526978, + "learning_rate": 4.867780709082559e-06, + "loss": 1.1053, + "step": 1300 + }, + { + "epoch": 0.6705033499398729, + "grad_norm": 1.454076886177063, + "learning_rate": 4.867562219160143e-06, + "loss": 1.1428, + "step": 1301 + }, + { + "epoch": 0.6710187253049305, + "grad_norm": 1.5541733503341675, + "learning_rate": 4.86734355377312e-06, + "loss": 1.0759, + "step": 1302 + }, + { + "epoch": 0.671534100669988, + "grad_norm": 1.4767528772354126, + "learning_rate": 4.867124712937696e-06, + "loss": 1.0519, + "step": 1303 + }, + { + "epoch": 0.6720494760350455, + "grad_norm": 1.4581927061080933, + "learning_rate": 4.8669056966700914e-06, + "loss": 1.0816, + "step": 1304 + }, + { + "epoch": 0.672564851400103, + "grad_norm": 1.389357089996338, + "learning_rate": 4.8666865049865356e-06, + "loss": 1.1189, + "step": 1305 + }, + { + "epoch": 0.6730802267651607, + "grad_norm": 1.36599600315094, + "learning_rate": 4.8664671379032755e-06, + "loss": 1.1449, + "step": 1306 + }, + { + "epoch": 0.6735956021302182, + "grad_norm": 1.364134430885315, + "learning_rate": 4.866247595436568e-06, + "loss": 1.1239, + "step": 1307 + }, + { + "epoch": 0.6741109774952757, + "grad_norm": 1.476054310798645, + "learning_rate": 4.866027877602682e-06, + "loss": 1.1238, + "step": 1308 + }, + { + "epoch": 0.6746263528603332, + "grad_norm": 1.428935170173645, + "learning_rate": 4.865807984417903e-06, + "loss": 1.073, + "step": 1309 + }, + { + "epoch": 0.6751417282253909, + "grad_norm": 1.4435595273971558, + "learning_rate": 4.865587915898528e-06, + "loss": 1.1536, + "step": 1310 + }, + { + "epoch": 0.6756571035904484, + "grad_norm": 1.5261706113815308, + "learning_rate": 4.8653676720608674e-06, + "loss": 1.0984, + "step": 1311 + }, + { + "epoch": 0.6761724789555059, + "grad_norm": 1.401324987411499, + "learning_rate": 4.865147252921242e-06, + "loss": 1.0563, + "step": 1312 + }, + { + "epoch": 0.6766878543205634, + "grad_norm": 1.2494412660598755, + "learning_rate": 4.8649266584959875e-06, + "loss": 1.0598, + "step": 1313 + }, + { + "epoch": 0.6772032296856211, + "grad_norm": 1.3429361581802368, + "learning_rate": 4.864705888801454e-06, + "loss": 1.1142, + "step": 1314 + }, + { + "epoch": 0.6777186050506786, + "grad_norm": 1.407556176185608, + "learning_rate": 4.864484943854002e-06, + "loss": 1.0681, + "step": 1315 + }, + { + "epoch": 0.6782339804157361, + "grad_norm": 1.3672014474868774, + "learning_rate": 4.864263823670006e-06, + "loss": 1.1239, + "step": 1316 + }, + { + "epoch": 0.6787493557807937, + "grad_norm": 1.4719024896621704, + "learning_rate": 4.864042528265855e-06, + "loss": 1.1257, + "step": 1317 + }, + { + "epoch": 0.6792647311458512, + "grad_norm": 1.3469836711883545, + "learning_rate": 4.863821057657949e-06, + "loss": 1.0655, + "step": 1318 + }, + { + "epoch": 0.6797801065109088, + "grad_norm": 1.4195460081100464, + "learning_rate": 4.863599411862701e-06, + "loss": 1.0797, + "step": 1319 + }, + { + "epoch": 0.6802954818759663, + "grad_norm": 1.378151297569275, + "learning_rate": 4.863377590896539e-06, + "loss": 1.0966, + "step": 1320 + }, + { + "epoch": 0.6808108572410239, + "grad_norm": 1.3662402629852295, + "learning_rate": 4.863155594775901e-06, + "loss": 1.1436, + "step": 1321 + }, + { + "epoch": 0.6813262326060814, + "grad_norm": 1.300870418548584, + "learning_rate": 4.86293342351724e-06, + "loss": 1.1113, + "step": 1322 + }, + { + "epoch": 0.681841607971139, + "grad_norm": 1.332433819770813, + "learning_rate": 4.862711077137022e-06, + "loss": 1.0535, + "step": 1323 + }, + { + "epoch": 0.6823569833361965, + "grad_norm": 1.4568887948989868, + "learning_rate": 4.8624885556517255e-06, + "loss": 1.096, + "step": 1324 + }, + { + "epoch": 0.6828723587012541, + "grad_norm": 1.3251585960388184, + "learning_rate": 4.8622658590778414e-06, + "loss": 1.0773, + "step": 1325 + }, + { + "epoch": 0.6833877340663116, + "grad_norm": 1.4032732248306274, + "learning_rate": 4.8620429874318745e-06, + "loss": 1.1054, + "step": 1326 + }, + { + "epoch": 0.6839031094313692, + "grad_norm": 1.4974113702774048, + "learning_rate": 4.861819940730342e-06, + "loss": 1.1095, + "step": 1327 + }, + { + "epoch": 0.6844184847964268, + "grad_norm": 1.4674794673919678, + "learning_rate": 4.861596718989775e-06, + "loss": 1.0566, + "step": 1328 + }, + { + "epoch": 0.6849338601614843, + "grad_norm": 1.4152774810791016, + "learning_rate": 4.861373322226717e-06, + "loss": 1.1515, + "step": 1329 + }, + { + "epoch": 0.6854492355265418, + "grad_norm": 1.5060356855392456, + "learning_rate": 4.861149750457724e-06, + "loss": 1.1229, + "step": 1330 + }, + { + "epoch": 0.6859646108915994, + "grad_norm": 1.3054554462432861, + "learning_rate": 4.8609260036993645e-06, + "loss": 1.0185, + "step": 1331 + }, + { + "epoch": 0.686479986256657, + "grad_norm": 1.4473235607147217, + "learning_rate": 4.8607020819682215e-06, + "loss": 1.086, + "step": 1332 + }, + { + "epoch": 0.6869953616217145, + "grad_norm": 1.3452242612838745, + "learning_rate": 4.860477985280891e-06, + "loss": 1.1032, + "step": 1333 + }, + { + "epoch": 0.687510736986772, + "grad_norm": 1.418095588684082, + "learning_rate": 4.86025371365398e-06, + "loss": 1.0907, + "step": 1334 + }, + { + "epoch": 0.6880261123518295, + "grad_norm": 1.3405357599258423, + "learning_rate": 4.86002926710411e-06, + "loss": 1.1112, + "step": 1335 + }, + { + "epoch": 0.6885414877168872, + "grad_norm": 1.4815081357955933, + "learning_rate": 4.859804645647916e-06, + "loss": 1.1308, + "step": 1336 + }, + { + "epoch": 0.6890568630819447, + "grad_norm": 1.3682807683944702, + "learning_rate": 4.859579849302044e-06, + "loss": 1.1069, + "step": 1337 + }, + { + "epoch": 0.6895722384470022, + "grad_norm": 1.3925576210021973, + "learning_rate": 4.8593548780831545e-06, + "loss": 1.1286, + "step": 1338 + }, + { + "epoch": 0.6900876138120597, + "grad_norm": 1.3944441080093384, + "learning_rate": 4.859129732007921e-06, + "loss": 1.115, + "step": 1339 + }, + { + "epoch": 0.6906029891771174, + "grad_norm": 1.5272127389907837, + "learning_rate": 4.85890441109303e-06, + "loss": 1.0707, + "step": 1340 + }, + { + "epoch": 0.6911183645421749, + "grad_norm": 1.3506841659545898, + "learning_rate": 4.858678915355178e-06, + "loss": 1.098, + "step": 1341 + }, + { + "epoch": 0.6916337399072324, + "grad_norm": 1.3435107469558716, + "learning_rate": 4.85845324481108e-06, + "loss": 1.0728, + "step": 1342 + }, + { + "epoch": 0.69214911527229, + "grad_norm": 1.4491636753082275, + "learning_rate": 4.858227399477458e-06, + "loss": 1.1069, + "step": 1343 + }, + { + "epoch": 0.6926644906373476, + "grad_norm": 1.4442695379257202, + "learning_rate": 4.858001379371052e-06, + "loss": 1.1027, + "step": 1344 + }, + { + "epoch": 0.6931798660024051, + "grad_norm": 1.373400092124939, + "learning_rate": 4.857775184508613e-06, + "loss": 1.1197, + "step": 1345 + }, + { + "epoch": 0.6936952413674626, + "grad_norm": 1.4126453399658203, + "learning_rate": 4.857548814906903e-06, + "loss": 1.0845, + "step": 1346 + }, + { + "epoch": 0.6942106167325202, + "grad_norm": 1.4741835594177246, + "learning_rate": 4.857322270582699e-06, + "loss": 1.0952, + "step": 1347 + }, + { + "epoch": 0.6947259920975777, + "grad_norm": 1.427833914756775, + "learning_rate": 4.857095551552792e-06, + "loss": 1.0763, + "step": 1348 + }, + { + "epoch": 0.6952413674626353, + "grad_norm": 1.4624282121658325, + "learning_rate": 4.856868657833983e-06, + "loss": 1.093, + "step": 1349 + }, + { + "epoch": 0.6957567428276928, + "grad_norm": 1.4597184658050537, + "learning_rate": 4.856641589443089e-06, + "loss": 1.0819, + "step": 1350 + }, + { + "epoch": 0.6962721181927504, + "grad_norm": 1.4595695734024048, + "learning_rate": 4.856414346396937e-06, + "loss": 1.1036, + "step": 1351 + }, + { + "epoch": 0.6967874935578079, + "grad_norm": 1.4645100831985474, + "learning_rate": 4.85618692871237e-06, + "loss": 1.0967, + "step": 1352 + }, + { + "epoch": 0.6973028689228655, + "grad_norm": 1.4653542041778564, + "learning_rate": 4.855959336406241e-06, + "loss": 1.0282, + "step": 1353 + }, + { + "epoch": 0.697818244287923, + "grad_norm": 1.3708640336990356, + "learning_rate": 4.855731569495417e-06, + "loss": 1.1137, + "step": 1354 + }, + { + "epoch": 0.6983336196529806, + "grad_norm": 1.3453445434570312, + "learning_rate": 4.855503627996781e-06, + "loss": 1.0917, + "step": 1355 + }, + { + "epoch": 0.6988489950180381, + "grad_norm": 1.4275254011154175, + "learning_rate": 4.855275511927223e-06, + "loss": 1.1415, + "step": 1356 + }, + { + "epoch": 0.6993643703830957, + "grad_norm": 1.384270191192627, + "learning_rate": 4.8550472213036505e-06, + "loss": 1.1024, + "step": 1357 + }, + { + "epoch": 0.6998797457481533, + "grad_norm": 1.4288511276245117, + "learning_rate": 4.854818756142982e-06, + "loss": 1.0809, + "step": 1358 + }, + { + "epoch": 0.7003951211132108, + "grad_norm": 1.397618293762207, + "learning_rate": 4.85459011646215e-06, + "loss": 1.1454, + "step": 1359 + }, + { + "epoch": 0.7009104964782683, + "grad_norm": 1.3595001697540283, + "learning_rate": 4.8543613022781e-06, + "loss": 1.1873, + "step": 1360 + }, + { + "epoch": 0.7014258718433259, + "grad_norm": 1.2345764636993408, + "learning_rate": 4.854132313607789e-06, + "loss": 1.0707, + "step": 1361 + }, + { + "epoch": 0.7019412472083835, + "grad_norm": 1.4134441614151, + "learning_rate": 4.853903150468187e-06, + "loss": 1.1097, + "step": 1362 + }, + { + "epoch": 0.702456622573441, + "grad_norm": 1.4602175951004028, + "learning_rate": 4.85367381287628e-06, + "loss": 1.1177, + "step": 1363 + }, + { + "epoch": 0.7029719979384985, + "grad_norm": 1.4819403886795044, + "learning_rate": 4.853444300849064e-06, + "loss": 1.1511, + "step": 1364 + }, + { + "epoch": 0.703487373303556, + "grad_norm": 1.4806100130081177, + "learning_rate": 4.853214614403546e-06, + "loss": 1.0773, + "step": 1365 + }, + { + "epoch": 0.7040027486686137, + "grad_norm": 1.4940937757492065, + "learning_rate": 4.852984753556752e-06, + "loss": 1.0999, + "step": 1366 + }, + { + "epoch": 0.7045181240336712, + "grad_norm": 1.3197970390319824, + "learning_rate": 4.852754718325715e-06, + "loss": 1.0316, + "step": 1367 + }, + { + "epoch": 0.7050334993987287, + "grad_norm": 1.3321506977081299, + "learning_rate": 4.852524508727485e-06, + "loss": 1.0527, + "step": 1368 + }, + { + "epoch": 0.7055488747637862, + "grad_norm": 1.4333229064941406, + "learning_rate": 4.8522941247791225e-06, + "loss": 1.0176, + "step": 1369 + }, + { + "epoch": 0.7060642501288439, + "grad_norm": 1.4624855518341064, + "learning_rate": 4.852063566497701e-06, + "loss": 1.1335, + "step": 1370 + }, + { + "epoch": 0.7065796254939014, + "grad_norm": 1.4005401134490967, + "learning_rate": 4.851832833900309e-06, + "loss": 1.0841, + "step": 1371 + }, + { + "epoch": 0.7070950008589589, + "grad_norm": 1.5284771919250488, + "learning_rate": 4.851601927004045e-06, + "loss": 1.1569, + "step": 1372 + }, + { + "epoch": 0.7076103762240165, + "grad_norm": 1.4555931091308594, + "learning_rate": 4.851370845826023e-06, + "loss": 1.0373, + "step": 1373 + }, + { + "epoch": 0.7081257515890741, + "grad_norm": 1.3138989210128784, + "learning_rate": 4.8511395903833695e-06, + "loss": 1.0617, + "step": 1374 + }, + { + "epoch": 0.7086411269541316, + "grad_norm": 1.4344639778137207, + "learning_rate": 4.850908160693222e-06, + "loss": 1.0887, + "step": 1375 + }, + { + "epoch": 0.7091565023191891, + "grad_norm": 1.4773739576339722, + "learning_rate": 4.850676556772733e-06, + "loss": 1.1262, + "step": 1376 + }, + { + "epoch": 0.7096718776842467, + "grad_norm": 1.3824067115783691, + "learning_rate": 4.850444778639067e-06, + "loss": 1.1232, + "step": 1377 + }, + { + "epoch": 0.7101872530493042, + "grad_norm": 1.4284874200820923, + "learning_rate": 4.8502128263094e-06, + "loss": 1.0868, + "step": 1378 + }, + { + "epoch": 0.7107026284143618, + "grad_norm": 1.380581021308899, + "learning_rate": 4.849980699800926e-06, + "loss": 1.0432, + "step": 1379 + }, + { + "epoch": 0.7112180037794193, + "grad_norm": 1.4592028856277466, + "learning_rate": 4.849748399130845e-06, + "loss": 1.054, + "step": 1380 + }, + { + "epoch": 0.7117333791444769, + "grad_norm": 1.3985048532485962, + "learning_rate": 4.849515924316373e-06, + "loss": 1.0763, + "step": 1381 + }, + { + "epoch": 0.7122487545095344, + "grad_norm": 1.3862051963806152, + "learning_rate": 4.849283275374742e-06, + "loss": 1.0463, + "step": 1382 + }, + { + "epoch": 0.712764129874592, + "grad_norm": 1.4439160823822021, + "learning_rate": 4.849050452323193e-06, + "loss": 1.0953, + "step": 1383 + }, + { + "epoch": 0.7132795052396496, + "grad_norm": 1.487596035003662, + "learning_rate": 4.848817455178979e-06, + "loss": 1.0759, + "step": 1384 + }, + { + "epoch": 0.7137948806047071, + "grad_norm": 1.5926995277404785, + "learning_rate": 4.848584283959371e-06, + "loss": 1.0944, + "step": 1385 + }, + { + "epoch": 0.7143102559697646, + "grad_norm": 1.4446570873260498, + "learning_rate": 4.848350938681648e-06, + "loss": 1.0602, + "step": 1386 + }, + { + "epoch": 0.7148256313348222, + "grad_norm": 1.3776057958602905, + "learning_rate": 4.848117419363103e-06, + "loss": 1.0768, + "step": 1387 + }, + { + "epoch": 0.7153410066998798, + "grad_norm": 1.321755051612854, + "learning_rate": 4.847883726021044e-06, + "loss": 1.0443, + "step": 1388 + }, + { + "epoch": 0.7158563820649373, + "grad_norm": 1.3724958896636963, + "learning_rate": 4.847649858672789e-06, + "loss": 1.0562, + "step": 1389 + }, + { + "epoch": 0.7163717574299948, + "grad_norm": 1.5155479907989502, + "learning_rate": 4.847415817335672e-06, + "loss": 1.1133, + "step": 1390 + }, + { + "epoch": 0.7168871327950525, + "grad_norm": 1.4411449432373047, + "learning_rate": 4.847181602027038e-06, + "loss": 1.105, + "step": 1391 + }, + { + "epoch": 0.71740250816011, + "grad_norm": 1.482319951057434, + "learning_rate": 4.846947212764244e-06, + "loss": 1.096, + "step": 1392 + }, + { + "epoch": 0.7179178835251675, + "grad_norm": 1.427704095840454, + "learning_rate": 4.846712649564662e-06, + "loss": 1.0962, + "step": 1393 + }, + { + "epoch": 0.718433258890225, + "grad_norm": 1.517991542816162, + "learning_rate": 4.846477912445675e-06, + "loss": 1.1519, + "step": 1394 + }, + { + "epoch": 0.7189486342552825, + "grad_norm": 1.4028595685958862, + "learning_rate": 4.846243001424681e-06, + "loss": 1.0597, + "step": 1395 + }, + { + "epoch": 0.7194640096203402, + "grad_norm": 1.4641873836517334, + "learning_rate": 4.846007916519089e-06, + "loss": 1.1123, + "step": 1396 + }, + { + "epoch": 0.7199793849853977, + "grad_norm": 1.4246588945388794, + "learning_rate": 4.845772657746321e-06, + "loss": 1.1262, + "step": 1397 + }, + { + "epoch": 0.7204947603504552, + "grad_norm": 1.4003269672393799, + "learning_rate": 4.8455372251238145e-06, + "loss": 1.1096, + "step": 1398 + }, + { + "epoch": 0.7210101357155128, + "grad_norm": 1.3927829265594482, + "learning_rate": 4.845301618669017e-06, + "loss": 1.0745, + "step": 1399 + }, + { + "epoch": 0.7215255110805704, + "grad_norm": 1.4084463119506836, + "learning_rate": 4.845065838399388e-06, + "loss": 1.0459, + "step": 1400 + }, + { + "epoch": 0.7220408864456279, + "grad_norm": 1.4316151142120361, + "learning_rate": 4.844829884332404e-06, + "loss": 1.1075, + "step": 1401 + }, + { + "epoch": 0.7225562618106854, + "grad_norm": 1.3410732746124268, + "learning_rate": 4.84459375648555e-06, + "loss": 1.1076, + "step": 1402 + }, + { + "epoch": 0.723071637175743, + "grad_norm": 1.3912278413772583, + "learning_rate": 4.844357454876327e-06, + "loss": 1.0403, + "step": 1403 + }, + { + "epoch": 0.7235870125408006, + "grad_norm": 1.3922302722930908, + "learning_rate": 4.844120979522248e-06, + "loss": 1.1087, + "step": 1404 + }, + { + "epoch": 0.7241023879058581, + "grad_norm": 1.449554204940796, + "learning_rate": 4.843884330440839e-06, + "loss": 1.0056, + "step": 1405 + }, + { + "epoch": 0.7246177632709156, + "grad_norm": 1.482605218887329, + "learning_rate": 4.843647507649637e-06, + "loss": 1.1308, + "step": 1406 + }, + { + "epoch": 0.7251331386359732, + "grad_norm": 1.3908926248550415, + "learning_rate": 4.843410511166194e-06, + "loss": 1.0587, + "step": 1407 + }, + { + "epoch": 0.7256485140010307, + "grad_norm": 1.4425791501998901, + "learning_rate": 4.843173341008075e-06, + "loss": 1.0651, + "step": 1408 + }, + { + "epoch": 0.7261638893660883, + "grad_norm": 1.3466830253601074, + "learning_rate": 4.8429359971928566e-06, + "loss": 1.0018, + "step": 1409 + }, + { + "epoch": 0.7266792647311459, + "grad_norm": 1.416321873664856, + "learning_rate": 4.8426984797381286e-06, + "loss": 1.0695, + "step": 1410 + }, + { + "epoch": 0.7271946400962034, + "grad_norm": 1.4909610748291016, + "learning_rate": 4.842460788661494e-06, + "loss": 1.0843, + "step": 1411 + }, + { + "epoch": 0.7277100154612609, + "grad_norm": 1.4182647466659546, + "learning_rate": 4.842222923980569e-06, + "loss": 1.1163, + "step": 1412 + }, + { + "epoch": 0.7282253908263185, + "grad_norm": 1.4103379249572754, + "learning_rate": 4.841984885712981e-06, + "loss": 1.0438, + "step": 1413 + }, + { + "epoch": 0.7287407661913761, + "grad_norm": 1.3778151273727417, + "learning_rate": 4.841746673876373e-06, + "loss": 1.0951, + "step": 1414 + }, + { + "epoch": 0.7292561415564336, + "grad_norm": 1.4006578922271729, + "learning_rate": 4.841508288488399e-06, + "loss": 1.0421, + "step": 1415 + }, + { + "epoch": 0.7297715169214911, + "grad_norm": 1.3837188482284546, + "learning_rate": 4.8412697295667255e-06, + "loss": 1.0583, + "step": 1416 + }, + { + "epoch": 0.7302868922865487, + "grad_norm": 1.331186294555664, + "learning_rate": 4.841030997129033e-06, + "loss": 1.0416, + "step": 1417 + }, + { + "epoch": 0.7308022676516063, + "grad_norm": 1.5883468389511108, + "learning_rate": 4.840792091193014e-06, + "loss": 1.159, + "step": 1418 + }, + { + "epoch": 0.7313176430166638, + "grad_norm": 1.3742607831954956, + "learning_rate": 4.840553011776376e-06, + "loss": 1.0589, + "step": 1419 + }, + { + "epoch": 0.7318330183817213, + "grad_norm": 1.5150750875473022, + "learning_rate": 4.8403137588968345e-06, + "loss": 1.1202, + "step": 1420 + }, + { + "epoch": 0.732348393746779, + "grad_norm": 1.5002319812774658, + "learning_rate": 4.8400743325721234e-06, + "loss": 1.0265, + "step": 1421 + }, + { + "epoch": 0.7328637691118365, + "grad_norm": 1.386014461517334, + "learning_rate": 4.839834732819987e-06, + "loss": 1.1345, + "step": 1422 + }, + { + "epoch": 0.733379144476894, + "grad_norm": 1.4680486917495728, + "learning_rate": 4.839594959658181e-06, + "loss": 1.1227, + "step": 1423 + }, + { + "epoch": 0.7338945198419515, + "grad_norm": 1.4484657049179077, + "learning_rate": 4.839355013104477e-06, + "loss": 1.0728, + "step": 1424 + }, + { + "epoch": 0.7344098952070091, + "grad_norm": 1.460443139076233, + "learning_rate": 4.839114893176658e-06, + "loss": 1.1153, + "step": 1425 + }, + { + "epoch": 0.7349252705720667, + "grad_norm": 1.4195655584335327, + "learning_rate": 4.838874599892517e-06, + "loss": 1.04, + "step": 1426 + }, + { + "epoch": 0.7354406459371242, + "grad_norm": 1.4055482149124146, + "learning_rate": 4.838634133269866e-06, + "loss": 1.0857, + "step": 1427 + }, + { + "epoch": 0.7359560213021817, + "grad_norm": 1.3839490413665771, + "learning_rate": 4.8383934933265246e-06, + "loss": 1.0826, + "step": 1428 + }, + { + "epoch": 0.7364713966672393, + "grad_norm": 1.41427743434906, + "learning_rate": 4.838152680080328e-06, + "loss": 1.0739, + "step": 1429 + }, + { + "epoch": 0.7369867720322969, + "grad_norm": 1.4746673107147217, + "learning_rate": 4.837911693549122e-06, + "loss": 1.1009, + "step": 1430 + }, + { + "epoch": 0.7375021473973544, + "grad_norm": 1.4033443927764893, + "learning_rate": 4.837670533750769e-06, + "loss": 1.101, + "step": 1431 + }, + { + "epoch": 0.738017522762412, + "grad_norm": 1.5204118490219116, + "learning_rate": 4.837429200703139e-06, + "loss": 1.1172, + "step": 1432 + }, + { + "epoch": 0.7385328981274695, + "grad_norm": 1.424597978591919, + "learning_rate": 4.837187694424119e-06, + "loss": 1.1488, + "step": 1433 + }, + { + "epoch": 0.7390482734925271, + "grad_norm": 1.4700113534927368, + "learning_rate": 4.836946014931608e-06, + "loss": 1.1524, + "step": 1434 + }, + { + "epoch": 0.7395636488575846, + "grad_norm": 1.5409232378005981, + "learning_rate": 4.836704162243517e-06, + "loss": 1.1443, + "step": 1435 + }, + { + "epoch": 0.7400790242226422, + "grad_norm": 1.4130326509475708, + "learning_rate": 4.836462136377769e-06, + "loss": 1.0748, + "step": 1436 + }, + { + "epoch": 0.7405943995876997, + "grad_norm": 1.3824483156204224, + "learning_rate": 4.836219937352302e-06, + "loss": 1.0329, + "step": 1437 + }, + { + "epoch": 0.7411097749527572, + "grad_norm": 1.4915496110916138, + "learning_rate": 4.835977565185067e-06, + "loss": 1.1141, + "step": 1438 + }, + { + "epoch": 0.7416251503178148, + "grad_norm": 1.395885944366455, + "learning_rate": 4.835735019894025e-06, + "loss": 1.0768, + "step": 1439 + }, + { + "epoch": 0.7421405256828724, + "grad_norm": 1.3711423873901367, + "learning_rate": 4.835492301497151e-06, + "loss": 1.0872, + "step": 1440 + }, + { + "epoch": 0.7426559010479299, + "grad_norm": 1.3521068096160889, + "learning_rate": 4.835249410012435e-06, + "loss": 1.0662, + "step": 1441 + }, + { + "epoch": 0.7431712764129874, + "grad_norm": 1.5525802373886108, + "learning_rate": 4.835006345457878e-06, + "loss": 1.0982, + "step": 1442 + }, + { + "epoch": 0.743686651778045, + "grad_norm": 1.4747167825698853, + "learning_rate": 4.8347631078514925e-06, + "loss": 1.0946, + "step": 1443 + }, + { + "epoch": 0.7442020271431026, + "grad_norm": 1.5162389278411865, + "learning_rate": 4.834519697211307e-06, + "loss": 1.0752, + "step": 1444 + }, + { + "epoch": 0.7447174025081601, + "grad_norm": 1.4368155002593994, + "learning_rate": 4.834276113555359e-06, + "loss": 1.0341, + "step": 1445 + }, + { + "epoch": 0.7452327778732176, + "grad_norm": 1.3954429626464844, + "learning_rate": 4.834032356901704e-06, + "loss": 1.086, + "step": 1446 + }, + { + "epoch": 0.7457481532382753, + "grad_norm": 1.3964476585388184, + "learning_rate": 4.833788427268404e-06, + "loss": 1.0735, + "step": 1447 + }, + { + "epoch": 0.7462635286033328, + "grad_norm": 1.421526551246643, + "learning_rate": 4.833544324673539e-06, + "loss": 1.0678, + "step": 1448 + }, + { + "epoch": 0.7467789039683903, + "grad_norm": 1.5207988023757935, + "learning_rate": 4.8333000491352005e-06, + "loss": 1.1284, + "step": 1449 + }, + { + "epoch": 0.7472942793334478, + "grad_norm": 1.478915810585022, + "learning_rate": 4.8330556006714915e-06, + "loss": 1.0822, + "step": 1450 + }, + { + "epoch": 0.7478096546985055, + "grad_norm": 1.372940182685852, + "learning_rate": 4.832810979300527e-06, + "loss": 1.0774, + "step": 1451 + }, + { + "epoch": 0.748325030063563, + "grad_norm": 1.3675280809402466, + "learning_rate": 4.83256618504044e-06, + "loss": 1.1038, + "step": 1452 + }, + { + "epoch": 0.7488404054286205, + "grad_norm": 1.4146181344985962, + "learning_rate": 4.832321217909368e-06, + "loss": 1.1329, + "step": 1453 + }, + { + "epoch": 0.749355780793678, + "grad_norm": 1.511335015296936, + "learning_rate": 4.832076077925469e-06, + "loss": 1.0523, + "step": 1454 + }, + { + "epoch": 0.7498711561587356, + "grad_norm": 1.5131245851516724, + "learning_rate": 4.8318307651069105e-06, + "loss": 1.0709, + "step": 1455 + }, + { + "epoch": 0.7503865315237932, + "grad_norm": 1.3569769859313965, + "learning_rate": 4.831585279471873e-06, + "loss": 1.0382, + "step": 1456 + }, + { + "epoch": 0.7509019068888507, + "grad_norm": 1.4782321453094482, + "learning_rate": 4.83133962103855e-06, + "loss": 1.0544, + "step": 1457 + }, + { + "epoch": 0.7514172822539082, + "grad_norm": 1.3547550439834595, + "learning_rate": 4.831093789825147e-06, + "loss": 1.0903, + "step": 1458 + }, + { + "epoch": 0.7519326576189658, + "grad_norm": 1.5269999504089355, + "learning_rate": 4.830847785849884e-06, + "loss": 1.1192, + "step": 1459 + }, + { + "epoch": 0.7524480329840234, + "grad_norm": 1.4924741983413696, + "learning_rate": 4.830601609130991e-06, + "loss": 1.0927, + "step": 1460 + }, + { + "epoch": 0.7529634083490809, + "grad_norm": 1.3824459314346313, + "learning_rate": 4.830355259686715e-06, + "loss": 1.0975, + "step": 1461 + }, + { + "epoch": 0.7534787837141385, + "grad_norm": 1.3687019348144531, + "learning_rate": 4.8301087375353125e-06, + "loss": 1.0942, + "step": 1462 + }, + { + "epoch": 0.753994159079196, + "grad_norm": 1.36841881275177, + "learning_rate": 4.829862042695053e-06, + "loss": 1.1201, + "step": 1463 + }, + { + "epoch": 0.7545095344442536, + "grad_norm": 1.3933489322662354, + "learning_rate": 4.82961517518422e-06, + "loss": 1.0588, + "step": 1464 + }, + { + "epoch": 0.7550249098093111, + "grad_norm": 1.4302362203598022, + "learning_rate": 4.829368135021111e-06, + "loss": 1.0806, + "step": 1465 + }, + { + "epoch": 0.7555402851743687, + "grad_norm": 1.4782177209854126, + "learning_rate": 4.829120922224031e-06, + "loss": 1.1152, + "step": 1466 + }, + { + "epoch": 0.7560556605394262, + "grad_norm": 1.412581205368042, + "learning_rate": 4.828873536811305e-06, + "loss": 1.0746, + "step": 1467 + }, + { + "epoch": 0.7565710359044837, + "grad_norm": 1.4294135570526123, + "learning_rate": 4.828625978801264e-06, + "loss": 1.045, + "step": 1468 + }, + { + "epoch": 0.7570864112695413, + "grad_norm": 1.5693269968032837, + "learning_rate": 4.828378248212259e-06, + "loss": 1.102, + "step": 1469 + }, + { + "epoch": 0.7576017866345989, + "grad_norm": 1.4787834882736206, + "learning_rate": 4.828130345062646e-06, + "loss": 1.0813, + "step": 1470 + }, + { + "epoch": 0.7581171619996564, + "grad_norm": 1.4717901945114136, + "learning_rate": 4.8278822693708e-06, + "loss": 1.0525, + "step": 1471 + }, + { + "epoch": 0.7586325373647139, + "grad_norm": 1.3925107717514038, + "learning_rate": 4.827634021155104e-06, + "loss": 1.0394, + "step": 1472 + }, + { + "epoch": 0.7591479127297716, + "grad_norm": 1.4415829181671143, + "learning_rate": 4.827385600433959e-06, + "loss": 1.046, + "step": 1473 + }, + { + "epoch": 0.7596632880948291, + "grad_norm": 1.4588543176651, + "learning_rate": 4.827137007225774e-06, + "loss": 1.062, + "step": 1474 + }, + { + "epoch": 0.7601786634598866, + "grad_norm": 1.485809564590454, + "learning_rate": 4.826888241548973e-06, + "loss": 1.1431, + "step": 1475 + }, + { + "epoch": 0.7606940388249441, + "grad_norm": 1.4579215049743652, + "learning_rate": 4.826639303421993e-06, + "loss": 1.0524, + "step": 1476 + }, + { + "epoch": 0.7612094141900018, + "grad_norm": 1.5078346729278564, + "learning_rate": 4.826390192863283e-06, + "loss": 1.0672, + "step": 1477 + }, + { + "epoch": 0.7617247895550593, + "grad_norm": 1.4479871988296509, + "learning_rate": 4.826140909891306e-06, + "loss": 1.1443, + "step": 1478 + }, + { + "epoch": 0.7622401649201168, + "grad_norm": 1.5223277807235718, + "learning_rate": 4.825891454524535e-06, + "loss": 1.1458, + "step": 1479 + }, + { + "epoch": 0.7627555402851743, + "grad_norm": 1.4447087049484253, + "learning_rate": 4.825641826781459e-06, + "loss": 1.1803, + "step": 1480 + }, + { + "epoch": 0.763270915650232, + "grad_norm": 1.4929414987564087, + "learning_rate": 4.8253920266805775e-06, + "loss": 1.1128, + "step": 1481 + }, + { + "epoch": 0.7637862910152895, + "grad_norm": 1.4813355207443237, + "learning_rate": 4.825142054240405e-06, + "loss": 1.0554, + "step": 1482 + }, + { + "epoch": 0.764301666380347, + "grad_norm": 1.3813947439193726, + "learning_rate": 4.824891909479466e-06, + "loss": 1.0799, + "step": 1483 + }, + { + "epoch": 0.7648170417454045, + "grad_norm": 1.278801679611206, + "learning_rate": 4.824641592416299e-06, + "loss": 1.0465, + "step": 1484 + }, + { + "epoch": 0.7653324171104621, + "grad_norm": 1.435333490371704, + "learning_rate": 4.824391103069457e-06, + "loss": 1.1328, + "step": 1485 + }, + { + "epoch": 0.7658477924755197, + "grad_norm": 1.3309084177017212, + "learning_rate": 4.824140441457504e-06, + "loss": 1.0975, + "step": 1486 + }, + { + "epoch": 0.7663631678405772, + "grad_norm": 1.4422417879104614, + "learning_rate": 4.823889607599016e-06, + "loss": 1.0134, + "step": 1487 + }, + { + "epoch": 0.7668785432056348, + "grad_norm": 1.4464055299758911, + "learning_rate": 4.823638601512583e-06, + "loss": 1.1186, + "step": 1488 + }, + { + "epoch": 0.7673939185706923, + "grad_norm": 1.434095859527588, + "learning_rate": 4.823387423216809e-06, + "loss": 1.1073, + "step": 1489 + }, + { + "epoch": 0.7679092939357499, + "grad_norm": 1.3983433246612549, + "learning_rate": 4.823136072730308e-06, + "loss": 1.1088, + "step": 1490 + }, + { + "epoch": 0.7684246693008074, + "grad_norm": 1.3363049030303955, + "learning_rate": 4.822884550071707e-06, + "loss": 1.0794, + "step": 1491 + }, + { + "epoch": 0.768940044665865, + "grad_norm": 1.4010666608810425, + "learning_rate": 4.822632855259649e-06, + "loss": 1.0241, + "step": 1492 + }, + { + "epoch": 0.7694554200309225, + "grad_norm": 1.3678712844848633, + "learning_rate": 4.822380988312787e-06, + "loss": 1.0472, + "step": 1493 + }, + { + "epoch": 0.7699707953959801, + "grad_norm": 1.4276469945907593, + "learning_rate": 4.8221289492497865e-06, + "loss": 1.0723, + "step": 1494 + }, + { + "epoch": 0.7704861707610376, + "grad_norm": 1.6227951049804688, + "learning_rate": 4.821876738089327e-06, + "loss": 1.0553, + "step": 1495 + }, + { + "epoch": 0.7710015461260952, + "grad_norm": 1.3535913228988647, + "learning_rate": 4.821624354850101e-06, + "loss": 1.0506, + "step": 1496 + }, + { + "epoch": 0.7715169214911527, + "grad_norm": 1.3324251174926758, + "learning_rate": 4.821371799550812e-06, + "loss": 1.0405, + "step": 1497 + }, + { + "epoch": 0.7720322968562102, + "grad_norm": 1.3958076238632202, + "learning_rate": 4.821119072210178e-06, + "loss": 1.0549, + "step": 1498 + }, + { + "epoch": 0.7725476722212679, + "grad_norm": 1.536328911781311, + "learning_rate": 4.820866172846929e-06, + "loss": 1.1138, + "step": 1499 + }, + { + "epoch": 0.7730630475863254, + "grad_norm": 1.4695860147476196, + "learning_rate": 4.820613101479809e-06, + "loss": 1.1443, + "step": 1500 + }, + { + "epoch": 0.7735784229513829, + "grad_norm": 1.4227354526519775, + "learning_rate": 4.820359858127571e-06, + "loss": 1.048, + "step": 1501 + }, + { + "epoch": 0.7740937983164404, + "grad_norm": 1.4998966455459595, + "learning_rate": 4.820106442808985e-06, + "loss": 1.1246, + "step": 1502 + }, + { + "epoch": 0.7746091736814981, + "grad_norm": 1.337685465812683, + "learning_rate": 4.819852855542833e-06, + "loss": 1.0523, + "step": 1503 + }, + { + "epoch": 0.7751245490465556, + "grad_norm": 1.4763025045394897, + "learning_rate": 4.8195990963479074e-06, + "loss": 1.0989, + "step": 1504 + }, + { + "epoch": 0.7756399244116131, + "grad_norm": 1.3346480131149292, + "learning_rate": 4.819345165243015e-06, + "loss": 1.0289, + "step": 1505 + }, + { + "epoch": 0.7761552997766706, + "grad_norm": 1.4553526639938354, + "learning_rate": 4.8190910622469755e-06, + "loss": 1.1123, + "step": 1506 + }, + { + "epoch": 0.7766706751417283, + "grad_norm": 1.4475520849227905, + "learning_rate": 4.81883678737862e-06, + "loss": 1.0793, + "step": 1507 + }, + { + "epoch": 0.7771860505067858, + "grad_norm": 1.3199334144592285, + "learning_rate": 4.818582340656796e-06, + "loss": 1.0676, + "step": 1508 + }, + { + "epoch": 0.7777014258718433, + "grad_norm": 1.5657682418823242, + "learning_rate": 4.818327722100357e-06, + "loss": 1.0967, + "step": 1509 + }, + { + "epoch": 0.7782168012369008, + "grad_norm": 1.5058013200759888, + "learning_rate": 4.818072931728176e-06, + "loss": 1.1101, + "step": 1510 + }, + { + "epoch": 0.7787321766019585, + "grad_norm": 1.399551272392273, + "learning_rate": 4.817817969559137e-06, + "loss": 1.0789, + "step": 1511 + }, + { + "epoch": 0.779247551967016, + "grad_norm": 1.3868203163146973, + "learning_rate": 4.8175628356121315e-06, + "loss": 1.0851, + "step": 1512 + }, + { + "epoch": 0.7797629273320735, + "grad_norm": 1.445791244506836, + "learning_rate": 4.817307529906072e-06, + "loss": 1.0933, + "step": 1513 + }, + { + "epoch": 0.780278302697131, + "grad_norm": 1.4244306087493896, + "learning_rate": 4.817052052459879e-06, + "loss": 1.0602, + "step": 1514 + }, + { + "epoch": 0.7807936780621886, + "grad_norm": 1.3899831771850586, + "learning_rate": 4.816796403292485e-06, + "loss": 1.1358, + "step": 1515 + }, + { + "epoch": 0.7813090534272462, + "grad_norm": 1.469082236289978, + "learning_rate": 4.816540582422838e-06, + "loss": 1.0909, + "step": 1516 + }, + { + "epoch": 0.7818244287923037, + "grad_norm": 1.4377481937408447, + "learning_rate": 4.816284589869895e-06, + "loss": 1.0468, + "step": 1517 + }, + { + "epoch": 0.7823398041573613, + "grad_norm": 1.5348589420318604, + "learning_rate": 4.8160284256526315e-06, + "loss": 1.0998, + "step": 1518 + }, + { + "epoch": 0.7828551795224188, + "grad_norm": 1.4987494945526123, + "learning_rate": 4.81577208979003e-06, + "loss": 1.1229, + "step": 1519 + }, + { + "epoch": 0.7833705548874764, + "grad_norm": 1.4241549968719482, + "learning_rate": 4.815515582301089e-06, + "loss": 1.0814, + "step": 1520 + }, + { + "epoch": 0.783885930252534, + "grad_norm": 1.3505808115005493, + "learning_rate": 4.815258903204818e-06, + "loss": 1.0634, + "step": 1521 + }, + { + "epoch": 0.7844013056175915, + "grad_norm": 1.5867079496383667, + "learning_rate": 4.815002052520242e-06, + "loss": 1.0748, + "step": 1522 + }, + { + "epoch": 0.784916680982649, + "grad_norm": 1.5646687746047974, + "learning_rate": 4.8147450302663935e-06, + "loss": 1.0922, + "step": 1523 + }, + { + "epoch": 0.7854320563477066, + "grad_norm": 1.3784677982330322, + "learning_rate": 4.814487836462323e-06, + "loss": 1.1321, + "step": 1524 + }, + { + "epoch": 0.7859474317127642, + "grad_norm": 1.3318524360656738, + "learning_rate": 4.814230471127092e-06, + "loss": 1.0415, + "step": 1525 + }, + { + "epoch": 0.7864628070778217, + "grad_norm": 1.3590197563171387, + "learning_rate": 4.813972934279773e-06, + "loss": 1.1184, + "step": 1526 + }, + { + "epoch": 0.7869781824428792, + "grad_norm": 1.5370324850082397, + "learning_rate": 4.8137152259394525e-06, + "loss": 1.0759, + "step": 1527 + }, + { + "epoch": 0.7874935578079367, + "grad_norm": 1.4619084596633911, + "learning_rate": 4.813457346125231e-06, + "loss": 1.1344, + "step": 1528 + }, + { + "epoch": 0.7880089331729944, + "grad_norm": 1.4561723470687866, + "learning_rate": 4.81319929485622e-06, + "loss": 1.0823, + "step": 1529 + }, + { + "epoch": 0.7885243085380519, + "grad_norm": 1.4673832654953003, + "learning_rate": 4.8129410721515435e-06, + "loss": 1.119, + "step": 1530 + }, + { + "epoch": 0.7890396839031094, + "grad_norm": 1.5363526344299316, + "learning_rate": 4.8126826780303405e-06, + "loss": 1.1164, + "step": 1531 + }, + { + "epoch": 0.7895550592681669, + "grad_norm": 1.4449119567871094, + "learning_rate": 4.812424112511759e-06, + "loss": 1.0176, + "step": 1532 + }, + { + "epoch": 0.7900704346332246, + "grad_norm": 1.3174402713775635, + "learning_rate": 4.8121653756149634e-06, + "loss": 1.069, + "step": 1533 + }, + { + "epoch": 0.7905858099982821, + "grad_norm": 1.4082295894622803, + "learning_rate": 4.811906467359128e-06, + "loss": 1.0688, + "step": 1534 + }, + { + "epoch": 0.7911011853633396, + "grad_norm": 1.4260579347610474, + "learning_rate": 4.811647387763442e-06, + "loss": 1.1151, + "step": 1535 + }, + { + "epoch": 0.7916165607283971, + "grad_norm": 1.3793349266052246, + "learning_rate": 4.811388136847106e-06, + "loss": 1.0744, + "step": 1536 + }, + { + "epoch": 0.7921319360934548, + "grad_norm": 1.3310152292251587, + "learning_rate": 4.811128714629333e-06, + "loss": 1.103, + "step": 1537 + }, + { + "epoch": 0.7926473114585123, + "grad_norm": 1.4189320802688599, + "learning_rate": 4.8108691211293505e-06, + "loss": 1.0989, + "step": 1538 + }, + { + "epoch": 0.7931626868235698, + "grad_norm": 1.5097674131393433, + "learning_rate": 4.810609356366396e-06, + "loss": 1.1754, + "step": 1539 + }, + { + "epoch": 0.7936780621886274, + "grad_norm": 1.4355213642120361, + "learning_rate": 4.810349420359722e-06, + "loss": 1.097, + "step": 1540 + }, + { + "epoch": 0.794193437553685, + "grad_norm": 1.5523383617401123, + "learning_rate": 4.810089313128593e-06, + "loss": 1.1276, + "step": 1541 + }, + { + "epoch": 0.7947088129187425, + "grad_norm": 1.373019814491272, + "learning_rate": 4.809829034692285e-06, + "loss": 1.0699, + "step": 1542 + }, + { + "epoch": 0.7952241882838, + "grad_norm": 1.4765411615371704, + "learning_rate": 4.809568585070089e-06, + "loss": 1.0658, + "step": 1543 + }, + { + "epoch": 0.7957395636488576, + "grad_norm": 1.4936842918395996, + "learning_rate": 4.809307964281308e-06, + "loss": 1.0711, + "step": 1544 + }, + { + "epoch": 0.7962549390139151, + "grad_norm": 1.3967301845550537, + "learning_rate": 4.8090471723452545e-06, + "loss": 1.1055, + "step": 1545 + }, + { + "epoch": 0.7967703143789727, + "grad_norm": 1.496781826019287, + "learning_rate": 4.808786209281259e-06, + "loss": 1.1327, + "step": 1546 + }, + { + "epoch": 0.7972856897440302, + "grad_norm": 1.3763643503189087, + "learning_rate": 4.80852507510866e-06, + "loss": 1.0183, + "step": 1547 + }, + { + "epoch": 0.7978010651090878, + "grad_norm": 1.4482959508895874, + "learning_rate": 4.808263769846811e-06, + "loss": 1.0594, + "step": 1548 + }, + { + "epoch": 0.7983164404741453, + "grad_norm": 1.4853146076202393, + "learning_rate": 4.808002293515079e-06, + "loss": 1.1297, + "step": 1549 + }, + { + "epoch": 0.7988318158392029, + "grad_norm": 1.4585527181625366, + "learning_rate": 4.807740646132843e-06, + "loss": 1.1218, + "step": 1550 + }, + { + "epoch": 0.7993471912042605, + "grad_norm": 1.5468567609786987, + "learning_rate": 4.8074788277194915e-06, + "loss": 1.0809, + "step": 1551 + }, + { + "epoch": 0.799862566569318, + "grad_norm": 1.6054692268371582, + "learning_rate": 4.807216838294431e-06, + "loss": 1.1209, + "step": 1552 + }, + { + "epoch": 0.8003779419343755, + "grad_norm": 1.4000954627990723, + "learning_rate": 4.806954677877076e-06, + "loss": 1.0575, + "step": 1553 + }, + { + "epoch": 0.8008933172994331, + "grad_norm": 1.486785650253296, + "learning_rate": 4.806692346486857e-06, + "loss": 1.0333, + "step": 1554 + }, + { + "epoch": 0.8014086926644907, + "grad_norm": 1.4417297840118408, + "learning_rate": 4.806429844143215e-06, + "loss": 1.1086, + "step": 1555 + }, + { + "epoch": 0.8019240680295482, + "grad_norm": 1.4330593347549438, + "learning_rate": 4.806167170865606e-06, + "loss": 1.0953, + "step": 1556 + }, + { + "epoch": 0.8024394433946057, + "grad_norm": 1.5242398977279663, + "learning_rate": 4.805904326673496e-06, + "loss": 1.081, + "step": 1557 + }, + { + "epoch": 0.8029548187596632, + "grad_norm": 1.4185315370559692, + "learning_rate": 4.805641311586365e-06, + "loss": 1.0364, + "step": 1558 + }, + { + "epoch": 0.8034701941247209, + "grad_norm": 1.4745197296142578, + "learning_rate": 4.805378125623707e-06, + "loss": 1.1042, + "step": 1559 + }, + { + "epoch": 0.8039855694897784, + "grad_norm": 1.4390727281570435, + "learning_rate": 4.805114768805024e-06, + "loss": 1.1048, + "step": 1560 + }, + { + "epoch": 0.8045009448548359, + "grad_norm": 1.3734118938446045, + "learning_rate": 4.804851241149837e-06, + "loss": 1.0185, + "step": 1561 + }, + { + "epoch": 0.8050163202198934, + "grad_norm": 1.424613356590271, + "learning_rate": 4.804587542677675e-06, + "loss": 1.1485, + "step": 1562 + }, + { + "epoch": 0.8055316955849511, + "grad_norm": 1.3926194906234741, + "learning_rate": 4.8043236734080815e-06, + "loss": 1.0809, + "step": 1563 + }, + { + "epoch": 0.8060470709500086, + "grad_norm": 1.406784176826477, + "learning_rate": 4.804059633360612e-06, + "loss": 1.0953, + "step": 1564 + }, + { + "epoch": 0.8065624463150661, + "grad_norm": 1.4212043285369873, + "learning_rate": 4.803795422554837e-06, + "loss": 1.1341, + "step": 1565 + }, + { + "epoch": 0.8070778216801237, + "grad_norm": 1.4541586637496948, + "learning_rate": 4.803531041010336e-06, + "loss": 1.0635, + "step": 1566 + }, + { + "epoch": 0.8075931970451813, + "grad_norm": 1.3861113786697388, + "learning_rate": 4.803266488746702e-06, + "loss": 1.0407, + "step": 1567 + }, + { + "epoch": 0.8081085724102388, + "grad_norm": 1.4320156574249268, + "learning_rate": 4.8030017657835435e-06, + "loss": 1.1112, + "step": 1568 + }, + { + "epoch": 0.8086239477752963, + "grad_norm": 1.3842825889587402, + "learning_rate": 4.8027368721404776e-06, + "loss": 1.0811, + "step": 1569 + }, + { + "epoch": 0.8091393231403539, + "grad_norm": 1.461434245109558, + "learning_rate": 4.802471807837137e-06, + "loss": 1.1129, + "step": 1570 + }, + { + "epoch": 0.8096546985054115, + "grad_norm": 1.4281977415084839, + "learning_rate": 4.8022065728931675e-06, + "loss": 1.1268, + "step": 1571 + }, + { + "epoch": 0.810170073870469, + "grad_norm": 1.3471062183380127, + "learning_rate": 4.801941167328223e-06, + "loss": 1.0179, + "step": 1572 + }, + { + "epoch": 0.8106854492355265, + "grad_norm": 1.4150283336639404, + "learning_rate": 4.8016755911619775e-06, + "loss": 1.0745, + "step": 1573 + }, + { + "epoch": 0.8112008246005841, + "grad_norm": 1.344197392463684, + "learning_rate": 4.8014098444141085e-06, + "loss": 1.1219, + "step": 1574 + }, + { + "epoch": 0.8117161999656416, + "grad_norm": 1.3955225944519043, + "learning_rate": 4.801143927104315e-06, + "loss": 1.0709, + "step": 1575 + }, + { + "epoch": 0.8122315753306992, + "grad_norm": 1.495924472808838, + "learning_rate": 4.800877839252302e-06, + "loss": 1.0954, + "step": 1576 + }, + { + "epoch": 0.8127469506957568, + "grad_norm": 1.537859320640564, + "learning_rate": 4.8006115808777924e-06, + "loss": 1.0299, + "step": 1577 + }, + { + "epoch": 0.8132623260608143, + "grad_norm": 1.3771979808807373, + "learning_rate": 4.800345152000517e-06, + "loss": 1.1131, + "step": 1578 + }, + { + "epoch": 0.8137777014258718, + "grad_norm": 1.3602322340011597, + "learning_rate": 4.8000785526402215e-06, + "loss": 1.0479, + "step": 1579 + }, + { + "epoch": 0.8142930767909294, + "grad_norm": 1.5317022800445557, + "learning_rate": 4.799811782816665e-06, + "loss": 1.0893, + "step": 1580 + }, + { + "epoch": 0.814808452155987, + "grad_norm": 1.4737638235092163, + "learning_rate": 4.799544842549618e-06, + "loss": 1.0801, + "step": 1581 + }, + { + "epoch": 0.8153238275210445, + "grad_norm": 1.4101364612579346, + "learning_rate": 4.799277731858863e-06, + "loss": 1.0891, + "step": 1582 + }, + { + "epoch": 0.815839202886102, + "grad_norm": 1.4352830648422241, + "learning_rate": 4.799010450764198e-06, + "loss": 1.0226, + "step": 1583 + }, + { + "epoch": 0.8163545782511596, + "grad_norm": 1.4002106189727783, + "learning_rate": 4.798742999285431e-06, + "loss": 1.0875, + "step": 1584 + }, + { + "epoch": 0.8168699536162172, + "grad_norm": 1.5443047285079956, + "learning_rate": 4.798475377442381e-06, + "loss": 1.1324, + "step": 1585 + }, + { + "epoch": 0.8173853289812747, + "grad_norm": 1.4441038370132446, + "learning_rate": 4.798207585254886e-06, + "loss": 1.0994, + "step": 1586 + }, + { + "epoch": 0.8179007043463322, + "grad_norm": 1.542736530303955, + "learning_rate": 4.79793962274279e-06, + "loss": 1.1252, + "step": 1587 + }, + { + "epoch": 0.8184160797113897, + "grad_norm": 1.426893711090088, + "learning_rate": 4.797671489925953e-06, + "loss": 1.1229, + "step": 1588 + }, + { + "epoch": 0.8189314550764474, + "grad_norm": 1.498072624206543, + "learning_rate": 4.7974031868242456e-06, + "loss": 1.0909, + "step": 1589 + }, + { + "epoch": 0.8194468304415049, + "grad_norm": 1.4439469575881958, + "learning_rate": 4.797134713457554e-06, + "loss": 1.0871, + "step": 1590 + }, + { + "epoch": 0.8199622058065624, + "grad_norm": 1.5463494062423706, + "learning_rate": 4.7968660698457745e-06, + "loss": 1.1358, + "step": 1591 + }, + { + "epoch": 0.82047758117162, + "grad_norm": 1.5635348558425903, + "learning_rate": 4.796597256008816e-06, + "loss": 1.0658, + "step": 1592 + }, + { + "epoch": 0.8209929565366776, + "grad_norm": 1.3745373487472534, + "learning_rate": 4.796328271966603e-06, + "loss": 1.0555, + "step": 1593 + }, + { + "epoch": 0.8215083319017351, + "grad_norm": 1.3798346519470215, + "learning_rate": 4.796059117739069e-06, + "loss": 1.1016, + "step": 1594 + }, + { + "epoch": 0.8220237072667926, + "grad_norm": 1.4389573335647583, + "learning_rate": 4.795789793346161e-06, + "loss": 1.1273, + "step": 1595 + }, + { + "epoch": 0.8225390826318502, + "grad_norm": 1.5225825309753418, + "learning_rate": 4.7955202988078405e-06, + "loss": 1.1082, + "step": 1596 + }, + { + "epoch": 0.8230544579969078, + "grad_norm": 1.3396738767623901, + "learning_rate": 4.7952506341440785e-06, + "loss": 1.0616, + "step": 1597 + }, + { + "epoch": 0.8235698333619653, + "grad_norm": 1.537752389907837, + "learning_rate": 4.794980799374862e-06, + "loss": 1.128, + "step": 1598 + }, + { + "epoch": 0.8240852087270228, + "grad_norm": 1.5089632272720337, + "learning_rate": 4.794710794520188e-06, + "loss": 1.1146, + "step": 1599 + }, + { + "epoch": 0.8246005840920804, + "grad_norm": 1.4799798727035522, + "learning_rate": 4.794440619600069e-06, + "loss": 1.063, + "step": 1600 + }, + { + "epoch": 0.825115959457138, + "grad_norm": 1.39742910861969, + "learning_rate": 4.794170274634525e-06, + "loss": 1.0339, + "step": 1601 + }, + { + "epoch": 0.8256313348221955, + "grad_norm": 1.3962360620498657, + "learning_rate": 4.793899759643595e-06, + "loss": 1.088, + "step": 1602 + }, + { + "epoch": 0.826146710187253, + "grad_norm": 1.3832945823669434, + "learning_rate": 4.793629074647325e-06, + "loss": 1.1044, + "step": 1603 + }, + { + "epoch": 0.8266620855523106, + "grad_norm": 1.3732894659042358, + "learning_rate": 4.793358219665777e-06, + "loss": 1.0417, + "step": 1604 + }, + { + "epoch": 0.8271774609173681, + "grad_norm": 1.4401576519012451, + "learning_rate": 4.793087194719024e-06, + "loss": 1.0932, + "step": 1605 + }, + { + "epoch": 0.8276928362824257, + "grad_norm": 1.4844118356704712, + "learning_rate": 4.7928159998271524e-06, + "loss": 1.0269, + "step": 1606 + }, + { + "epoch": 0.8282082116474833, + "grad_norm": 1.522064447402954, + "learning_rate": 4.792544635010262e-06, + "loss": 1.0943, + "step": 1607 + }, + { + "epoch": 0.8287235870125408, + "grad_norm": 1.42251718044281, + "learning_rate": 4.792273100288462e-06, + "loss": 1.0776, + "step": 1608 + }, + { + "epoch": 0.8292389623775983, + "grad_norm": 1.3952099084854126, + "learning_rate": 4.792001395681879e-06, + "loss": 1.0534, + "step": 1609 + }, + { + "epoch": 0.829754337742656, + "grad_norm": 1.3191391229629517, + "learning_rate": 4.791729521210649e-06, + "loss": 1.0532, + "step": 1610 + }, + { + "epoch": 0.8302697131077135, + "grad_norm": 1.4547145366668701, + "learning_rate": 4.791457476894918e-06, + "loss": 1.0893, + "step": 1611 + }, + { + "epoch": 0.830785088472771, + "grad_norm": 1.436697006225586, + "learning_rate": 4.791185262754852e-06, + "loss": 1.0766, + "step": 1612 + }, + { + "epoch": 0.8313004638378285, + "grad_norm": 1.414806842803955, + "learning_rate": 4.790912878810623e-06, + "loss": 1.0948, + "step": 1613 + }, + { + "epoch": 0.8318158392028862, + "grad_norm": 1.393926739692688, + "learning_rate": 4.790640325082419e-06, + "loss": 1.0085, + "step": 1614 + }, + { + "epoch": 0.8323312145679437, + "grad_norm": 1.4000269174575806, + "learning_rate": 4.7903676015904375e-06, + "loss": 0.9886, + "step": 1615 + }, + { + "epoch": 0.8328465899330012, + "grad_norm": 1.448590636253357, + "learning_rate": 4.790094708354892e-06, + "loss": 1.0801, + "step": 1616 + }, + { + "epoch": 0.8333619652980587, + "grad_norm": 1.398142695426941, + "learning_rate": 4.789821645396008e-06, + "loss": 1.1023, + "step": 1617 + }, + { + "epoch": 0.8338773406631163, + "grad_norm": 1.4157236814498901, + "learning_rate": 4.789548412734021e-06, + "loss": 1.0309, + "step": 1618 + }, + { + "epoch": 0.8343927160281739, + "grad_norm": 1.4708627462387085, + "learning_rate": 4.789275010389182e-06, + "loss": 1.0756, + "step": 1619 + }, + { + "epoch": 0.8349080913932314, + "grad_norm": 1.467354655265808, + "learning_rate": 4.789001438381753e-06, + "loss": 1.0863, + "step": 1620 + }, + { + "epoch": 0.8354234667582889, + "grad_norm": 1.4737015962600708, + "learning_rate": 4.7887276967320085e-06, + "loss": 1.114, + "step": 1621 + }, + { + "epoch": 0.8359388421233465, + "grad_norm": 1.3399767875671387, + "learning_rate": 4.7884537854602366e-06, + "loss": 1.0471, + "step": 1622 + }, + { + "epoch": 0.8364542174884041, + "grad_norm": 1.4572374820709229, + "learning_rate": 4.788179704586737e-06, + "loss": 1.1347, + "step": 1623 + }, + { + "epoch": 0.8369695928534616, + "grad_norm": 1.3600082397460938, + "learning_rate": 4.787905454131824e-06, + "loss": 1.0226, + "step": 1624 + }, + { + "epoch": 0.8374849682185191, + "grad_norm": 1.410685420036316, + "learning_rate": 4.787631034115819e-06, + "loss": 1.04, + "step": 1625 + }, + { + "epoch": 0.8380003435835767, + "grad_norm": 1.3994461297988892, + "learning_rate": 4.787356444559064e-06, + "loss": 1.0858, + "step": 1626 + }, + { + "epoch": 0.8385157189486343, + "grad_norm": 1.431602120399475, + "learning_rate": 4.7870816854819065e-06, + "loss": 1.1271, + "step": 1627 + }, + { + "epoch": 0.8390310943136918, + "grad_norm": 1.6482824087142944, + "learning_rate": 4.786806756904712e-06, + "loss": 1.1201, + "step": 1628 + }, + { + "epoch": 0.8395464696787494, + "grad_norm": 1.5163174867630005, + "learning_rate": 4.786531658847853e-06, + "loss": 1.0999, + "step": 1629 + }, + { + "epoch": 0.8400618450438069, + "grad_norm": 1.414298415184021, + "learning_rate": 4.786256391331719e-06, + "loss": 1.1207, + "step": 1630 + }, + { + "epoch": 0.8405772204088645, + "grad_norm": 1.457314372062683, + "learning_rate": 4.785980954376711e-06, + "loss": 1.0699, + "step": 1631 + }, + { + "epoch": 0.841092595773922, + "grad_norm": 1.3886388540267944, + "learning_rate": 4.7857053480032426e-06, + "loss": 1.0496, + "step": 1632 + }, + { + "epoch": 0.8416079711389796, + "grad_norm": 1.5067790746688843, + "learning_rate": 4.785429572231738e-06, + "loss": 1.0683, + "step": 1633 + }, + { + "epoch": 0.8421233465040371, + "grad_norm": 1.4911561012268066, + "learning_rate": 4.785153627082636e-06, + "loss": 1.0831, + "step": 1634 + }, + { + "epoch": 0.8426387218690946, + "grad_norm": 1.4155385494232178, + "learning_rate": 4.784877512576388e-06, + "loss": 1.0909, + "step": 1635 + }, + { + "epoch": 0.8431540972341522, + "grad_norm": 1.4064384698867798, + "learning_rate": 4.7846012287334565e-06, + "loss": 1.0895, + "step": 1636 + }, + { + "epoch": 0.8436694725992098, + "grad_norm": 1.4730241298675537, + "learning_rate": 4.784324775574318e-06, + "loss": 1.0602, + "step": 1637 + }, + { + "epoch": 0.8441848479642673, + "grad_norm": 1.4730726480484009, + "learning_rate": 4.78404815311946e-06, + "loss": 1.0693, + "step": 1638 + }, + { + "epoch": 0.8447002233293248, + "grad_norm": 1.4139877557754517, + "learning_rate": 4.783771361389387e-06, + "loss": 1.0198, + "step": 1639 + }, + { + "epoch": 0.8452155986943825, + "grad_norm": 1.3953264951705933, + "learning_rate": 4.783494400404608e-06, + "loss": 1.0857, + "step": 1640 + }, + { + "epoch": 0.84573097405944, + "grad_norm": 1.3684098720550537, + "learning_rate": 4.783217270185651e-06, + "loss": 0.9952, + "step": 1641 + }, + { + "epoch": 0.8462463494244975, + "grad_norm": 1.4131075143814087, + "learning_rate": 4.782939970753056e-06, + "loss": 1.0336, + "step": 1642 + }, + { + "epoch": 0.846761724789555, + "grad_norm": 1.4862349033355713, + "learning_rate": 4.782662502127372e-06, + "loss": 1.0847, + "step": 1643 + }, + { + "epoch": 0.8472771001546127, + "grad_norm": 1.4583141803741455, + "learning_rate": 4.782384864329164e-06, + "loss": 1.0507, + "step": 1644 + }, + { + "epoch": 0.8477924755196702, + "grad_norm": 1.4953395128250122, + "learning_rate": 4.782107057379008e-06, + "loss": 1.1075, + "step": 1645 + }, + { + "epoch": 0.8483078508847277, + "grad_norm": 1.4994615316390991, + "learning_rate": 4.781829081297493e-06, + "loss": 1.1372, + "step": 1646 + }, + { + "epoch": 0.8488232262497852, + "grad_norm": 1.4864728450775146, + "learning_rate": 4.781550936105219e-06, + "loss": 1.055, + "step": 1647 + }, + { + "epoch": 0.8493386016148428, + "grad_norm": 1.4858756065368652, + "learning_rate": 4.781272621822803e-06, + "loss": 1.0852, + "step": 1648 + }, + { + "epoch": 0.8498539769799004, + "grad_norm": 1.4812358617782593, + "learning_rate": 4.780994138470868e-06, + "loss": 1.0363, + "step": 1649 + }, + { + "epoch": 0.8503693523449579, + "grad_norm": 1.4463202953338623, + "learning_rate": 4.780715486070054e-06, + "loss": 1.0709, + "step": 1650 + }, + { + "epoch": 0.8508847277100154, + "grad_norm": 1.392630934715271, + "learning_rate": 4.780436664641014e-06, + "loss": 1.0162, + "step": 1651 + }, + { + "epoch": 0.851400103075073, + "grad_norm": 1.4439646005630493, + "learning_rate": 4.78015767420441e-06, + "loss": 1.0674, + "step": 1652 + }, + { + "epoch": 0.8519154784401306, + "grad_norm": 1.5503273010253906, + "learning_rate": 4.779878514780919e-06, + "loss": 1.0988, + "step": 1653 + }, + { + "epoch": 0.8524308538051881, + "grad_norm": 1.379058837890625, + "learning_rate": 4.779599186391232e-06, + "loss": 1.0799, + "step": 1654 + }, + { + "epoch": 0.8529462291702457, + "grad_norm": 1.4692100286483765, + "learning_rate": 4.779319689056047e-06, + "loss": 1.0233, + "step": 1655 + }, + { + "epoch": 0.8534616045353032, + "grad_norm": 1.5064520835876465, + "learning_rate": 4.7790400227960806e-06, + "loss": 1.0766, + "step": 1656 + }, + { + "epoch": 0.8539769799003608, + "grad_norm": 1.5419743061065674, + "learning_rate": 4.77876018763206e-06, + "loss": 1.0705, + "step": 1657 + }, + { + "epoch": 0.8544923552654183, + "grad_norm": 1.3527169227600098, + "learning_rate": 4.778480183584722e-06, + "loss": 1.0732, + "step": 1658 + }, + { + "epoch": 0.8550077306304759, + "grad_norm": 1.447250247001648, + "learning_rate": 4.778200010674819e-06, + "loss": 1.1307, + "step": 1659 + }, + { + "epoch": 0.8555231059955334, + "grad_norm": 1.3942347764968872, + "learning_rate": 4.777919668923117e-06, + "loss": 0.9875, + "step": 1660 + }, + { + "epoch": 0.856038481360591, + "grad_norm": 1.4627864360809326, + "learning_rate": 4.77763915835039e-06, + "loss": 1.0418, + "step": 1661 + }, + { + "epoch": 0.8565538567256485, + "grad_norm": 1.4622726440429688, + "learning_rate": 4.777358478977428e-06, + "loss": 1.0525, + "step": 1662 + }, + { + "epoch": 0.8570692320907061, + "grad_norm": 1.4471572637557983, + "learning_rate": 4.777077630825035e-06, + "loss": 1.0698, + "step": 1663 + }, + { + "epoch": 0.8575846074557636, + "grad_norm": 1.3427664041519165, + "learning_rate": 4.776796613914022e-06, + "loss": 1.0346, + "step": 1664 + }, + { + "epoch": 0.8580999828208211, + "grad_norm": 1.359036922454834, + "learning_rate": 4.7765154282652174e-06, + "loss": 1.106, + "step": 1665 + }, + { + "epoch": 0.8586153581858788, + "grad_norm": 1.426246166229248, + "learning_rate": 4.7762340738994596e-06, + "loss": 1.0604, + "step": 1666 + }, + { + "epoch": 0.8591307335509363, + "grad_norm": 1.5278077125549316, + "learning_rate": 4.775952550837601e-06, + "loss": 1.1139, + "step": 1667 + }, + { + "epoch": 0.8596461089159938, + "grad_norm": 1.4093070030212402, + "learning_rate": 4.775670859100505e-06, + "loss": 1.1157, + "step": 1668 + }, + { + "epoch": 0.8601614842810513, + "grad_norm": 1.4400460720062256, + "learning_rate": 4.77538899870905e-06, + "loss": 1.0485, + "step": 1669 + }, + { + "epoch": 0.860676859646109, + "grad_norm": 1.4249861240386963, + "learning_rate": 4.775106969684123e-06, + "loss": 1.046, + "step": 1670 + }, + { + "epoch": 0.8611922350111665, + "grad_norm": 1.6405653953552246, + "learning_rate": 4.774824772046627e-06, + "loss": 1.1025, + "step": 1671 + }, + { + "epoch": 0.861707610376224, + "grad_norm": 1.469918966293335, + "learning_rate": 4.774542405817476e-06, + "loss": 1.0906, + "step": 1672 + }, + { + "epoch": 0.8622229857412815, + "grad_norm": 1.4990814924240112, + "learning_rate": 4.774259871017597e-06, + "loss": 1.1187, + "step": 1673 + }, + { + "epoch": 0.8627383611063392, + "grad_norm": 1.4857546091079712, + "learning_rate": 4.773977167667929e-06, + "loss": 1.0131, + "step": 1674 + }, + { + "epoch": 0.8632537364713967, + "grad_norm": 1.5064942836761475, + "learning_rate": 4.773694295789423e-06, + "loss": 1.1068, + "step": 1675 + }, + { + "epoch": 0.8637691118364542, + "grad_norm": 1.4220467805862427, + "learning_rate": 4.773411255403044e-06, + "loss": 1.1122, + "step": 1676 + }, + { + "epoch": 0.8642844872015117, + "grad_norm": 1.460793375968933, + "learning_rate": 4.773128046529769e-06, + "loss": 1.0739, + "step": 1677 + }, + { + "epoch": 0.8647998625665693, + "grad_norm": 1.4518553018569946, + "learning_rate": 4.772844669190586e-06, + "loss": 1.0477, + "step": 1678 + }, + { + "epoch": 0.8653152379316269, + "grad_norm": 1.4068254232406616, + "learning_rate": 4.772561123406497e-06, + "loss": 1.11, + "step": 1679 + }, + { + "epoch": 0.8658306132966844, + "grad_norm": 1.4043121337890625, + "learning_rate": 4.772277409198517e-06, + "loss": 1.0531, + "step": 1680 + }, + { + "epoch": 0.866345988661742, + "grad_norm": 1.501981258392334, + "learning_rate": 4.771993526587671e-06, + "loss": 1.0452, + "step": 1681 + }, + { + "epoch": 0.8668613640267995, + "grad_norm": 1.3823513984680176, + "learning_rate": 4.771709475595e-06, + "loss": 1.1162, + "step": 1682 + }, + { + "epoch": 0.8673767393918571, + "grad_norm": 1.4772850275039673, + "learning_rate": 4.771425256241555e-06, + "loss": 1.0594, + "step": 1683 + }, + { + "epoch": 0.8678921147569146, + "grad_norm": 1.4475082159042358, + "learning_rate": 4.771140868548399e-06, + "loss": 1.102, + "step": 1684 + }, + { + "epoch": 0.8684074901219722, + "grad_norm": 1.3387587070465088, + "learning_rate": 4.770856312536609e-06, + "loss": 1.0464, + "step": 1685 + }, + { + "epoch": 0.8689228654870297, + "grad_norm": 1.4844287633895874, + "learning_rate": 4.770571588227275e-06, + "loss": 1.0448, + "step": 1686 + }, + { + "epoch": 0.8694382408520873, + "grad_norm": 1.4855479001998901, + "learning_rate": 4.7702866956414974e-06, + "loss": 1.0607, + "step": 1687 + }, + { + "epoch": 0.8699536162171448, + "grad_norm": 1.3221168518066406, + "learning_rate": 4.77000163480039e-06, + "loss": 1.0648, + "step": 1688 + }, + { + "epoch": 0.8704689915822024, + "grad_norm": 1.4294415712356567, + "learning_rate": 4.76971640572508e-06, + "loss": 0.9887, + "step": 1689 + }, + { + "epoch": 0.8709843669472599, + "grad_norm": 1.4959137439727783, + "learning_rate": 4.769431008436705e-06, + "loss": 1.1318, + "step": 1690 + }, + { + "epoch": 0.8714997423123174, + "grad_norm": 1.5279430150985718, + "learning_rate": 4.769145442956418e-06, + "loss": 1.1354, + "step": 1691 + }, + { + "epoch": 0.872015117677375, + "grad_norm": 1.42916738986969, + "learning_rate": 4.768859709305382e-06, + "loss": 1.1105, + "step": 1692 + }, + { + "epoch": 0.8725304930424326, + "grad_norm": 1.361976146697998, + "learning_rate": 4.768573807504773e-06, + "loss": 1.038, + "step": 1693 + }, + { + "epoch": 0.8730458684074901, + "grad_norm": 1.5706747770309448, + "learning_rate": 4.76828773757578e-06, + "loss": 1.0973, + "step": 1694 + }, + { + "epoch": 0.8735612437725476, + "grad_norm": 1.5899977684020996, + "learning_rate": 4.768001499539605e-06, + "loss": 1.0994, + "step": 1695 + }, + { + "epoch": 0.8740766191376053, + "grad_norm": 1.4544364213943481, + "learning_rate": 4.767715093417461e-06, + "loss": 1.0912, + "step": 1696 + }, + { + "epoch": 0.8745919945026628, + "grad_norm": 1.4816296100616455, + "learning_rate": 4.767428519230573e-06, + "loss": 1.1046, + "step": 1697 + }, + { + "epoch": 0.8751073698677203, + "grad_norm": 1.491078495979309, + "learning_rate": 4.767141777000181e-06, + "loss": 1.0394, + "step": 1698 + }, + { + "epoch": 0.8756227452327778, + "grad_norm": 1.393487572669983, + "learning_rate": 4.766854866747536e-06, + "loss": 1.1015, + "step": 1699 + }, + { + "epoch": 0.8761381205978355, + "grad_norm": 1.5540200471878052, + "learning_rate": 4.7665677884939005e-06, + "loss": 1.1323, + "step": 1700 + }, + { + "epoch": 0.876653495962893, + "grad_norm": 1.4467482566833496, + "learning_rate": 4.766280542260551e-06, + "loss": 1.1179, + "step": 1701 + }, + { + "epoch": 0.8771688713279505, + "grad_norm": 1.3639161586761475, + "learning_rate": 4.765993128068776e-06, + "loss": 1.0567, + "step": 1702 + }, + { + "epoch": 0.877684246693008, + "grad_norm": 1.432669997215271, + "learning_rate": 4.765705545939876e-06, + "loss": 1.0641, + "step": 1703 + }, + { + "epoch": 0.8781996220580657, + "grad_norm": 1.479915976524353, + "learning_rate": 4.765417795895164e-06, + "loss": 1.0978, + "step": 1704 + }, + { + "epoch": 0.8787149974231232, + "grad_norm": 1.4626078605651855, + "learning_rate": 4.7651298779559675e-06, + "loss": 1.0788, + "step": 1705 + }, + { + "epoch": 0.8792303727881807, + "grad_norm": 1.5150612592697144, + "learning_rate": 4.7648417921436226e-06, + "loss": 1.1402, + "step": 1706 + }, + { + "epoch": 0.8797457481532382, + "grad_norm": 1.407073736190796, + "learning_rate": 4.764553538479481e-06, + "loss": 1.1194, + "step": 1707 + }, + { + "epoch": 0.8802611235182958, + "grad_norm": 1.4320027828216553, + "learning_rate": 4.764265116984905e-06, + "loss": 1.1377, + "step": 1708 + }, + { + "epoch": 0.8807764988833534, + "grad_norm": 1.5515000820159912, + "learning_rate": 4.76397652768127e-06, + "loss": 1.0789, + "step": 1709 + }, + { + "epoch": 0.8812918742484109, + "grad_norm": 1.3981959819793701, + "learning_rate": 4.7636877705899645e-06, + "loss": 1.0753, + "step": 1710 + }, + { + "epoch": 0.8818072496134685, + "grad_norm": 1.3862941265106201, + "learning_rate": 4.7633988457323895e-06, + "loss": 1.1064, + "step": 1711 + }, + { + "epoch": 0.882322624978526, + "grad_norm": 1.468210220336914, + "learning_rate": 4.763109753129956e-06, + "loss": 1.062, + "step": 1712 + }, + { + "epoch": 0.8828380003435836, + "grad_norm": 1.433226227760315, + "learning_rate": 4.762820492804091e-06, + "loss": 1.0428, + "step": 1713 + }, + { + "epoch": 0.8833533757086411, + "grad_norm": 1.4374988079071045, + "learning_rate": 4.76253106477623e-06, + "loss": 1.0639, + "step": 1714 + }, + { + "epoch": 0.8838687510736987, + "grad_norm": 1.3978921175003052, + "learning_rate": 4.762241469067826e-06, + "loss": 1.0801, + "step": 1715 + }, + { + "epoch": 0.8843841264387562, + "grad_norm": 1.521583914756775, + "learning_rate": 4.761951705700339e-06, + "loss": 1.0827, + "step": 1716 + }, + { + "epoch": 0.8848995018038138, + "grad_norm": 1.511265516281128, + "learning_rate": 4.761661774695244e-06, + "loss": 1.0729, + "step": 1717 + }, + { + "epoch": 0.8854148771688714, + "grad_norm": 1.364372968673706, + "learning_rate": 4.761371676074031e-06, + "loss": 1.0126, + "step": 1718 + }, + { + "epoch": 0.8859302525339289, + "grad_norm": 1.4365894794464111, + "learning_rate": 4.761081409858197e-06, + "loss": 1.0979, + "step": 1719 + }, + { + "epoch": 0.8864456278989864, + "grad_norm": 1.5464420318603516, + "learning_rate": 4.760790976069255e-06, + "loss": 1.0434, + "step": 1720 + }, + { + "epoch": 0.8869610032640439, + "grad_norm": 1.3821452856063843, + "learning_rate": 4.760500374728729e-06, + "loss": 1.0707, + "step": 1721 + }, + { + "epoch": 0.8874763786291016, + "grad_norm": 1.4921995401382446, + "learning_rate": 4.760209605858158e-06, + "loss": 1.0479, + "step": 1722 + }, + { + "epoch": 0.8879917539941591, + "grad_norm": 1.4335943460464478, + "learning_rate": 4.759918669479089e-06, + "loss": 1.0297, + "step": 1723 + }, + { + "epoch": 0.8885071293592166, + "grad_norm": 1.6402289867401123, + "learning_rate": 4.759627565613086e-06, + "loss": 1.0822, + "step": 1724 + }, + { + "epoch": 0.8890225047242741, + "grad_norm": 1.4306977987289429, + "learning_rate": 4.7593362942817215e-06, + "loss": 1.102, + "step": 1725 + }, + { + "epoch": 0.8895378800893318, + "grad_norm": 1.355398178100586, + "learning_rate": 4.759044855506584e-06, + "loss": 1.1356, + "step": 1726 + }, + { + "epoch": 0.8900532554543893, + "grad_norm": 1.5569396018981934, + "learning_rate": 4.7587532493092706e-06, + "loss": 1.0976, + "step": 1727 + }, + { + "epoch": 0.8905686308194468, + "grad_norm": 1.453972578048706, + "learning_rate": 4.7584614757113946e-06, + "loss": 1.0623, + "step": 1728 + }, + { + "epoch": 0.8910840061845043, + "grad_norm": 1.3104867935180664, + "learning_rate": 4.758169534734579e-06, + "loss": 1.0573, + "step": 1729 + }, + { + "epoch": 0.891599381549562, + "grad_norm": 1.5181728601455688, + "learning_rate": 4.757877426400459e-06, + "loss": 1.1099, + "step": 1730 + }, + { + "epoch": 0.8921147569146195, + "grad_norm": 1.472532033920288, + "learning_rate": 4.757585150730686e-06, + "loss": 1.1179, + "step": 1731 + }, + { + "epoch": 0.892630132279677, + "grad_norm": 1.4582340717315674, + "learning_rate": 4.757292707746919e-06, + "loss": 1.0528, + "step": 1732 + }, + { + "epoch": 0.8931455076447345, + "grad_norm": 1.2936973571777344, + "learning_rate": 4.757000097470832e-06, + "loss": 1.0641, + "step": 1733 + }, + { + "epoch": 0.8936608830097922, + "grad_norm": 1.4027740955352783, + "learning_rate": 4.756707319924111e-06, + "loss": 1.0126, + "step": 1734 + }, + { + "epoch": 0.8941762583748497, + "grad_norm": 1.448681354522705, + "learning_rate": 4.756414375128454e-06, + "loss": 1.0498, + "step": 1735 + }, + { + "epoch": 0.8946916337399072, + "grad_norm": 1.3526110649108887, + "learning_rate": 4.756121263105573e-06, + "loss": 1.0887, + "step": 1736 + }, + { + "epoch": 0.8952070091049648, + "grad_norm": 1.6526248455047607, + "learning_rate": 4.755827983877189e-06, + "loss": 1.0404, + "step": 1737 + }, + { + "epoch": 0.8957223844700223, + "grad_norm": 1.5655080080032349, + "learning_rate": 4.75553453746504e-06, + "loss": 1.0874, + "step": 1738 + }, + { + "epoch": 0.8962377598350799, + "grad_norm": 1.5430996417999268, + "learning_rate": 4.755240923890871e-06, + "loss": 1.0711, + "step": 1739 + }, + { + "epoch": 0.8967531352001374, + "grad_norm": 1.5913987159729004, + "learning_rate": 4.754947143176445e-06, + "loss": 1.146, + "step": 1740 + }, + { + "epoch": 0.897268510565195, + "grad_norm": 1.5870795249938965, + "learning_rate": 4.754653195343533e-06, + "loss": 1.0612, + "step": 1741 + }, + { + "epoch": 0.8977838859302525, + "grad_norm": 1.4600235223770142, + "learning_rate": 4.754359080413921e-06, + "loss": 1.0459, + "step": 1742 + }, + { + "epoch": 0.8982992612953101, + "grad_norm": 1.448747992515564, + "learning_rate": 4.754064798409406e-06, + "loss": 1.0574, + "step": 1743 + }, + { + "epoch": 0.8988146366603676, + "grad_norm": 1.4169598817825317, + "learning_rate": 4.7537703493517965e-06, + "loss": 1.0326, + "step": 1744 + }, + { + "epoch": 0.8993300120254252, + "grad_norm": 1.4494582414627075, + "learning_rate": 4.753475733262917e-06, + "loss": 1.0229, + "step": 1745 + }, + { + "epoch": 0.8998453873904827, + "grad_norm": 1.4525147676467896, + "learning_rate": 4.7531809501646e-06, + "loss": 1.047, + "step": 1746 + }, + { + "epoch": 0.9003607627555403, + "grad_norm": 1.3870949745178223, + "learning_rate": 4.752886000078694e-06, + "loss": 1.1006, + "step": 1747 + }, + { + "epoch": 0.9008761381205979, + "grad_norm": 1.5141352415084839, + "learning_rate": 4.752590883027058e-06, + "loss": 1.0681, + "step": 1748 + }, + { + "epoch": 0.9013915134856554, + "grad_norm": 1.396710991859436, + "learning_rate": 4.752295599031563e-06, + "loss": 1.1347, + "step": 1749 + }, + { + "epoch": 0.9019068888507129, + "grad_norm": 1.4481197595596313, + "learning_rate": 4.752000148114093e-06, + "loss": 1.0563, + "step": 1750 + }, + { + "epoch": 0.9024222642157704, + "grad_norm": 1.4166992902755737, + "learning_rate": 4.751704530296546e-06, + "loss": 1.0358, + "step": 1751 + }, + { + "epoch": 0.9029376395808281, + "grad_norm": 1.5104217529296875, + "learning_rate": 4.75140874560083e-06, + "loss": 1.066, + "step": 1752 + }, + { + "epoch": 0.9034530149458856, + "grad_norm": 1.3824926614761353, + "learning_rate": 4.751112794048864e-06, + "loss": 1.162, + "step": 1753 + }, + { + "epoch": 0.9039683903109431, + "grad_norm": 1.457301378250122, + "learning_rate": 4.750816675662585e-06, + "loss": 1.0805, + "step": 1754 + }, + { + "epoch": 0.9044837656760006, + "grad_norm": 1.503834843635559, + "learning_rate": 4.750520390463937e-06, + "loss": 1.052, + "step": 1755 + }, + { + "epoch": 0.9049991410410583, + "grad_norm": 1.5135822296142578, + "learning_rate": 4.75022393847488e-06, + "loss": 1.1013, + "step": 1756 + }, + { + "epoch": 0.9055145164061158, + "grad_norm": 1.4753880500793457, + "learning_rate": 4.7499273197173816e-06, + "loss": 1.0819, + "step": 1757 + }, + { + "epoch": 0.9060298917711733, + "grad_norm": 1.4338304996490479, + "learning_rate": 4.749630534213426e-06, + "loss": 1.0369, + "step": 1758 + }, + { + "epoch": 0.9065452671362308, + "grad_norm": 1.4980969429016113, + "learning_rate": 4.74933358198501e-06, + "loss": 1.0672, + "step": 1759 + }, + { + "epoch": 0.9070606425012885, + "grad_norm": 1.4337161779403687, + "learning_rate": 4.74903646305414e-06, + "loss": 1.0707, + "step": 1760 + }, + { + "epoch": 0.907576017866346, + "grad_norm": 1.5216634273529053, + "learning_rate": 4.748739177442837e-06, + "loss": 1.0928, + "step": 1761 + }, + { + "epoch": 0.9080913932314035, + "grad_norm": 1.4658452272415161, + "learning_rate": 4.748441725173132e-06, + "loss": 1.0317, + "step": 1762 + }, + { + "epoch": 0.9086067685964611, + "grad_norm": 1.3911515474319458, + "learning_rate": 4.748144106267071e-06, + "loss": 1.1078, + "step": 1763 + }, + { + "epoch": 0.9091221439615187, + "grad_norm": 1.3948339223861694, + "learning_rate": 4.74784632074671e-06, + "loss": 1.0886, + "step": 1764 + }, + { + "epoch": 0.9096375193265762, + "grad_norm": 1.3779579401016235, + "learning_rate": 4.747548368634119e-06, + "loss": 1.1183, + "step": 1765 + }, + { + "epoch": 0.9101528946916337, + "grad_norm": 1.5271098613739014, + "learning_rate": 4.747250249951381e-06, + "loss": 1.0797, + "step": 1766 + }, + { + "epoch": 0.9106682700566913, + "grad_norm": 1.4378695487976074, + "learning_rate": 4.746951964720589e-06, + "loss": 1.0775, + "step": 1767 + }, + { + "epoch": 0.9111836454217488, + "grad_norm": 1.438913345336914, + "learning_rate": 4.746653512963849e-06, + "loss": 1.0841, + "step": 1768 + }, + { + "epoch": 0.9116990207868064, + "grad_norm": 1.488240122795105, + "learning_rate": 4.74635489470328e-06, + "loss": 1.1125, + "step": 1769 + }, + { + "epoch": 0.912214396151864, + "grad_norm": 1.5185062885284424, + "learning_rate": 4.746056109961014e-06, + "loss": 1.0561, + "step": 1770 + }, + { + "epoch": 0.9127297715169215, + "grad_norm": 1.3826409578323364, + "learning_rate": 4.745757158759194e-06, + "loss": 1.1072, + "step": 1771 + }, + { + "epoch": 0.913245146881979, + "grad_norm": 1.4739314317703247, + "learning_rate": 4.745458041119976e-06, + "loss": 1.0743, + "step": 1772 + }, + { + "epoch": 0.9137605222470366, + "grad_norm": 1.4196566343307495, + "learning_rate": 4.7451587570655274e-06, + "loss": 1.0947, + "step": 1773 + }, + { + "epoch": 0.9142758976120942, + "grad_norm": 1.4553617238998413, + "learning_rate": 4.7448593066180305e-06, + "loss": 1.1062, + "step": 1774 + }, + { + "epoch": 0.9147912729771517, + "grad_norm": 1.4958308935165405, + "learning_rate": 4.744559689799677e-06, + "loss": 1.1129, + "step": 1775 + }, + { + "epoch": 0.9153066483422092, + "grad_norm": 1.470471739768982, + "learning_rate": 4.744259906632672e-06, + "loss": 1.0458, + "step": 1776 + }, + { + "epoch": 0.9158220237072668, + "grad_norm": 1.401932716369629, + "learning_rate": 4.743959957139233e-06, + "loss": 1.0178, + "step": 1777 + }, + { + "epoch": 0.9163373990723244, + "grad_norm": 1.4637651443481445, + "learning_rate": 4.74365984134159e-06, + "loss": 1.0887, + "step": 1778 + }, + { + "epoch": 0.9168527744373819, + "grad_norm": 1.4661166667938232, + "learning_rate": 4.743359559261985e-06, + "loss": 1.0175, + "step": 1779 + }, + { + "epoch": 0.9173681498024394, + "grad_norm": 1.4048773050308228, + "learning_rate": 4.7430591109226735e-06, + "loss": 1.0619, + "step": 1780 + }, + { + "epoch": 0.9178835251674969, + "grad_norm": 1.4620808362960815, + "learning_rate": 4.7427584963459206e-06, + "loss": 1.0759, + "step": 1781 + }, + { + "epoch": 0.9183989005325546, + "grad_norm": 1.4824326038360596, + "learning_rate": 4.742457715554006e-06, + "loss": 1.0613, + "step": 1782 + }, + { + "epoch": 0.9189142758976121, + "grad_norm": 1.4724524021148682, + "learning_rate": 4.742156768569223e-06, + "loss": 1.0845, + "step": 1783 + }, + { + "epoch": 0.9194296512626696, + "grad_norm": 1.4528087377548218, + "learning_rate": 4.741855655413871e-06, + "loss": 1.0805, + "step": 1784 + }, + { + "epoch": 0.9199450266277271, + "grad_norm": 1.5265142917633057, + "learning_rate": 4.741554376110272e-06, + "loss": 1.073, + "step": 1785 + }, + { + "epoch": 0.9204604019927848, + "grad_norm": 1.4664232730865479, + "learning_rate": 4.74125293068075e-06, + "loss": 1.052, + "step": 1786 + }, + { + "epoch": 0.9209757773578423, + "grad_norm": 1.5155519247055054, + "learning_rate": 4.7409513191476464e-06, + "loss": 1.0568, + "step": 1787 + }, + { + "epoch": 0.9214911527228998, + "grad_norm": 1.5170753002166748, + "learning_rate": 4.740649541533316e-06, + "loss": 1.1125, + "step": 1788 + }, + { + "epoch": 0.9220065280879574, + "grad_norm": 1.379411220550537, + "learning_rate": 4.740347597860121e-06, + "loss": 1.0134, + "step": 1789 + }, + { + "epoch": 0.922521903453015, + "grad_norm": 1.4462095499038696, + "learning_rate": 4.7400454881504426e-06, + "loss": 1.1216, + "step": 1790 + }, + { + "epoch": 0.9230372788180725, + "grad_norm": 1.4491463899612427, + "learning_rate": 4.7397432124266685e-06, + "loss": 1.0911, + "step": 1791 + }, + { + "epoch": 0.92355265418313, + "grad_norm": 1.4337164163589478, + "learning_rate": 4.739440770711202e-06, + "loss": 1.1264, + "step": 1792 + }, + { + "epoch": 0.9240680295481876, + "grad_norm": 1.3294486999511719, + "learning_rate": 4.739138163026457e-06, + "loss": 1.0702, + "step": 1793 + }, + { + "epoch": 0.9245834049132452, + "grad_norm": 1.4707309007644653, + "learning_rate": 4.7388353893948594e-06, + "loss": 1.0553, + "step": 1794 + }, + { + "epoch": 0.9250987802783027, + "grad_norm": 1.3893729448318481, + "learning_rate": 4.7385324498388505e-06, + "loss": 1.0428, + "step": 1795 + }, + { + "epoch": 0.9256141556433602, + "grad_norm": 1.4164751768112183, + "learning_rate": 4.738229344380881e-06, + "loss": 1.0856, + "step": 1796 + }, + { + "epoch": 0.9261295310084178, + "grad_norm": 1.6173774003982544, + "learning_rate": 4.737926073043413e-06, + "loss": 1.0157, + "step": 1797 + }, + { + "epoch": 0.9266449063734753, + "grad_norm": 1.326172113418579, + "learning_rate": 4.737622635848924e-06, + "loss": 1.0672, + "step": 1798 + }, + { + "epoch": 0.9271602817385329, + "grad_norm": 1.457281470298767, + "learning_rate": 4.7373190328199024e-06, + "loss": 1.1605, + "step": 1799 + }, + { + "epoch": 0.9276756571035905, + "grad_norm": 1.3793095350265503, + "learning_rate": 4.737015263978849e-06, + "loss": 1.0886, + "step": 1800 + }, + { + "epoch": 0.928191032468648, + "grad_norm": 1.5238492488861084, + "learning_rate": 4.736711329348276e-06, + "loss": 1.1237, + "step": 1801 + }, + { + "epoch": 0.9287064078337055, + "grad_norm": 1.3719145059585571, + "learning_rate": 4.7364072289507086e-06, + "loss": 1.0802, + "step": 1802 + }, + { + "epoch": 0.9292217831987631, + "grad_norm": 1.419771671295166, + "learning_rate": 4.736102962808685e-06, + "loss": 1.0647, + "step": 1803 + }, + { + "epoch": 0.9297371585638207, + "grad_norm": 1.4599947929382324, + "learning_rate": 4.7357985309447534e-06, + "loss": 1.0956, + "step": 1804 + }, + { + "epoch": 0.9302525339288782, + "grad_norm": 1.4967395067214966, + "learning_rate": 4.735493933381478e-06, + "loss": 1.0781, + "step": 1805 + }, + { + "epoch": 0.9307679092939357, + "grad_norm": 1.4744763374328613, + "learning_rate": 4.735189170141431e-06, + "loss": 1.0775, + "step": 1806 + }, + { + "epoch": 0.9312832846589933, + "grad_norm": 1.377068042755127, + "learning_rate": 4.734884241247201e-06, + "loss": 1.0297, + "step": 1807 + }, + { + "epoch": 0.9317986600240509, + "grad_norm": 1.4732213020324707, + "learning_rate": 4.734579146721385e-06, + "loss": 1.1569, + "step": 1808 + }, + { + "epoch": 0.9323140353891084, + "grad_norm": 1.48055100440979, + "learning_rate": 4.734273886586595e-06, + "loss": 1.0906, + "step": 1809 + }, + { + "epoch": 0.9328294107541659, + "grad_norm": 1.4947986602783203, + "learning_rate": 4.7339684608654545e-06, + "loss": 1.0595, + "step": 1810 + }, + { + "epoch": 0.9333447861192234, + "grad_norm": 1.4390573501586914, + "learning_rate": 4.7336628695805996e-06, + "loss": 1.005, + "step": 1811 + }, + { + "epoch": 0.9338601614842811, + "grad_norm": 1.4130473136901855, + "learning_rate": 4.733357112754677e-06, + "loss": 1.0602, + "step": 1812 + }, + { + "epoch": 0.9343755368493386, + "grad_norm": 1.4222996234893799, + "learning_rate": 4.733051190410348e-06, + "loss": 1.0702, + "step": 1813 + }, + { + "epoch": 0.9348909122143961, + "grad_norm": 1.394693374633789, + "learning_rate": 4.732745102570284e-06, + "loss": 1.0983, + "step": 1814 + }, + { + "epoch": 0.9354062875794537, + "grad_norm": 1.4363412857055664, + "learning_rate": 4.732438849257172e-06, + "loss": 1.1394, + "step": 1815 + }, + { + "epoch": 0.9359216629445113, + "grad_norm": 1.5439742803573608, + "learning_rate": 4.732132430493707e-06, + "loss": 1.144, + "step": 1816 + }, + { + "epoch": 0.9364370383095688, + "grad_norm": 1.386092185974121, + "learning_rate": 4.731825846302599e-06, + "loss": 0.9641, + "step": 1817 + }, + { + "epoch": 0.9369524136746263, + "grad_norm": 1.4146597385406494, + "learning_rate": 4.731519096706569e-06, + "loss": 1.0827, + "step": 1818 + }, + { + "epoch": 0.9374677890396839, + "grad_norm": 1.3727401494979858, + "learning_rate": 4.7312121817283515e-06, + "loss": 1.0087, + "step": 1819 + }, + { + "epoch": 0.9379831644047415, + "grad_norm": 1.4614678621292114, + "learning_rate": 4.730905101390692e-06, + "loss": 1.091, + "step": 1820 + }, + { + "epoch": 0.938498539769799, + "grad_norm": 1.3576834201812744, + "learning_rate": 4.730597855716349e-06, + "loss": 0.9969, + "step": 1821 + }, + { + "epoch": 0.9390139151348565, + "grad_norm": 1.4184889793395996, + "learning_rate": 4.7302904447280925e-06, + "loss": 1.084, + "step": 1822 + }, + { + "epoch": 0.9395292904999141, + "grad_norm": 1.560845971107483, + "learning_rate": 4.729982868448707e-06, + "loss": 1.0897, + "step": 1823 + }, + { + "epoch": 0.9400446658649717, + "grad_norm": 1.4771478176116943, + "learning_rate": 4.729675126900987e-06, + "loss": 1.0167, + "step": 1824 + }, + { + "epoch": 0.9405600412300292, + "grad_norm": 1.4902037382125854, + "learning_rate": 4.729367220107738e-06, + "loss": 1.058, + "step": 1825 + }, + { + "epoch": 0.9410754165950868, + "grad_norm": 1.3810451030731201, + "learning_rate": 4.729059148091782e-06, + "loss": 1.0945, + "step": 1826 + }, + { + "epoch": 0.9415907919601443, + "grad_norm": 1.4192668199539185, + "learning_rate": 4.7287509108759485e-06, + "loss": 1.106, + "step": 1827 + }, + { + "epoch": 0.9421061673252018, + "grad_norm": 1.463755488395691, + "learning_rate": 4.728442508483084e-06, + "loss": 1.0567, + "step": 1828 + }, + { + "epoch": 0.9426215426902594, + "grad_norm": 1.4863239526748657, + "learning_rate": 4.728133940936043e-06, + "loss": 1.076, + "step": 1829 + }, + { + "epoch": 0.943136918055317, + "grad_norm": 1.5006253719329834, + "learning_rate": 4.727825208257694e-06, + "loss": 1.06, + "step": 1830 + }, + { + "epoch": 0.9436522934203745, + "grad_norm": 1.5008097887039185, + "learning_rate": 4.72751631047092e-06, + "loss": 1.0828, + "step": 1831 + }, + { + "epoch": 0.944167668785432, + "grad_norm": 1.5320098400115967, + "learning_rate": 4.727207247598612e-06, + "loss": 1.0552, + "step": 1832 + }, + { + "epoch": 0.9446830441504896, + "grad_norm": 1.4904122352600098, + "learning_rate": 4.726898019663675e-06, + "loss": 1.104, + "step": 1833 + }, + { + "epoch": 0.9451984195155472, + "grad_norm": 1.4500375986099243, + "learning_rate": 4.726588626689027e-06, + "loss": 1.0646, + "step": 1834 + }, + { + "epoch": 0.9457137948806047, + "grad_norm": 1.4086084365844727, + "learning_rate": 4.726279068697598e-06, + "loss": 1.0219, + "step": 1835 + }, + { + "epoch": 0.9462291702456622, + "grad_norm": 1.483607530593872, + "learning_rate": 4.72596934571233e-06, + "loss": 1.0645, + "step": 1836 + }, + { + "epoch": 0.9467445456107199, + "grad_norm": 1.4772380590438843, + "learning_rate": 4.725659457756177e-06, + "loss": 1.0689, + "step": 1837 + }, + { + "epoch": 0.9472599209757774, + "grad_norm": 1.5030094385147095, + "learning_rate": 4.725349404852105e-06, + "loss": 1.02, + "step": 1838 + }, + { + "epoch": 0.9477752963408349, + "grad_norm": 1.500354528427124, + "learning_rate": 4.725039187023094e-06, + "loss": 1.054, + "step": 1839 + }, + { + "epoch": 0.9482906717058924, + "grad_norm": 1.4762042760849, + "learning_rate": 4.724728804292132e-06, + "loss": 1.1067, + "step": 1840 + }, + { + "epoch": 0.94880604707095, + "grad_norm": 1.3946943283081055, + "learning_rate": 4.724418256682226e-06, + "loss": 1.0548, + "step": 1841 + }, + { + "epoch": 0.9493214224360076, + "grad_norm": 1.484532117843628, + "learning_rate": 4.724107544216388e-06, + "loss": 1.1011, + "step": 1842 + }, + { + "epoch": 0.9498367978010651, + "grad_norm": 1.4259350299835205, + "learning_rate": 4.723796666917646e-06, + "loss": 1.1398, + "step": 1843 + }, + { + "epoch": 0.9503521731661226, + "grad_norm": 1.4509137868881226, + "learning_rate": 4.723485624809042e-06, + "loss": 1.0666, + "step": 1844 + }, + { + "epoch": 0.9508675485311802, + "grad_norm": 1.504083275794983, + "learning_rate": 4.723174417913625e-06, + "loss": 1.0515, + "step": 1845 + }, + { + "epoch": 0.9513829238962378, + "grad_norm": 1.4487136602401733, + "learning_rate": 4.722863046254461e-06, + "loss": 1.047, + "step": 1846 + }, + { + "epoch": 0.9518982992612953, + "grad_norm": 1.3897000551223755, + "learning_rate": 4.7225515098546246e-06, + "loss": 1.0678, + "step": 1847 + }, + { + "epoch": 0.9524136746263528, + "grad_norm": 1.6226310729980469, + "learning_rate": 4.722239808737207e-06, + "loss": 1.0792, + "step": 1848 + }, + { + "epoch": 0.9529290499914104, + "grad_norm": 1.4630800485610962, + "learning_rate": 4.721927942925308e-06, + "loss": 1.0854, + "step": 1849 + }, + { + "epoch": 0.953444425356468, + "grad_norm": 1.4378085136413574, + "learning_rate": 4.7216159124420385e-06, + "loss": 1.1144, + "step": 1850 + }, + { + "epoch": 0.9539598007215255, + "grad_norm": 1.448045253753662, + "learning_rate": 4.7213037173105255e-06, + "loss": 1.0519, + "step": 1851 + }, + { + "epoch": 0.954475176086583, + "grad_norm": 1.4926683902740479, + "learning_rate": 4.720991357553907e-06, + "loss": 1.0104, + "step": 1852 + }, + { + "epoch": 0.9549905514516406, + "grad_norm": 1.468258261680603, + "learning_rate": 4.720678833195332e-06, + "loss": 1.0357, + "step": 1853 + }, + { + "epoch": 0.9555059268166982, + "grad_norm": 1.4844988584518433, + "learning_rate": 4.720366144257961e-06, + "loss": 1.1176, + "step": 1854 + }, + { + "epoch": 0.9560213021817557, + "grad_norm": 1.3619471788406372, + "learning_rate": 4.7200532907649695e-06, + "loss": 1.0835, + "step": 1855 + }, + { + "epoch": 0.9565366775468133, + "grad_norm": 1.4447712898254395, + "learning_rate": 4.719740272739544e-06, + "loss": 1.1339, + "step": 1856 + }, + { + "epoch": 0.9570520529118708, + "grad_norm": 1.3479328155517578, + "learning_rate": 4.719427090204881e-06, + "loss": 1.0829, + "step": 1857 + }, + { + "epoch": 0.9575674282769283, + "grad_norm": 1.4314838647842407, + "learning_rate": 4.719113743184193e-06, + "loss": 1.0286, + "step": 1858 + }, + { + "epoch": 0.958082803641986, + "grad_norm": 1.3909810781478882, + "learning_rate": 4.7188002317007005e-06, + "loss": 1.0825, + "step": 1859 + }, + { + "epoch": 0.9585981790070435, + "grad_norm": 1.47553288936615, + "learning_rate": 4.718486555777641e-06, + "loss": 1.0356, + "step": 1860 + }, + { + "epoch": 0.959113554372101, + "grad_norm": 1.4584418535232544, + "learning_rate": 4.71817271543826e-06, + "loss": 1.0442, + "step": 1861 + }, + { + "epoch": 0.9596289297371585, + "grad_norm": 1.3760011196136475, + "learning_rate": 4.717858710705817e-06, + "loss": 1.0484, + "step": 1862 + }, + { + "epoch": 0.9601443051022162, + "grad_norm": 1.4110684394836426, + "learning_rate": 4.7175445416035845e-06, + "loss": 1.141, + "step": 1863 + }, + { + "epoch": 0.9606596804672737, + "grad_norm": 1.3495771884918213, + "learning_rate": 4.717230208154845e-06, + "loss": 1.0851, + "step": 1864 + }, + { + "epoch": 0.9611750558323312, + "grad_norm": 1.4929475784301758, + "learning_rate": 4.716915710382895e-06, + "loss": 1.0608, + "step": 1865 + }, + { + "epoch": 0.9616904311973887, + "grad_norm": 1.4913374185562134, + "learning_rate": 4.716601048311043e-06, + "loss": 1.0582, + "step": 1866 + }, + { + "epoch": 0.9622058065624464, + "grad_norm": 1.5947993993759155, + "learning_rate": 4.716286221962607e-06, + "loss": 1.1079, + "step": 1867 + }, + { + "epoch": 0.9627211819275039, + "grad_norm": 1.397476077079773, + "learning_rate": 4.715971231360922e-06, + "loss": 1.0905, + "step": 1868 + }, + { + "epoch": 0.9632365572925614, + "grad_norm": 1.3617136478424072, + "learning_rate": 4.715656076529331e-06, + "loss": 1.0949, + "step": 1869 + }, + { + "epoch": 0.9637519326576189, + "grad_norm": 1.4512900114059448, + "learning_rate": 4.715340757491191e-06, + "loss": 1.0835, + "step": 1870 + }, + { + "epoch": 0.9642673080226765, + "grad_norm": 1.3670510053634644, + "learning_rate": 4.715025274269872e-06, + "loss": 1.024, + "step": 1871 + }, + { + "epoch": 0.9647826833877341, + "grad_norm": 1.524481177330017, + "learning_rate": 4.7147096268887535e-06, + "loss": 1.0453, + "step": 1872 + }, + { + "epoch": 0.9652980587527916, + "grad_norm": 1.3488364219665527, + "learning_rate": 4.71439381537123e-06, + "loss": 1.0241, + "step": 1873 + }, + { + "epoch": 0.9658134341178491, + "grad_norm": 1.4507735967636108, + "learning_rate": 4.714077839740706e-06, + "loss": 1.0571, + "step": 1874 + }, + { + "epoch": 0.9663288094829067, + "grad_norm": 1.4156478643417358, + "learning_rate": 4.7137617000206e-06, + "loss": 1.0305, + "step": 1875 + }, + { + "epoch": 0.9668441848479643, + "grad_norm": 1.511183500289917, + "learning_rate": 4.7134453962343405e-06, + "loss": 1.0363, + "step": 1876 + }, + { + "epoch": 0.9673595602130218, + "grad_norm": 1.5476479530334473, + "learning_rate": 4.71312892840537e-06, + "loss": 1.0561, + "step": 1877 + }, + { + "epoch": 0.9678749355780794, + "grad_norm": 1.5617843866348267, + "learning_rate": 4.712812296557143e-06, + "loss": 1.0723, + "step": 1878 + }, + { + "epoch": 0.9683903109431369, + "grad_norm": 1.3265645503997803, + "learning_rate": 4.712495500713125e-06, + "loss": 1.0373, + "step": 1879 + }, + { + "epoch": 0.9689056863081945, + "grad_norm": 1.4959145784378052, + "learning_rate": 4.7121785408967945e-06, + "loss": 1.0563, + "step": 1880 + }, + { + "epoch": 0.969421061673252, + "grad_norm": 1.4450513124465942, + "learning_rate": 4.711861417131642e-06, + "loss": 1.0584, + "step": 1881 + }, + { + "epoch": 0.9699364370383096, + "grad_norm": 1.4012947082519531, + "learning_rate": 4.711544129441171e-06, + "loss": 1.0402, + "step": 1882 + }, + { + "epoch": 0.9704518124033671, + "grad_norm": 1.5120168924331665, + "learning_rate": 4.7112266778488956e-06, + "loss": 1.1354, + "step": 1883 + }, + { + "epoch": 0.9709671877684247, + "grad_norm": 1.4814488887786865, + "learning_rate": 4.710909062378342e-06, + "loss": 1.0379, + "step": 1884 + }, + { + "epoch": 0.9714825631334822, + "grad_norm": 1.4443869590759277, + "learning_rate": 4.71059128305305e-06, + "loss": 1.049, + "step": 1885 + }, + { + "epoch": 0.9719979384985398, + "grad_norm": 1.6024912595748901, + "learning_rate": 4.710273339896571e-06, + "loss": 1.0218, + "step": 1886 + }, + { + "epoch": 0.9725133138635973, + "grad_norm": 1.5310415029525757, + "learning_rate": 4.709955232932468e-06, + "loss": 1.1243, + "step": 1887 + }, + { + "epoch": 0.9730286892286548, + "grad_norm": 1.4426299333572388, + "learning_rate": 4.7096369621843176e-06, + "loss": 1.0082, + "step": 1888 + }, + { + "epoch": 0.9735440645937125, + "grad_norm": 1.407894253730774, + "learning_rate": 4.709318527675706e-06, + "loss": 1.0448, + "step": 1889 + }, + { + "epoch": 0.97405943995877, + "grad_norm": 1.3813974857330322, + "learning_rate": 4.708999929430234e-06, + "loss": 1.0825, + "step": 1890 + }, + { + "epoch": 0.9745748153238275, + "grad_norm": 1.442893385887146, + "learning_rate": 4.708681167471513e-06, + "loss": 1.053, + "step": 1891 + }, + { + "epoch": 0.975090190688885, + "grad_norm": 1.4274073839187622, + "learning_rate": 4.708362241823166e-06, + "loss": 1.0981, + "step": 1892 + }, + { + "epoch": 0.9756055660539427, + "grad_norm": 1.453469157218933, + "learning_rate": 4.7080431525088325e-06, + "loss": 1.0822, + "step": 1893 + }, + { + "epoch": 0.9761209414190002, + "grad_norm": 1.330512285232544, + "learning_rate": 4.707723899552158e-06, + "loss": 1.0873, + "step": 1894 + }, + { + "epoch": 0.9766363167840577, + "grad_norm": 1.3868836164474487, + "learning_rate": 4.707404482976803e-06, + "loss": 1.0318, + "step": 1895 + }, + { + "epoch": 0.9771516921491152, + "grad_norm": 1.4036799669265747, + "learning_rate": 4.707084902806441e-06, + "loss": 1.0617, + "step": 1896 + }, + { + "epoch": 0.9776670675141729, + "grad_norm": 1.4641084671020508, + "learning_rate": 4.7067651590647564e-06, + "loss": 1.0892, + "step": 1897 + }, + { + "epoch": 0.9781824428792304, + "grad_norm": 1.45862877368927, + "learning_rate": 4.706445251775446e-06, + "loss": 1.0931, + "step": 1898 + }, + { + "epoch": 0.9786978182442879, + "grad_norm": 1.4056271314620972, + "learning_rate": 4.70612518096222e-06, + "loss": 1.0556, + "step": 1899 + }, + { + "epoch": 0.9792131936093454, + "grad_norm": 1.386319875717163, + "learning_rate": 4.7058049466487974e-06, + "loss": 1.0281, + "step": 1900 + }, + { + "epoch": 0.979728568974403, + "grad_norm": 1.4791563749313354, + "learning_rate": 4.705484548858912e-06, + "loss": 1.0581, + "step": 1901 + }, + { + "epoch": 0.9802439443394606, + "grad_norm": 1.5182712078094482, + "learning_rate": 4.70516398761631e-06, + "loss": 1.0856, + "step": 1902 + }, + { + "epoch": 0.9807593197045181, + "grad_norm": 1.4754759073257446, + "learning_rate": 4.704843262944748e-06, + "loss": 1.0979, + "step": 1903 + }, + { + "epoch": 0.9812746950695757, + "grad_norm": 1.4757529497146606, + "learning_rate": 4.704522374867996e-06, + "loss": 1.0508, + "step": 1904 + }, + { + "epoch": 0.9817900704346332, + "grad_norm": 1.5718193054199219, + "learning_rate": 4.704201323409835e-06, + "loss": 1.1101, + "step": 1905 + }, + { + "epoch": 0.9823054457996908, + "grad_norm": 1.4598045349121094, + "learning_rate": 4.703880108594059e-06, + "loss": 1.1212, + "step": 1906 + }, + { + "epoch": 0.9828208211647483, + "grad_norm": 1.3712278604507446, + "learning_rate": 4.703558730444474e-06, + "loss": 1.0175, + "step": 1907 + }, + { + "epoch": 0.9833361965298059, + "grad_norm": 1.3985803127288818, + "learning_rate": 4.7032371889848985e-06, + "loss": 1.0278, + "step": 1908 + }, + { + "epoch": 0.9838515718948634, + "grad_norm": 1.5293681621551514, + "learning_rate": 4.702915484239161e-06, + "loss": 1.1274, + "step": 1909 + }, + { + "epoch": 0.984366947259921, + "grad_norm": 1.5148296356201172, + "learning_rate": 4.7025936162311055e-06, + "loss": 1.0513, + "step": 1910 + }, + { + "epoch": 0.9848823226249785, + "grad_norm": 1.4407379627227783, + "learning_rate": 4.7022715849845844e-06, + "loss": 1.041, + "step": 1911 + }, + { + "epoch": 0.9853976979900361, + "grad_norm": 1.6437877416610718, + "learning_rate": 4.701949390523466e-06, + "loss": 1.0867, + "step": 1912 + }, + { + "epoch": 0.9859130733550936, + "grad_norm": 1.4243892431259155, + "learning_rate": 4.701627032871627e-06, + "loss": 1.0369, + "step": 1913 + }, + { + "epoch": 0.9864284487201512, + "grad_norm": 1.4947564601898193, + "learning_rate": 4.7013045120529595e-06, + "loss": 1.0783, + "step": 1914 + }, + { + "epoch": 0.9869438240852088, + "grad_norm": 1.5142713785171509, + "learning_rate": 4.700981828091365e-06, + "loss": 1.082, + "step": 1915 + }, + { + "epoch": 0.9874591994502663, + "grad_norm": 1.457655668258667, + "learning_rate": 4.70065898101076e-06, + "loss": 1.0564, + "step": 1916 + }, + { + "epoch": 0.9879745748153238, + "grad_norm": 1.4288212060928345, + "learning_rate": 4.700335970835067e-06, + "loss": 1.0861, + "step": 1917 + }, + { + "epoch": 0.9884899501803813, + "grad_norm": 1.4429469108581543, + "learning_rate": 4.70001279758823e-06, + "loss": 1.0327, + "step": 1918 + }, + { + "epoch": 0.989005325545439, + "grad_norm": 1.3988040685653687, + "learning_rate": 4.699689461294197e-06, + "loss": 1.0651, + "step": 1919 + }, + { + "epoch": 0.9895207009104965, + "grad_norm": 1.574169397354126, + "learning_rate": 4.699365961976933e-06, + "loss": 1.0939, + "step": 1920 + }, + { + "epoch": 0.990036076275554, + "grad_norm": 1.3809009790420532, + "learning_rate": 4.699042299660411e-06, + "loss": 1.0724, + "step": 1921 + }, + { + "epoch": 0.9905514516406115, + "grad_norm": 1.5404826402664185, + "learning_rate": 4.69871847436862e-06, + "loss": 1.0632, + "step": 1922 + }, + { + "epoch": 0.9910668270056692, + "grad_norm": 1.4825998544692993, + "learning_rate": 4.698394486125558e-06, + "loss": 1.0113, + "step": 1923 + }, + { + "epoch": 0.9915822023707267, + "grad_norm": 1.4356918334960938, + "learning_rate": 4.698070334955237e-06, + "loss": 1.091, + "step": 1924 + }, + { + "epoch": 0.9920975777357842, + "grad_norm": 1.4427188634872437, + "learning_rate": 4.69774602088168e-06, + "loss": 1.0901, + "step": 1925 + }, + { + "epoch": 0.9926129531008417, + "grad_norm": 1.4783024787902832, + "learning_rate": 4.697421543928924e-06, + "loss": 1.0303, + "step": 1926 + }, + { + "epoch": 0.9931283284658994, + "grad_norm": 1.478934407234192, + "learning_rate": 4.697096904121016e-06, + "loss": 1.1187, + "step": 1927 + }, + { + "epoch": 0.9936437038309569, + "grad_norm": 1.5085822343826294, + "learning_rate": 4.696772101482014e-06, + "loss": 1.0884, + "step": 1928 + }, + { + "epoch": 0.9941590791960144, + "grad_norm": 1.4221833944320679, + "learning_rate": 4.6964471360359925e-06, + "loss": 1.0532, + "step": 1929 + }, + { + "epoch": 0.994674454561072, + "grad_norm": 1.5141397714614868, + "learning_rate": 4.696122007807033e-06, + "loss": 1.1066, + "step": 1930 + }, + { + "epoch": 0.9951898299261295, + "grad_norm": 1.4299890995025635, + "learning_rate": 4.695796716819233e-06, + "loss": 0.9984, + "step": 1931 + }, + { + "epoch": 0.9957052052911871, + "grad_norm": 1.5260546207427979, + "learning_rate": 4.695471263096699e-06, + "loss": 1.1296, + "step": 1932 + }, + { + "epoch": 0.9962205806562446, + "grad_norm": 1.358848214149475, + "learning_rate": 4.695145646663552e-06, + "loss": 1.0145, + "step": 1933 + }, + { + "epoch": 0.9967359560213022, + "grad_norm": 1.4124057292938232, + "learning_rate": 4.694819867543924e-06, + "loss": 1.0814, + "step": 1934 + }, + { + "epoch": 0.9972513313863597, + "grad_norm": 1.4105831384658813, + "learning_rate": 4.694493925761958e-06, + "loss": 1.0178, + "step": 1935 + }, + { + "epoch": 0.9977667067514173, + "grad_norm": 1.4723960161209106, + "learning_rate": 4.694167821341812e-06, + "loss": 1.1139, + "step": 1936 + }, + { + "epoch": 0.9982820821164748, + "grad_norm": 1.416146993637085, + "learning_rate": 4.693841554307653e-06, + "loss": 1.0567, + "step": 1937 + }, + { + "epoch": 0.9987974574815324, + "grad_norm": 1.4232710599899292, + "learning_rate": 4.693515124683662e-06, + "loss": 1.0747, + "step": 1938 + }, + { + "epoch": 0.9993128328465899, + "grad_norm": 1.5446377992630005, + "learning_rate": 4.69318853249403e-06, + "loss": 1.0057, + "step": 1939 + }, + { + "epoch": 0.9998282082116475, + "grad_norm": 1.505684494972229, + "learning_rate": 4.692861777762963e-06, + "loss": 1.0992, + "step": 1940 + } + ], + "logging_steps": 1, + "max_steps": 11640, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 1940, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.345877427776389e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}